Megatest

Check-in [2f8513eec5]
Login
Overview
Comment:Changed sleep in kind-run from 2 to 18 seconds to allow server to come up to the point where get-list will find it. Cleaned up messages and comments.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65
Files: files | file ages | folders
SHA1: 2f8513eec5ac5e7583c5213a30f2323014421bd2
User & Date: mmgraham on 2021-03-25 11:26:01
Other Links: branch diff | manifest | tags
Context
2021-03-25
17:35
changed server start lock file expire-time from 15 to 25 seconds check-in: 169d94de4e user: mmgraham tags: v1.65, v1.6584
11:26
Changed sleep in kind-run from 2 to 18 seconds to allow server to come up to the point where get-list will find it. Cleaned up messages and comments. check-in: 2f8513eec5 user: mmgraham tags: v1.65
2021-03-13
00:34
restored the configf cleanup, with workaround for ext.scm check-in: 316c988564 user: mmgraham tags: v1.65
Changes

Modified configf.scm from [f97d938f06] to [d727d54436].

146
147
148
149
150
151
152
153

154
155
156
157
158
159
160
146
147
148
149
150
151
152

153
154
155
156
157
158
159
160







-
+







		       (lambda ()
			 (set! result ((eval (read)) ht))))
		     (set! result (conc "#{(" cmdtype ") "  cmd "}"))))
		(case cmdsym
		  ((system shell scheme)
		   (let ((delta (- (current-seconds) start-time)))
		     (if (> delta 2)
			 (debug:print-info 0 *default-log-port* "for line \"" l "\"\n command:  " cmd " took " delta " seconds to run with output:\n   " result)
			 (debug:print-info 2 *default-log-port* "for line \"" l "\"\n command:  " cmd " took " delta " seconds to run with output:\n   " result)
			 (debug:print-info 9 *default-log-port* "for line \"" l "\"\n command:  " cmd " took " delta " seconds to run with output:\n   " result)))))
		(loop (conc prestr result poststr)))
	      res))
	res)))

;; Run a shell command and return the output as a string
(define (shell cmd)

Modified runs.scm from [2583922f1c] to [07f8912d41].

499
500
501
502
503
504
505

506
507
508
509
510
511
512
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513







+







;; runs:run-tests is called from megatest.scm and itself
;;======================================================================
;;
;;  test-names: Comma separated patterns same as test-patts but used in selection 
;;              of tests to run. The item portions are not respected.
;;              FIXME: error out if /patt specified
;;            
;; run-count is passed from megatest.scm as configf:lookup *configdat* "setup" "reruns", or defaults to 1.
(define (runs:run-tests target runname test-patts user flags #!key (run-count 1)) ;; test-names
  (let* ((keys               (keys:config-get-fields *configdat*))
	 (keyvals            (keys:target->keyval keys target))
	 (run-id             (rmt:register-run keyvals runname "new" "n/a" user (args:get-arg "-contour")))  ;;  test-name)))
	 ;; (deferred          '()) ;; delay running these since they have a waiton clause
	 (runconfigf         (conc  *toppath* "/runconfigs.config"))
         (dbfile             (conc  *toppath* "/megatest.db"))
622
623
624
625
626
627
628
629

630
631
632
633
634
635
636
623
624
625
626
627
628
629

630
631
632
633
634
635
636
637







-
+







    ;; (set! required-tests     (lset-intersection equal? test-names all-test-names))
    
    ;; look up all tests matching the comma separated list of globs in
    ;; test-patts (using % as wildcard)

    ;; (set! test-names (delete-duplicates (tests:get-valid-tests *toppath* test-patts)))
    (debug:print-info 0 *default-log-port* "tests search path: " (string-intersperse (tests:get-tests-search-path *configdat*) " "))
    (debug:print-info 0 *default-log-port* "all tests:         " (string-intersperse (sort all-test-names string<) " "))
    (debug:print-info 2 *default-log-port* "all tests:         " (string-intersperse (sort all-test-names string<) " "))
    (debug:print-info 0 *default-log-port* "test names:        " (string-intersperse (sort test-names string<) " "))
    (debug:print-info 0 *default-log-port* "required tests:    " (string-intersperse (sort required-tests string<) " "))

    ;; on the first pass or call to run-tests set FAILS to NOT_STARTED if
    ;; -keepgoing is specified
    (if (eq? *passnum* 0)
	(begin
2391
2392
2393
2394
2395
2396
2397
2398

2399
2400
2401
2402
2403
2404
2405
2392
2393
2394
2395
2396
2397
2398

2399
2400
2401
2402
2403
2404
2405
2406







-
+







		    ;; seek and kill in flight -runtests with % as testpatt here
		    ;; (if (equal? testpatt "%")
		    (tasks:kill-runner target run-name testpatt)
		    ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt))
		    (debug:print 1 *default-log-port* "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((set-state-status)
		    ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
		    (debug:print 1 *default-log-port* "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		    (debug:print 1 *default-log-port* "Modifying state and status for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
		   ((print-run)
		    (debug:print 1 *default-log-port* "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header)
		    action)
		   ((run-wait)
		    (debug:print 1 *default-log-port* "Waiting for run " runkey ", run=" runnamepatt " to complete"))
		   ((archive)
		    (debug:print 1 *default-log-port* "Archiving/restoring (" (args:get-arg "-archive") ") data for run: " runkey " " (db:get-value-by-header run header "runname"))

Modified server.scm from [19fd15cc14] to [c7b3629e26].

109
110
111
112
113
114
115
116
117


118
119
120
121
122
123
124
109
110
111
112
113
114
115


116
117
118
119
120
121
122
123
124







-
-
+
+







    ((rpc)  (db:obj->string (vector success/fail query-sig result)))
    ((http) (db:obj->string (vector success/fail query-sig result)))
    ((fs)   result)
    (else 
     (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
     result)))

;; Given a run id start a server process    ### NOTE ### > file 2>&1 
;; if the run-id is zero and the target-host is set 
;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
;; if the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))
153
154
155
156
157
158
159
160

161
162
163
164
165
166
167
168
169




170
171
172
173
174
175
176
153
154
155
156
157
158
159

160

161
162
163
164
165
166


167
168
169
170
171
172
173
174
175
176
177







-
+
-






-
-
+
+
+
+







	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
    ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
    (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
    #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit))
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; given a path to a server log return: host port startseconds
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let 
;; given a path to a server log return: host port startseconds server-id
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let 
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 

(define (server:logf-get-start-info logf)
  (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx (regexp "^SERVER: dbprep"))
        (dbprep-found 0)) 
    (handle-exceptions
	exn
199
200
201
202
203
204
205
206

207
208
209

210
211
212
213

214
215
216
217
218
219
220
200
201
202
203
204
205
206

207
208
209

210
211
212
213

214
215
216
217
218
219
220
221







-
+


-
+



-
+







			(list (car dat) ;; host
			      (string->number (cadr dat)) ;; port
			      (string->number (caddr dat))
                              (cadr (cddr dat))))))
                (begin 
                   (if dbprep-found
                      (begin
                         (debug:print-info 0 *default-log-port* "Server is in dbprep at " (current-seconds))
                         (debug:print-info 0 *default-log-port* "Server is in dbprep at " (common:human-time))
                         (thread-sleep! 0.5) ;; was 25 sec but that blocked things from starting?
                      )
                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (current-seconds))
                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))
                   )
		    (list #f #f #f #f)))))))))

;; get a list of servers with all relevant data
;; get a list of servers from the log files, with all relevant data
;; ( mod-time host port start-time pid )
;;
(define (server:get-list areapath #!key (limit #f))
  (let ((fname-rx    (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
	(day-seconds (* 24 60 60)))
    ;; if the directory exists continue to get the list
    ;; otherwise attempt to create the logs dir and then
232
233
234
235
236
237
238
239

240
241
242
243
244
245
246
233
234
235
236
237
238
239

240
241
242
243
244
245
246
247







-
+








        ;; Get the list of server logs that do not contain "exiting". Ignore logs for servers that have exited.
	(let* ((server-logs-cmd  (conc "grep -iL exiting " areapath "/logs/server-*-*.log"))
               (server-logs   (string-split (string-chomp (call-with-input-pipe server-logs-cmd read-all))))
	       (num-serv-logs (length server-logs)))
	  (if (or (null? server-logs) (= num-serv-logs 0))
              (let ()
                 (debug:print 1  *default-log-port* "There are no servers running")
                 (debug:print 1  *default-log-port* "There are no servers running at " (common:human-time))
	         '()
              )
	      (let loop ((hed  (string-chomp (car server-logs)))
			 (tal  (cdr server-logs))
			 (res '()))
		(let* ((mod-time  (handle-exceptions
				   exn
363
364
365
366
367
368
369
370

371
372
373
374
375

376
377
378
379
380






381
382
383
384
385
386
387
388
389
390
391


392


393
394


395
396


397
398
399
400


401
402
403
404

405
406
407
408
409
410
411
412
413
414
415
416
417

418
419
420
421
422
423
424
425

426

427

428


429
430
431
432
433
434
435
364
365
366
367
368
369
370

371
372
373
374
375

376
377
378
379
380

381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396

397
398
399
400
401
402
403
404
405


406
407
408
409


410
411
412
413
414

415
416
417











418
419

420
421
422
423
424

425
426
427

428

429
430
431
432
433
434
435
436
437







-
+




-
+




-
+
+
+
+
+
+










-
+
+

+
+


+
+
-
-
+
+


-
-
+
+



-
+


-
-
-
-
-
-
-
-
-
-
-
+

-





-
+

+
-
+
-
+
+








(define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
  (if *my-client-signature* *my-client-signature*
      (let ((sig (server:mk-signature)))
        (set! *my-client-signature* sig)
        *my-client-signature*)))

;; wait for server=start-last to be three seconds old
;; if server-start-last exists, overwrite it. Otherwise loop recursively until it is old enough.
;;
(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last"))
	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
	 (reftime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (idletime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (server-key (conc (get-host-name) "-" (current-process-id))))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (delta    (- (current-seconds) fmodtime))
	       (all-go   (> delta reftime)))
	       (all-go   (> delta idletime))
               (old-server-key (with-input-from-file start-flag (lambda () (read-line))))
              )

          ;; write a new start-flag file, wait 0.25s, then if the previous start-flag file was older than <idletime> seconds, and the new file still has the same server key as you just wrote, return #t.
          ;; 
	  (if (and all-go
		   (begin
                     (debug:print-info 0 *default-log-port* "Writing " start-flag)
		     (with-output-to-file start-flag
		       (lambda ()
			 (print server-key)))
		     (thread-sleep! 0.25)
		     (let ((res (with-input-from-file start-flag
				  (lambda ()
				    (read-line)))))
		       (equal? server-key res))))
		       (equal? server-key res)))
                )
	      #t ;; (system (conc "touch " start-flag)) ;; lazy but safe

           ;; If either of the above conditions are not true, print a "Gating server start" message, wait <idle-time>, then call this function recursively. 
	      (begin
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
				  (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if all-go "server key does not match" "too soon to start another server"))
		(debug:print-info 0 *default-log-port* "server keys: from file: " old-server-key " needed: " server-key)
				  fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
		(thread-sleep! reftime)

		(thread-sleep! idletime)
		(server:wait-for-server-start-last-flag areapath)))))))

;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
;; kind start up of servers, wait before allowing another server for a given
;; area to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least 3 seconds old
  ;; and wait for it to be at least <server idletime> seconds old
  (server:wait-for-server-start-last-flag areapath)
  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
      (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
	     (call-num     (car last-run-dat))
	     (when-run     (cadr last-run-dat))
	     (run-delay    (+ (case call-num ;; NOT USED. Waiting is handled by wait-for-server
				((0)    0)
				((1)   20)
				((2)  300)
				(else 600))
			      (random 5)
                              0))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously

      (let* (
	     (lock-file    (conc areapath "/logs/server-start.lock")))
	;; (if	(> (- (current-seconds) when-run) run-delay)
	(let* ((start-flag (conc areapath "/logs/server-start-last")))
	  (common:simple-file-lock-and-wait lock-file expire-time: 15)
	  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
	  (system (conc "touch " start-flag)) ;; lazy but safe
	  (server:run areapath)
	  (thread-sleep! 2) ;; don't release the lock for at least a few seconds
	  (thread-sleep! 18) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
	  (common:simple-file-release-lock lock-file)))

      (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")))
      (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")
	;; (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))
   )
)

;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))
    (let loop ((server-info (server:check-if-running areapath))
	       (try-num    0))