Megatest

Diff
Login

Differences From Artifact [6d2db21290]:

To Artifact [99d72bd3eb]:


109
110
111
112
113
114
115
116
117


118
119
120
121
122
123
124
109
110
111
112
113
114
115


116
117
118
119
120
121
122
123
124







-
-
+
+







    ((rpc)  (db:obj->string (vector success/fail query-sig result)))
    ((http) (db:obj->string (vector success/fail query-sig result)))
    ((fs)   result)
    (else 
     (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
     result)))

;; Given a run id start a server process    ### NOTE ### > file 2>&1 
;; if the run-id is zero and the target-host is set 
;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
;; if the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))
152
153
154
155
156
157
158
159
160


161
162
163
164
165
166
167
168
169




170
171
172
173
174
175
176
152
153
154
155
156
157
158


159
160

161
162
163
164
165
166


167
168
169
170
171
172
173
174
175
176
177







-
-
+
+
-






-
-
+
+
+
+







	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
    ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
    (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time
    (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
    #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit))
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; given a path to a server log return: host port startseconds
;; any change to the number of elements returned by this function will directly affect server:record->url, server:record->id, server:kill and server:get-num-alive, which use match let
;; given a path to a server log return: host port startseconds server-id
;; any change to the number of elements returned by this function will directly affect server:record->url, server:record->id, server:kill and server:get-num-alive, which use match let
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 

(define (server:logf-get-start-info logf)
  (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx (regexp "^SERVER: dbprep"))
        (dbprep-found 0)) 
    (handle-exceptions
	exn
199
200
201
202
203
204
205
206
207


208
209

210
211
212
213

214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234





235
236
237
238
239

240
241
242
243
244
245
246
200
201
202
203
204
205
206


207
208
209

210
211
212
213

214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233


234
235
236
237
238

239
240
241

242
243
244
245
246
247
248
249







-
-
+
+

-
+



-
+



















-
-
+
+
+
+
+
-



-
+







			(list (car dat) ;; host
			      (string->number (cadr dat)) ;; port
			      (string->number (caddr dat))
                              (cadr (cddr dat))))))
                (begin 
                   (if dbprep-found
                      (begin
                         (debug:print-info 0 *default-log-port* "Server is in dbprep at " (current-seconds))
                         (thread-sleep! 25)
                         (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time))
                         (thread-sleep! 0.5) ;; was 25 sec but that blocked things from starting?
                      )
                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (current-seconds))
                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))
                   )
		    (list #f #f #f #f)))))))))

;; get a list of servers with all relevant data
;; get a list of servers from the log files, with all relevant data
;; ( mod-time host port start-time pid )
;;
(define (server:get-list areapath #!key (limit #f))
  (let ((fname-rx    (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
	(day-seconds (* 24 60 60)))
    ;; if the directory exists continue to get the list
    ;; otherwise attempt to create the logs dir and then
    ;; continue
    (if (if (directory-exists? (conc areapath "/logs"))
	    '()
	    (if (file-write-access? areapath)
		(begin
		  (condition-case
		   (create-directory (conc areapath "/logs") #t)
		   (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
		   (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn)))
		  (directory-exists? (conc areapath "/logs")))
		'()))

        ;; Get the list of server logs that do not contain "exiting". Ignore logs for servers that have exited.
	(let* ((server-logs-cmd  (conc "grep -iL exiting " areapath "/logs/server-*-*.log"))
        ;; Get the list of server logs. First remove logs for servers that have exited.
	(let* (
               ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers.
               ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'")))
               (server-logs   (glob (conc areapath "/logs/server-*-*.log")))
               (server-logs   (string-split (string-chomp (call-with-input-pipe server-logs-cmd read-all))))
	       (num-serv-logs (length server-logs)))
	  (if (or (null? server-logs) (= num-serv-logs 0))
              (let ()
                 (debug:print 1  *default-log-port* "There are no servers running")
                 (debug:print 2  *default-log-port* "There are no servers running at " (common:human-time))
	         '()
              )
	      (let loop ((hed  (string-chomp (car server-logs)))
			 (tal  (cdr server-logs))
			 (res '()))
		(let* ((mod-time  (handle-exceptions
				   exn
363
364
365
366
367
368
369
370




371
372
373
374
375

376
377
378
379
380
381







382
383
384


385
386
387
388

389
390
391
392





393
394

395
396


397
398


399
400


401
402
403
404

405
406
407

408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425











426
427
428
429
430
431
432
366
367
368
369
370
371
372

373
374
375
376
377
378
379
380

381
382
383
384
385


386
387
388
389
390
391
392
393


394
395


396

397




398
399
400
401
402
403
404
405


406
407
408
409
410
411


412
413
414
415
416

417
418
419

420








421









422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439







-
+
+
+
+




-
+




-
-
+
+
+
+
+
+
+

-
-
+
+
-
-

-
+
-
-
-
-
+
+
+
+
+


+
-
-
+
+


+
+
-
-
+
+



-
+


-
+
-
-
-
-
-
-
-
-

-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+








;; Return this client's signature, creating and caching it on first use.
;; The cached value lives in *my-client-signature*; while that is still false
;; a fresh signature is taken from server:mk-signature, stored in the global,
;; and then returned.
(define (server:get-client-signature)
  (or *my-client-signature*
      (begin
        (set! *my-client-signature* (server:mk-signature))
        *my-client-signature*)))

;; wait for server-start-last to be three seconds old

;; if server-start-last exists, and wasn't old enough, wait <idle time> + 1, then call this function recursively until it is old enough.
;; if it is old enough, overwrite it and wait 0.25 seconds.
;; if it then has the wrong server key, wait <idle time> + 1 and call this function recursively.
;;
;; Gate server start-up on the age of <areapath>/logs/server-start-last.
;; NOTE(review): this span appears to carry both the removed and the added
;; lines of a diff (reftime/all-go next to idletime/old-enough, two
;; with-output-to-file forms, two read-back forms), so it is not one coherent
;; version of the procedure -- confirm against the real source file.
(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last"))
	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
	 ;; reftime and idletime are both read from the [server] idletime setting (default 4)
	 (reftime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (idletime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 ;; key written to the flag file: <hostname>-<pid> of this process
	 (server-key (conc (get-host-name) "-" (current-process-id))))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       ;; seconds elapsed since the flag file was last modified
	       (delta    (- (current-seconds) fmodtime))
	       (all-go   (> delta reftime)))
	  (if (and all-go
	       (old-enough   (> delta idletime))
               (new-server-key "")
              )

          ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than <idletime> seconds, and the new file still has the same server key as you just wrote, return #t.
	  ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process.
           (if (and old-enough
		   (begin
                     (debug:print-info 0 *default-log-port* "Writing " start-flag)
		     (with-output-to-file start-flag
                     (debug:print-info 2 *default-log-port* "Writing " start-flag)
		     (with-output-to-file start-flag (lambda () (print server-key)))
		       (lambda ()
			 (print server-key)))
		     ;; short pause before the flag file is read back
		     (thread-sleep! 0.25)
		     (let ((res (with-input-from-file start-flag
		     (set! new-server-key (with-input-from-file start-flag (lambda () (read-line))))
				  (lambda ()
				    (read-line)))))
		       (equal? server-key res))))
	      #t ;; (system (conc "touch " start-flag)) ;; lazy but safe
		     (equal? server-key new-server-key))
                )
	      #t

           ;; If either of the above conditions is not true, print a "Gating server start" message, wait <idle-time> + 1, then call this function recursively. 
	      (begin
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
				  (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server"))
				  fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
		(thread-sleep! reftime)

		(thread-sleep! ( + 1 idletime))
		;; re-check the flag from the top after sleeping
		(server:wait-for-server-start-last-flag areapath)))))))


        
;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
;; kind start up of server, wait before allowing another server for a given
;; area to be launched
;;
;; Wait on the server-start-last flag, then, if server:check-if-running
;; reports no server for areapath, take the server-start lock file, touch the
;; flag, call server:run, and release the lock after a sleep.
;; NOTE(review): this span appears to carry both the removed and the added
;; lines of a diff (two let* heads, a back-off variant with expire-time 15 and
;; a direct variant with expire-time 25), so it is not one coherent version of
;; the procedure -- confirm against the real source file.
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least 3 seconds old
  ;; and wait for it to be at least <server idletime> seconds old
  (server:wait-for-server-start-last-flag areapath)
  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
      (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
      (let* (
	     (call-num     (car last-run-dat))
	     (when-run     (cadr last-run-dat))
	     ;; delay in seconds picked from the call count: 0, 20, 300, then 600
	     (run-delay    (+ (case call-num
				((0)    0)
				((1)   20)
				((2)  300)
				(else 600))
			      (random 5)))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
	     (lock-file    (conc areapath "/logs/server-start.lock")))
	;; only start a server when more than run-delay seconds have passed since when-run
	(if	(> (- (current-seconds) when-run) run-delay)
		(let* ((start-flag (conc areapath "/logs/server-start-last")))
		  (common:simple-file-lock-and-wait lock-file expire-time: 15)
                  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
		  (system (conc "touch " start-flag)) ;; lazy but safe
		  (server:run areapath)
		  (thread-sleep! 2) ;; don't release the lock for at least a few seconds
		  (common:simple-file-release-lock lock-file)))
	;; record the incremented call count and the current time for this area
	(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))
	(let* ((start-flag (conc areapath "/logs/server-start-last")))
	  (common:simple-file-lock-and-wait lock-file expire-time: 25)
	  (debug:print-info  2 *default-log-port* "server:kind-run: touching " start-flag)
	  (system (conc "touch " start-flag)) ;; lazy but safe
	  (server:run areapath)
	  (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
	  (common:simple-file-release-lock lock-file)))

      (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")
   )
)

;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))
    (let loop ((server-info (server:check-if-running areapath))
	       (try-num    0))
447
448
449
450
451
452
453
454

455
456
457
458
459
460

461
462
463
464
465
466
467
454
455
456
457
458
459
460

461
462
463
464
465
466

467
468
469
470
471
472
473
474







-
+





-
+







  (let ((ns (string->number
	     (or (configf:lookup *configdat* "server" "numservers") "notanumber"))))
    (or ns numservers)))

;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
(define (server:check-if-running areapath) ;;  #!key (numservers "2"))
  (let* ((ns            (server:get-num-servers))
  (let* ((ns            (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed
	 (servers       (server:get-best (server:get-list areapath))))
    (if (or (and servers
		 (null? servers))
	    (not servers)
	    (and (list? servers)
		 (< (length servers) (random ns)))) ;; somewhere between 0 and numservers
		 (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers
        #f
        (let loop ((hed (car servers))
                   (tal (cdr servers)))
          (let ((res (server:check-server hed)))
            (if res
                hed
                (if (null? tal)