Megatest

Diff
Login

Differences From Artifact [6d2db21290]:

To Artifact [bb2f7cfc15]:


109
110
111
112
113
114
115
116
117


118
119
120
121
122
123
124
109
110
111
112
113
114
115


116
117
118
119
120
121
122
123
124







-
-
+
+







    ((rpc)  (db:obj->string (vector success/fail query-sig result)))
    ((http) (db:obj->string (vector success/fail query-sig result)))
    ((fs)   result)
    (else 
     (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
     result)))

;; Given a run id start a server process    ### NOTE ### > file 2>&1 
;; if the run-id is zero and the target-host is set 
;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
;; if the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
  (let* ((curr-host   (get-host-name))
         ;; (attempt-in-progress (server:start-attempted? areapath))
         ;; (dot-server-url (server:check-if-running areapath))
153
154
155
156
157
158
159
160

161
162
163
164
165
166
167
168
169




170
171
172
173
174
175
176
153
154
155
156
157
158
159

160

161
162
163
164
165
166


167
168
169
170
171
172
173
174
175
176
177







-
+
-






-
-
+
+
+
+







	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
    ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
    (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
    #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit))
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; given a path to a server log return: host port startseconds
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let 
;; given a path to a server log return: host port startseconds server-id
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let 
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 

(define (server:logf-get-start-info logf)
  (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx (regexp "^SERVER: dbprep"))
        (dbprep-found 0)) 
    (handle-exceptions
	exn
199
200
201
202
203
204
205
206
207


208
209

210
211
212
213

214
215
216
217
218
219
220
200
201
202
203
204
205
206


207
208
209

210
211
212
213

214
215
216
217
218
219
220
221







-
-
+
+

-
+



-
+







			(list (car dat) ;; host
			      (string->number (cadr dat)) ;; port
			      (string->number (caddr dat))
                              (cadr (cddr dat))))))
                (begin 
                   (if dbprep-found
                      (begin
                         (debug:print-info 0 *default-log-port* "Server is in dbprep at " (current-seconds))
                         (thread-sleep! 25)
                         (debug:print-info 0 *default-log-port* "Server is in dbprep at " (common:human-time))
                         (thread-sleep! 0.5) ;; was 25 sec but that blocked things from starting?
                      )
                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (current-seconds))
                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))
                   )
		    (list #f #f #f #f)))))))))

;; get a list of servers with all relevant data
;; get a list of servers from the log files, with all relevant data
;; ( mod-time host port start-time pid )
;;
(define (server:get-list areapath #!key (limit #f))
  (let ((fname-rx    (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
	(day-seconds (* 24 60 60)))
    ;; if the directory exists continue to get the list
    ;; otherwise attempt to create the logs dir and then
232
233
234
235
236
237
238
239

240
241
242
243
244
245
246
233
234
235
236
237
238
239

240
241
242
243
244
245
246
247







-
+








        ;; Get the list of server logs that do not contain "exiting". Ignore logs for servers that have exited.
	(let* ((server-logs-cmd  (conc "grep -iL exiting " areapath "/logs/server-*-*.log"))
               (server-logs   (string-split (string-chomp (call-with-input-pipe server-logs-cmd read-all))))
	       (num-serv-logs (length server-logs)))
	  (if (or (null? server-logs) (= num-serv-logs 0))
              (let ()
                 (debug:print 1  *default-log-port* "There are no servers running")
                 (debug:print 1  *default-log-port* "There are no servers running at " (common:human-time))
	         '()
              )
	      (let loop ((hed  (string-chomp (car server-logs)))
			 (tal  (cdr server-logs))
			 (res '()))
		(let* ((mod-time  (handle-exceptions
				   exn
363
364
365
366
367
368
369
370




371
372
373
374
375

376
377
378
379
380
381






382
383
384

385
386
387
388

389
390
391
392





393
394

395
396


397
398


399
400


401
402
403
404

405
406
407

408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425











426
427
428
429
430
431
432
364
365
366
367
368
369
370

371
372
373
374
375
376
377
378

379
380
381
382
383


384
385
386
387
388
389
390
391

392


393

394




395
396
397
398
399
400
401
402


403
404
405
406
407
408


409
410
411
412
413

414
415
416

417








418









419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436







-
+
+
+
+




-
+




-
-
+
+
+
+
+
+


-
+
-
-

-
+
-
-
-
-
+
+
+
+
+


+
-
-
+
+


+
+
-
-
+
+



-
+


-
+
-
-
-
-
-
-
-
-

-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+








(define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
  (if *my-client-signature* *my-client-signature*
      (let ((sig (server:mk-signature)))
        (set! *my-client-signature* sig)
        *my-client-signature*)))

;; wait for server=start-last to be three seconds old

;; if server-start-last exists, and wasn't old enough, wait <idle time>, then call this function recursively until it is old enough.
;; if it is old enough, overwrite it and wait 0.25 seconds.
;; if it then has the wrong server key, wait <idle time> and call this function recursively.
;;
(define (server:wait-for-server-start-last-flag areapath)
  (let* ((start-flag (conc areapath "/logs/server-start-last"))
	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
	 (reftime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (idletime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
	 (server-key (conc (get-host-name) "-" (current-process-id))))
    (if (file-exists? start-flag)
	(let* ((fmodtime (file-modification-time start-flag))
	       (delta    (- (current-seconds) fmodtime))
	       (all-go   (> delta reftime)))
	  (if (and all-go
	       (old-enough   (> delta idletime))
              )

          ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than <idletime> seconds, and the new file still has the same server key as you just wrote, return #t.
          ;; 
	  (if (and old-enough
		   (begin
                     (debug:print-info 0 *default-log-port* "Writing " start-flag)
		     (with-output-to-file start-flag
		     (with-output-to-file start-flag (lambda () (print server-key)))
		       (lambda ()
			 (print server-key)))
		     (thread-sleep! 0.25)
		     (let ((res (with-input-from-file start-flag
		     (let ((res (with-input-from-file start-flag (lambda () (read-line)))))
				  (lambda ()
				    (read-line)))))
		       (equal? server-key res))))
	      #t ;; (system (conc "touch " start-flag)) ;; lazy but safe
		       (equal? server-key res)))
                )
	      #t

           ;; If either of the above conditions is not true, print a "Gating server start" message, wait <idle-time>, then call this function recursively. 
	      (begin
		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
				  (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "server key does not match" "too soon to start another server"))
				  fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
		(thread-sleep! reftime)

		(thread-sleep! idletime)
		(server:wait-for-server-start-last-flag areapath)))))))


        
;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
;; kind start up of server, wait before allowing another server for a given
;; area to be launched
;;
(define (server:kind-run areapath)
  ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
  ;; and wait for it to be at least 3 seconds old
  ;; and wait for it to be at least <server idletime> seconds old
  (server:wait-for-server-start-last-flag areapath)
  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
      (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
      (let* (
	     (call-num     (car last-run-dat))
	     (when-run     (cadr last-run-dat))
	     (run-delay    (+ (case call-num
				((0)    0)
				((1)   20)
				((2)  300)
				(else 600))
			      (random 5)))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
	     (lock-file    (conc areapath "/logs/server-start.lock")))
	(if	(> (- (current-seconds) when-run) run-delay)
		(let* ((start-flag (conc areapath "/logs/server-start-last")))
		  (common:simple-file-lock-and-wait lock-file expire-time: 15)
                  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
		  (system (conc "touch " start-flag)) ;; lazy but safe
		  (server:run areapath)
		  (thread-sleep! 2) ;; don't release the lock for at least a few seconds
		  (common:simple-file-release-lock lock-file)))
	(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))
	(let* ((start-flag (conc areapath "/logs/server-start-last")))
	  (common:simple-file-lock-and-wait lock-file expire-time: 25)
	  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
	  (system (conc "touch " start-flag)) ;; lazy but safe
	  (server:run areapath)
	  (thread-sleep! 18) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
	  (common:simple-file-release-lock lock-file)))

      (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")
   )
)

;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))
  (let ((give-up-time (+ (current-seconds) timeout)))
    (let loop ((server-info (server:check-if-running areapath))
	       (try-num    0))