Megatest

Diff
Login

Differences From Artifact [a92ff9f544]:

To Artifact [2a0b975322]:


135
136
137
138
139
140
141
142

143
144
145
146
147
148


149

150
151


152
153
154
155
156
157
158
135
136
137
138
139
140
141

142
143
144
145
146
147
148
149
150

151


152
153
154
155
156
157
158
159
160







-
+






+
+
-
+
-
-
+
+







			   pid: pid)))
	       (hash-table-set! (tt-conns ttdat) dbfname conn)
	       ;; verify we can talk to this server
	       (if (tt:ping host port server-id)
		   conn
		   (let* ((curr-secs (current-seconds)))
		     ;; rm the (last server) would go here
		     (if (> (- curr-secs (tt-last-serv-start ttdat)) 30)
		     (if (> (- curr-secs (tt-last-serv-start ttdat)) 10)
			 (begin
			   (tt-last-serv-start-set! ttdat curr-secs)
			   (server-start-proc))) ;; don't try and start server unless 30 sec has gone by since last attempt
		     (thread-sleep! 1)
		     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))
	    (else
	     (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 5) ;; really do not want to swamp the machine with servers
		 (begin
	     (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
		   (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
	     (tt-last-serv-start-set! ttdat (current-seconds))
	     (server-start-proc)
		   (server-start-proc)
		   (tt-last-serv-start-set! ttdat (current-seconds))))
	     (thread-sleep! 1)
	     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
    
(define (tt:ping host port server-id)
  (let*  ((res (tt:send-receive-direct host port `(ping #f #f #f)))) ;; please send me your server-id
    ;;
    ;; need two threads, one a 5 second timer
168
169
170
171
172
173
174


175
176
177
178
179
180
181
182
183







184
185
186
187
188




189
190
191
192



193
194
195
196


197



198


199
200




201
202
203
204
205
206
207
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195




196
197
198
199
200



201
202
203
204
205
206
207
208
209

210
211
212
213
214
215


216
217
218
219
220
221
222
223
224
225
226







+
+









+
+
+
+
+
+
+

-
-
-
-
+
+
+
+

-
-
-
+
+
+




+
+
-
+
+
+

+
+
-
-
+
+
+
+







	     #f)))
      (else
       ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res)
       #f))))

;; client side handler
;;
;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
;;
(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)
  ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.
  (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f)))
    (if conn
	;; have connection, call the server
	(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
	  ;; res is (status errmsg result meta)
	  (match res
	    ((status errmsg result meta)
	     (if (list? meta)
		 (let* ((delay-wait (alist-ref 'delay-wait meta)))
		   (if (and (number? delay-wait)
			    (> delay-wait 0))
		       (begin
			 (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds")
			 (thread-sleep! delay-wait)))))
	     (case status
	       ((busy)
		(debug:print 0 *default-log-port* "WARNING: server is overloaded, will try again in few seconds.")
		(thread-sleep! 2)
		(tt:handler  ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe))
	       ((busy) ;; result will be how long the server wants you to delay
		(debug:print 0 *default-log-port* "WARNING: server is overloaded, will try again in "result" seconds.")
		(thread-sleep! (if (number? result) result 2))
		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       ((loaded)
		(debug:print 0 *default-log-port* "WARNING: server is loaded, will try again in a second.")
		(thread-sleep! 1)
		(tt:handler  ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe))
		(debug:print 0 *default-log-port* "WARNING: server is loaded, will try again in a 1/4 second.")
		(thread-sleep! 0.25)
		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       (else
		result)))
	    (else
	     (if (not res)
		 (let* ((host    (tt-conn-host conn))
			(port    (tt-conn-port conn))
		 (begin ;; let* ((srvfile (tt-conn-servinf-file ))) ;; server likely died
			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
			(pid     (tt-conn-pid  conn))
			(servinf (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname)))
		   (hash-table-set! (tt-conns ttdat) dbfname #f)
		   (if (file-exists? servinf)
		       (begin
		   (debug:print 0 *default-log-port* "INFO: connection to server broken, reconnecting.")
		   (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe))
			 (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", attempting to remove servinfo file.")
			 (delete-file* servinf))
		       (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf))
		   (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
		 (assert #f "FATAL: tt:handler received bad data "res)))))
	(begin
	  (thread-sleep! 1) ;; give it a rest and try again
	  (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))))

	;; no conn yet, find and or start and find a server
;; 	(let* ((server (tt:find-server ttdat dbfname)))
289
290
291
292
293
294
295
296

297
298
299
300
301
302
303
308
309
310
311
312
313
314

315
316
317
318
319
320
321
322







-
+







;;
(define (tt:start-server areapath run-id dbfname handler keys)
  (assert areapath "FATAL: areapath not provided for tt:start-server")
  ;; is there already a server for this dbfile? Then exit.
  (let* ((ttdat   (make-tt areapath: areapath)))
    ;; (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead
    ;; (if (null? servers)
    (let* ((dbstruct   (dbmod:open-dbmoddb areapath run-id (dbfile:db-init-proc) keys)))
    (let* ((dbstruct   (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
      (tt-handler-set! ttdat (handler dbstruct))
      (let* ((tcp-thread (make-thread
			  (lambda ()
			    (tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data
			  "tcp-server-thread"))
	     (run-thread (make-thread
			  (lambda ()
321
322
323
324
325
326
327
328


329
330
331
332
333
334
335
336
337
338
339
340

341
342
343
344

345
346
347
348
349
350

351
352
353
354
355
356
357
358
359
360
361
362
363
364
365


366

367
368
369
370
371
372






373
374
375




376
377
378
379
380
381
382
340
341
342
343
344
345
346

347
348
349
350
351
352
353
354
355
356
357
358
359

360
361
362
363

364
365
366
367
368
369

370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387

388






389
390
391
392
393
394
395
396

397
398
399
400
401
402
403
404
405
406
407







-
+
+











-
+



-
+





-
+















+
+
-
+
-
-
-
-
-
-
+
+
+
+
+
+


-
+
+
+
+







    ;; (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.")
    ;; (exit)))))
    ))

(define (tt:keep-running ttdat dbfname dbstruct)
  ;; verfiy conn for ready
  ;; listener socket has been started by this stage
  (thread-sleep! 1)
  ;; wait for a port before creating the registration file
  ;;
  (let* ((cleanup (lambda ()
		    (if (tt-cleanup-proc ttdat)
			((tt-cleanup-proc ttdat))))))
    (let loop ((count 0))
      (if (> count 60)
	  (begin
	    (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.")
	    (exit 1))
	  (if (not (tt-port ttdat)) ;; no connection yet
	      (let* ((last-update (dbr:dbstruct-last-update dbstruct))
		     (curr-secs   (current-seconds)))
		(if (> (- curr-secs last-update) 3) ;; every 3-4 seconds
		(if (> (- curr-secs last-update) 3) ;; every 3-4 seconds update the db? maybe this should be refresh the inmem?
		    (begin
		      ((dbr:dbstruct-sync-proc dbstruct) last-update)
		      (dbr:dbstruct-last-update-set! curr-secs)))
		(thread-sleep! 1)
		(thread-sleep! 0.25)
		(loop (+ count 1))))))
    
    (tt:create-server-registration-file ttdat dbfname)
    ;; now start watching the last-access, if it hasn't been touched
    ;; in over ten seconds we exit
    (thread-sleep! 1)
    (thread-sleep! 0.05) ;; any real need for delay here?
    (let loop ()
      (let* ((servers (tt:get-server-info-sorted ttdat dbfname))
	     (ok      (cond
		       ((null? servers) #f) ;; not ok
		       ((equal? (list-ref (car servers) 6) ;; compare the servinfofile
				(tt-servinf-file ttdat))
			(debug:print-info 0 *default-log-port* "Keep running, I'm the top server.")
			#t)
		       (else
			(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
			(let* ((leadsrv (car servers)))
			  (match leadsrv
			    ((host port startseconds server-id pid dbfname servinfofile)
			     (if (tt:ping host port server-id)
				 #f ;; not the server, but all good, want to exit
				 (if (and (file-exists? servinfofile)
					  (> (- (current-seconds)(file-modification-time servinfofile)) 5))
				 (begin
				     (begin
				   ;; what to do here? remove the offending file?
				   (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile)
				   (delete-file* servinfofile)
				   #t ;; not the server but the server is not reachable
				   )))
			    (else
				       ;; can't ping and file has been on disk 5 seconds, go ahead and try to remove it
				       (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile)
				       (delete-file* servinfofile)
				       #t) ;; not the server but the server is not reachable
				     #t)))
			    (else ;; should never get here
			     (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv)
			     (assert #f "Bad server record "leadsrv))))))))
	(if (not ok)
	(if ok
	    ;; (if (> *api-process-request-count* 0) ;; have requests in flight
	    ;;	(tt-last-access-set! ttdat (current-seconds)))
	    (tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access
	    (begin
	      (cleanup)
	      (exit)))
	
	(if (< (- (current-seconds) (tt-last-access ttdat)) 60)
	    (begin
	      (thread-sleep! 5)
464
465
466
467
468
469
470
471
472
473
474




475
476
477
478



479
480
481
482





483
484
485
486
487
488
489






490
491
492
493
494
495
496
497
498
499
500
501
502
503














504
505
506
507
508
509
510
511

512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527

528
529
530
531
532
533
534
489
490
491
492
493
494
495




496
497
498
499




500
501
502




503
504
505
506
507







508
509
510
511
512
513














514
515
516
517
518
519
520
521
522
523
524
525
526
527








528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543

544
545
546
547
548
549
550
551







-
-
-
-
+
+
+
+
-
-
-
-
+
+
+
-
-
-
-
+
+
+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
+















-
+







;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
;;
(define (tt:server-get-info logf)
  (let ((server-rx    (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx    (regexp "^SERVER: dbprep"))
        (dbprep-found 0)
	(bad-dat      (list #f #f #f #f #f #f logf)))
    (handle-exceptions
     exn
     (begin
       ;; WARNING: this is potentially dangerous to blanket ignore the errors
     (let ((fdat     (handle-exceptions
			 exn
		       (begin
			 ;; WARNING: this is potentially dangerous to blanket ignore the errors
       (if (file-exists? logf)
	   (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn))
       bad-dat) ;; no idea what went wrong, call it a bad server
     (with-input-from-file
			 (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn=" exn)
			 '()) ;; no idea what went wrong, call it a bad server, return empty list
		       (with-input-from-file logf read-lines))))
	 logf
       (lambda ()
	 (let loop ((inl  (read-line))
		    (lnum 0))
       (if (null? fdat) ;; bad data, return bad-dat
	   bad-dat
	   (let loop ((inl  (car fdat))
		      (tail (cdr fdat))
		      (lnum 0))
	   (if (not (eof-object? inl))
	       (let ((mlst (string-match server-rx inl))
		     (dbprep (string-match dbprep-rx inl)))
		 (if dbprep (set! dbprep-found 1))
		 (if (not mlst)
		     (if (< lnum 500) ;; give up if more than 500 lines of server log read
			 (loop (read-line)(+ lnum 1))
	     (let ((mlst (string-match server-rx inl))
		   (dbprep (string-match dbprep-rx inl)))
	       (if dbprep (set! dbprep-found 1))
	       (if (not mlst)
		   (if (> lnum 500) ;; give up if more than 500 lines of server log read
		       bad-dat
			 (begin 
                           (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf )
                           bad-dat))
		     (match mlst
			    ((_ host port start server-id pid dbfname)
			     (list host
				   (string->number port)
				   (string->number start)
				   server-id
				   (string->number pid)
				   dbfname
				   logf))
			    (else
			     (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst)
		       (if (null? tail)
			   bad-dat
			   (loop (car tail)(cdr tail)(+ lnum 1))))
		   (match mlst ;; have a not null list
		     ((_ host port start server-id pid dbfname)
		      (list host
			    (string->number port)
			    (string->number start)
			    server-id
			    (string->number pid)
			    dbfname
			    logf))
		     (else
		      (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst)
			     bad-dat))))
	       (begin 
		 (if dbprep-found
		     (begin
		       (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time))
		       (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting?
		     (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds))))
		 bad-dat))))))))
		      bad-dat)))))))))

;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
;; if the target-host is set 
;; try running on that host
;;   incidental: rotate logs in logs/ dir.
;;
(define  (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area
  (assert areapath  "FATAL: tt:server-process-run called without areapath defined.")
  (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.")
  (assert mtexe     "FATAL: tt:server-process-run called without mtexe defined.")
  (let* ((logfile   (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
	 (cmdln     (conc
		     mtexe
		     " -server - ";; (or target-host "-")
		     " -m testsuite:" testsuite
		     " -run-id " (or run-id "main")
		     ;; " -run-id " (or run-id "main") ;; NO, we do NOT want to have run id as part of this
		     " -db "  (dbmod:run-id->dbfname run-id)
		     " " profile-mode
		     ))) ;; (conc " >> " logfile " 2>&1 &")))))
    ;; we want the remote server to start in *toppath* so push there
    ;; (push-directory areapath) ;; use cd in the command line instead
    (debug:print 0 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)"...")
    ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
595
596
597
598
599
600
601


602
603
604
605
606
607
608







-
-







  (let* ((tlsn (tcp-listen port 1000 #f)) ;; (tcp-listen TCPPORT [BACKLOG [HOST]])
	 (addr  (tt:get-best-guess-address (get-host-name)))) ;; (get-my-best-address))) ;; (hostinfo-addresses (host-information (current-hostname)))
    (tt-port-set!      uconn port)
    (tt-host-set!      uconn addr)
    (tt-host-port-set! uconn (conc addr":"port))
    (tt-socket-set!    uconn tlsn)
    uconn))



;;======================================================================
;; utils
;;======================================================================

;; Generate a unique signature for this server
(define (tt:mk-signature areapath)