Megatest

Check-in [c9180c4d63]
Login
Overview
Comment:more untested cleanup
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | multi-area
Files: files | file ages | folders
SHA1: c9180c4d635c026c34acb6757810e36636b7ac3d
User & Date: matt on 2015-04-05 23:16:36
Other Links: branch diff | manifest | tags
Context
2015-04-05
23:25
More untested changes check-in: 76c7c0f408 user: matt tags: multi-area
23:16
more untested cleanup check-in: c9180c4d63 user: matt tags: multi-area
23:02
Untested cleanup check-in: da7f14af0e user: matt tags: multi-area
Changes

Modified client.scm from [141f2f00f0] to [69db902477].

158
159
160
161
162
163
164
165

166
167
168
169
170
171
172
158
159
160
161
162
163
164

165
166
167
168
169
170
171
172







-
+







  (debug:print-info 2 "client:setup remaining-tries=" remaining-tries)
  (let* ((tdbdat         (tasks:open-db area-dat))
	 (transport-type (megatest:area-transport area-dat)))
    (if (<= remaining-tries 0)
	(begin
	  (debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	  (exit 1))
	(let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
	(let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat area-dat) run-id)))
	  (debug:print-info 4 "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
	  (if server-dat
	      (let* ((iface     (tasks:hostinfo-get-interface server-dat))
		     (hostname  (tasks:hostinfo-get-hostname  server-dat))
		     (port      (tasks:hostinfo-get-port      server-dat))
		     (start-res (case transport-type
				  ((http)(http-transport:client-connect iface port))
185
186
187
188
189
190
191
192

193
194
195
196
197
198
199
185
186
187
188
189
190
191

192
193
194
195
196
197
198
199







-
+







		      start-res)
		    (begin    ;; login failed but have a server record, clean out the record and try again
		      (debug:print-info 0 "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
		      (case transport-type 
			((http)(http-transport:close-connections run-id)))
		      (common:del-remote! remote run-id)
		      (tasks:kill-server-run-id run-id)
		      (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat)
		      (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat area-dat)
							   run-id 
							   (tasks:hostinfo-get-interface server-dat)
							   (tasks:hostinfo-get-port      server-dat)
							   " client:setup (server-dat = #t)")
		      (if (> remaining-tries 8)
			  (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little
			  (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time

Modified dcommon.scm from [1f00e0f1a9] to [c6b78432ea].

521
522
523
524
525
526
527
528

529
530
531
532
533
534
535
521
522
523
524
525
526
527

528
529
530
531
532
533
534
535







-
+







	 (servers-matrix (iup:matrix #:expand "YES"
				     #:numcol 7
				     #:numcol-visible 7
				     #:numlin-visible 5
				     ))
	 (colnames       (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId"))
	 (updater        (lambda ()
			   (let ((servers (tasks:get-all-servers (db:delay-if-busy tdbdat))))
			   (let ((servers (tasks:get-all-servers (db:delay-if-busy tdbdat area-dat))))
			     (iup:attribute-set! servers-matrix "NUMLIN" (length servers))
			     ;; (set! colnum 0)
			     ;; (for-each (lambda (colname)
			     ;;    	 ;; (print "colnum: " colnum " colname: " colname)
			     ;;    	 (iup:attribute-set! servers-matrix (conc "0:" colnum) colname)
			     ;;    	 (set! colnum (+ 1 colnum)))
			     ;;           colnames)

Modified http-transport.scm from [d8101c25db] to [ac17ed4fcc].

141
142
143
144
145
146
147
148

149
150
151
152
153

154
155
156
157
158
159
160
161
162
163
164
165
166

167
168
169
170
171
172
173
141
142
143
144
145
146
147

148
149
150
151
152

153
154
155
156
157
158
159
160
161
162
163
164
165

166
167
168
169
170
171
172
173







-
+




-
+












-
+







	     ;; get_next_port goes here
	     (http-transport:try-start-server run-id
					      ipaddrstr
					      (portlogger:open-run-close portlogger:find-port area-dat)
					      server-id
					      area-dat))
	   (begin
	     (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server")
	     (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat area-dat) run-id ipaddrstr portnum " http-transport:try-start-server")
	     (print "ERROR: Tried and tried but could not start the server"))))
     ;; any error in following steps will result in a retry
     (set! *server-info* (list ipaddrstr portnum))
     (tasks:server-set-interface-port 
		     (db:delay-if-busy tdbdat)
		     (db:delay-if-busy tdbdat area-dat)
		     server-id 
		     ipaddrstr portnum)
     (debug:print 0 "INFO: Trying to start server on " ipaddrstr ":" portnum)
     ;; This starts the spiffy server
     ;; NEED WAY TO SET IP TO #f TO BIND ALL
     ;; (start-server bind-address: ipaddrstr port: portnum)
     (if config-hostname ;; this is a hint to bind directly
	 (start-server port: portnum bind-address: (if (equal? config-hostname "-")
						       ipaddrstr
						       config-hostname))
	 (start-server port: portnum))
     ;;  (portlogger:open-run-close portlogger:set-port portnum "released")
     (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) run-id ipaddrstr portnum " http-transport:try-start-server")
     (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat area-dat) run-id ipaddrstr portnum " http-transport:try-start-server")
     (debug:print 1 "INFO: server has been stopped"))))

;;======================================================================
;; S E R V E R   U T I L I T I E S 
;;======================================================================

;;======================================================================
388
389
390
391
392
393
394
395

396
397
398
399
400
401
402
388
389
390
391
392
393
394

395
396
397
398
399
400
401
402







-
+







			      sdat
                              (begin
				(debug:print-info 0 "Still waiting, last-sdat=" last-sdat)
                                (sleep 4)
				(if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
				    (begin
				      (debug:print 0 "ERROR: transport appears to have died, exiting server " server-id " for run " run-id)
				      (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature")
				      (tasks:server-delete-record (db:delay-if-busy tdbdat area-dat) server-id "failed to start, never received server alive signature")
				      (exit))
				    (loop start-time
					  (equal? sdat last-sdat)
					  sdat)))))))
         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
417
418
419
420
421
422
423
424

425
426
427
428
429
430
431
432
433
434
435
436
437
438
439

440
441
442

443
444
445
446
447
448

449
450

451
452
453
454
455
456
457
417
418
419
420
421
422
423

424
425
426
427
428
429
430
431
432
433
434
435
436
437
438

439
440
441

442
443
444
445
446
447

448
449

450
451
452
453
454
455
456
457







-
+














-
+


-
+





-
+

-
+







			    ((> bad-sync-count 10) ;; time to give up
			     (http-transport:server-shutdown server-id port area-dat))
			    (else ;; (> bad-sync-count 0)  ;; we've had a fail or two, delay and loop
			     (thread-sleep! 5)
			     (loop count server-state (+ bad-sync-count 1)))))
	     ((exn)
	      (debug:print 0 "ERROR: error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server")
	      (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running crashed")
	      (tasks:server-delete-record (db:delay-if-busy tdbdat area-dat) server-id " http-transport:keep-running crashed")
	      (exit)))
	    (set! sync-time  (- (current-milliseconds) start-time))
	    (set! rem-time (quotient (- 4000 sync-time) 1000))
	    (debug:print 2 "SYNC: time= " sync-time ", rem-time=" rem-time)
	    
	    (if (and (<= rem-time 4)
		     (> rem-time 0))
		(thread-sleep! rem-time)
		(thread-sleep! 4))) ;; fallback for if the math is changed ...

	  ;;
	  ;; no *inmemdb* yet, set running after our first pass through and start the db
	  ;;
	  (if (eq? server-state 'available)
	      (let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat) run-id))) ;; try to ensure no double registering of servers
	      (let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat area-dat) run-id))) ;; try to ensure no double registering of servers
		(if (equal? new-server-id server-id)
		    (begin
		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep")
		      (tasks:server-set-state! (db:delay-if-busy tdbdat area-dat) server-id "dbprep")
		      (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access
		      (set! *inmemdb*  (db:setup run-id))
		      ;; force initialization
		      ;; (db:get-db *inmemdb* #t)
		      (db:get-db *inmemdb* run-id)
		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running"))
		      (tasks:server-set-state! (db:delay-if-busy tdbdat area-dat) server-id "running"))
		    (begin ;; gotta exit nicely
		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision")
		      (tasks:server-set-state! (db:delay-if-busy tdbdat area-dat) server-id "collision")
		      (http-transport:server-shutdown server-id port area-dat))))))
      
      (if (< count 1) ;; 3x3 = 9 secs aprox
	  (loop (+ count 1) 'running bad-sync-count))
      
      ;; Check that iface and port have not changed (can happen if server port collides)
      (mutex-lock! *heartbeat-mutex*)
496
497
498
499
500
501
502
503

504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522

523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543

544
545
546
547
548
549

550
551
552
553
554

555
556
557
558
559
560
561
496
497
498
499
500
501
502

503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521

522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542

543
544
545
546
547
548

549
550
551
552
553

554
555
556
557
558
559
560
561







-
+


















-
+




















-
+





-
+




-
+







    (debug:print-info 0 "Starting to shutdown the server.")
    ;; need to delete only *my* server entry (future use)
    (set! *time-to-exit* #t)
    (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t))
    ;;
    ;; start_shutdown
    ;;
    (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down")
    (tasks:server-set-state! (db:delay-if-busy tdbdat area-dat) server-id "shutting-down")
    (portlogger:open-run-close portlogger:set-port area-dat port "released")
    (thread-sleep! 5)
    (debug:print-info 0 "Max cached queries was    " *max-cache-size*)
    (debug:print-info 0 "Number of cached writes   " *number-of-writes*)
    (debug:print-info 0 "Average cached write time "
		      (if (eq? *number-of-writes* 0)
			  "n/a (no writes)"
			  (/ *writes-total-delay*
			     *number-of-writes*))
		      " ms")
    (debug:print-info 0 "Number non-cached queries "  *number-non-write-queries*)
    (debug:print-info 0 "Average non-cached time   "
		      (if (eq? *number-non-write-queries* 0)
			  "n/a (no queries)"
			  (/ *total-non-write-delay* 
			     *number-non-write-queries*))
		      " ms")
    (debug:print-info 0 "Server shutdown complete. Exiting")
    (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running complete")
    (tasks:server-delete-record (db:delay-if-busy tdbdat area-dat) server-id " http-transport:keep-running complete")
    (exit)))

;; all routes though here end in exit ...
;;
;; start_server? 
;;
(define (http-transport:launch run-id area-dat)
  (let* ((tdbdat (tasks:open-db area-dat)))
    (set! *run-id*   run-id)
    (if (args:get-arg "-daemonize")
	(begin
	  (daemon:ize)
	  (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
	      (begin
		(current-error-port *alt-log-file*)
		(current-output-port *alt-log-file*)))))
    (if (server:check-if-running run-id area-dat)
	(begin
	  (debug:print 0 "INFO: Server for run-id " run-id " already running")
	  (exit 0)))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id area-dat))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat area-dat) run-id area-dat))
	       (remtries  4))
      (if (not server-id)
	  (if (> remtries 0)
	      (begin
		(thread-sleep! 2)
		(loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id area-dat)
		(loop (tasks:server-lock-slot (db:delay-if-busy tdbdat area-dat) run-id area-dat)
		      (- remtries 1)))
	      (begin
		;; since we didn't get the server lock we are going to clean up and bail out
		(debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
		(tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch")
		(tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat area-dat) " http-transport:launch")
		))
	  (let* ((th2 (make-thread (lambda ()
				     (debug:print-info 0 "Server run thread started")
				     (http-transport:run 
				      (if (args:get-arg "-server")
					  (args:get-arg "-server")
					  "-")

Modified megatest.scm from [f59b483f32] to [bf5b616a59].

687
688
689
690
691
692
693
694

695
696
697
698
699
700
701
687
688
689
690
691
692
693

694
695
696
697
698
699
700
701







-
+







;;		       (set! *megatest-db* (make-dbr:dbstruct path: toppath local: #t))))))))))

(if (or (args:get-arg "-list-servers")
	(args:get-arg "-stop-server"))
    (let ((tl (launch:setup-for-run *area-dat*)))
      (if tl 
	  (let* ((tdbdat  (tasks:open-db *area-dat*))
		 (servers (tasks:get-all-servers (db:delay-if-busy tdbdat)))
		 (servers (tasks:get-all-servers (db:delay-if-busy tdbdat *area-dat*)))
		 (fmtstr  "~5a~12a~8a~20a~24a~10a~10a~10a~10a\n")
		 (servers-to-kill '())
		 (killinfo   (args:get-arg "-stop-server"))
		 (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f))
		 (sid        (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f)))
	    (format #t fmtstr "Id" "MTver" "Pid" "Host" "Interface:OutPort" "InPort" "LastBeat" "State" "Transport")
	    (format #t fmtstr "==" "=====" "===" "====" "=================" "======" "========" "=====" "=========")
716
717
718
719
720
721
722
723

724
725

726
727
728
729
730
731
732
716
717
718
719
720
721
722

723
724

725
726
727
728
729
730
731
732







-
+

-
+







		      (killed     #f)
		      (status     (< last-update 20)))
		 ;;   (zmq-sockets (if status (server:client-connect hostname port) #f)))
		 ;; no need to login as status of #t indicates we are connecting to correct 
		 ;; server
		 (if (equal? state "dead")
		     (if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day.
			 (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid action: 'delete))
			 (tasks:server-deregister (db:delay-if-busy tdbdat *area-dat*) hostname pullport: pullport pid: pid action: 'delete))
		     (if (> last-update 20)        ;; Mark as dead if not updated in last 20 seconds
			 (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid)))
			 (tasks:server-deregister (db:delay-if-busy tdbdat *area-dat*) hostname pullport: pullport pid: pid)))
		 (format #t fmtstr id mt-ver pid hostname (conc interface ":" pullport) pubport last-update
			 (if status "alive" "dead") transport)
		 (if (or (equal? id sid)
			 (equal? sid 0)) ;; kill all/any
		     (begin
		       (debug:print-info 0 "Attempting to stop server with pid " pid)
		       (tasks:kill-server status hostname pullport pid transport)))))

Modified nmsg-transport.scm from [fa92993bb8] to [9c105ef210].

70
71
72
73
74
75
76
77
78


79
80
81
82

83
84
85
86
87
88
89
90

91
92
93
94
95
96
97
70
71
72
73
74
75
76


77
78
79
80
81

82
83
84
85
86
87
88
89

90
91
92
93
94
95
96
97







-
-
+
+



-
+







-
+







					 (nmsg-transport:try-start-server dbstruct run-id start-port server-id))
				       "server thread"))
	 (tdbdat          (tasks:open-db area-dat)))
    (thread-start! server-thread)
    (thread-sleep! 0.1)
    (if (nmsg-transport:ping hostn start-port timeout: 2 expected-key: (current-process-id))
	(let ((interface (if (equal? hostn "-")(get-host-name) hostn)))
	  (tasks:server-set-interface-port (db:delay-if-busy tdbdat) server-id interface start-port)
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep")
	  (tasks:server-set-interface-port (db:delay-if-busy tdbdat area-dat) server-id interface start-port)
	  (tasks:server-set-state! (db:delay-if-busy tdbdat area-dat) server-id "dbprep")
	  (set! *server-info* (list hostn start-port)) ;; probably not needed anymore? currently used by keep-running
	  (thread-sleep! 3) ;; give some margin for queries to complete before switching from file based access to server based access
	  ;; (set! *inmemdb*  dbstruct)
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")
	  (tasks:server-set-state! (db:delay-if-busy tdbdat area-dat) server-id "running")
	  (thread-start! (make-thread
			  (lambda ()(nmsg-transport:keep-running server-id run-id area-dat))
			  "keep running"))
	  (thread-join! server-thread))
	(if (> retrynum 0)
	    (begin
	      (debug:print 0 "WARNING: Failed to connect to server (self) on host " hostn ":" start-port ", trying again.")
	      (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature")
	      (tasks:server-delete-record (db:delay-if-busy tdbdat area-dat) server-id "failed to start, never received server alive signature")
	      (portlogger:open-run-close portlogger:set-failed area-dat start-port)
	      (nmsg-transport:run dbstruct area-dat hostn run-id server-id))
	    (begin
	      (debug:print 0 "ERROR: could not find an open port to start server on. Giving up")
	      (exit 1))))))

(define (nmsg-transport:try-start-server dbstruct run-id portnum server-id)
122
123
124
125
126
127
128
129

130
131
132
133
134
135
136

137
138
139
140
141
142
143
144

145
146
147
148
149
150
151
122
123
124
125
126
127
128

129
130
131
132
133
134
135

136
137
138
139
140
141
142
143

144
145
146
147
148
149
150
151







-
+






-
+







-
+







    ;;           (begin
    ;;     	(current-error-port *alt-log-file*)
    ;;     	(current-output-port *alt-log-file*)))))
    (if (server:check-if-running run-id area-dat)
	(begin
	  (debug:print-info 0 "Server for run-id " run-id " already running")
	  (exit 0)))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat area-dat) run-id))
	       (remtries  4))
      (if (not server-id)
	  (if (> remtries 0)
	      (begin
		(thread-sleep! 2)
		(if (not (server:check-if-running run-id area-dat))
		    (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)
		    (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat area-dat) run-id)
			  (- remtries 1))
		    (begin
		      (debug:print-info 0 "Another server took the slot, exiting")
		      (exit 0))))
	      (begin
		;; since we didn't get the server lock we are going to clean up and bail out
		(debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
		(tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch")
		(tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat area-dat) " http-transport:launch")
		))
	  ;; locked in a server id, try to start up
	  (nmsg-transport:run dbstruct area-dat hostn run-id server-id))
      (set! *didsomething* #t)
      (exit))))

;;======================================================================
301
302
303
304
305
306
307
308

309
310
311
312
313
314
315
301
302
303
304
305
306
307

308
309
310
311
312
313
314
315







-
+







            (begin
              (debug:print-info 0 "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
              (loop 0))
            (begin
              (debug:print-info 0 "Starting to shutdown the server.")
              (set! *time-to-exit* #t)
	      (db:sync-touched *inmemdb* run-id force-sync: #t)
              (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running")
              (tasks:server-delete-record (db:delay-if-busy tdbdat area-dat) server-id " http-transport:keep-running")
              (debug:print-info 0 "Server shutdown complete. Exiting")
              (exit)
	      ))))))

;;======================================================================
;; C L I E N T S
;;======================================================================

Modified olddashboard.scm from [af152b63b7] to [ff2a7db8fc].

483
484
485
486
487
488
489
490

491
492
493
494
495
496
497
483
484
485
486
487
488
489

490
491
492
493
494
495
496
497







-
+







	 (servers-matrix (iup:matrix #:expand "YES"
				     #:numcol 7
				     #:numcol-visible 7
				     #:numlin-visible 5
				     ))
	 (colnames       (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId"))
	 (updater        (lambda ()
			   (let ((servers (tasks:get-all-servers (db:delay-if-busy tdbdat))))
			   (let ((servers (tasks:get-all-servers (db:delay-if-busy tdbdat area-dat))))
			     (iup:attribute-set! servers-matrix "NUMLIN" (length servers))
			     ;; (set! colnum 0)
			     ;; (for-each (lambda (colname)
			     ;;    	 ;; (print "colnum: " colnum " colname: " colname)
			     ;;    	 (iup:attribute-set! servers-matrix (conc "0:" colnum) colname)
			     ;;    	 (set! colnum (+ 1 colnum)))
			     ;;           colnames)

Modified server.scm from [7ddbaf8b7c] to [606706ec81].

157
158
159
160
161
162
163
164

165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184

185
186
187
188
189
190
191
192
193
194
195
196
197
198

199
200
201
202
203
204
205
157
158
159
160
161
162
163

164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183

184
185
186
187
188
189
190
191
192
193
194
195
196
197

198
199
200
201
202
203
204
205







-
+



















-
+













-
+







(define (server:try-running run-id area-dat)
  (if (eq? run-id 0)
      (server:run run-id area-dat)
      (rmt:start-server run-id)))

(define (server:check-if-running run-id area-dat)
  (let ((tdbdat (tasks:open-db area-dat)))
    (let loop ((server (tasks:get-server (db:delay-if-busy tdbdat) run-id))
    (let loop ((server (tasks:get-server (db:delay-if-busy tdbdat area-dat) run-id))
	       (trycount 0))
    (if server
	;; note: client:start will set (common:get-remote remote). this needs to be changed
	;;       also, client:start will login to the server, also need to change that.
	;;
	;; client:start returns #t if login was successful.
	;;
	(let ((res (case (megatest:area-transport area-dat)
		     ((http)(server:ping-server run-id 
						(tasks:hostinfo-get-interface server)
						(tasks:hostinfo-get-port      server)))
		     ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
						 (tasks:hostinfo-get-port      server)
						 timeout: 2)))))
	  ;; if the server didn't respond we must remove the record
	  (if res
	      #t
	      (begin
		(debug:print-info 0 "server at " server " not responding, removing record")
		(tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id 
		(tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat area-dat) run-id 
				" server:check-if-running")
		res)))
	#f))))

;; called in megatest.scm, host-port is string hostname:port
;;
(define (server:ping run-id host:port area-dat)
  (let ((tdbdat (tasks:open-db area-dat)))
    (let* ((host-port (let ((slst (string-split   host:port ":")))
			(if (eq? (length slst) 2)
			    (list (car slst)(string->number (cadr slst)))
			    #f)))
	   (toppath       (launch:setup-for-run))
	   (server-db-dat (if (not host-port)(tasks:get-server (db:delay-if-busy tdbdat) run-id) #f)))
	   (server-db-dat (if (not host-port)(tasks:get-server (db:delay-if-busy tdbdat area-dat) run-id) #f)))
      (if (not run-id)
	  (begin
	    (debug:print 0 "ERROR: must specify run-id when doing ping, -run-id n")
	    (print "ERROR: No run-id")
	    (exit 1))
	  (if (and (not host-port)
		   (not server-db-dat))

Modified tasks.scm from [f2e471c669] to [9f3687b7ab].

377
378
379
380
381
382
383
384

385
386

387
388
389
390
391
392
393
394
395
396

397
398
399
400
401
402
403
377
378
379
380
381
382
383

384
385

386
387
388
389
390
391
392
393
394
395

396
397
398
399
400
401
402
403







-
+

-
+









-
+







;; 	  (debug:print-info 0 "Max avg query time of " maxqry "ms exceeds limit of " threshold "ms, server needed for run-id " run-id "."))
;;       #t)
;;      (else
;;       #f))))

;; try to start a server and wait for it to be available
;;
(define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries)
(define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries area-dat)
  ;; ensure a server is running for this run
  (let loop ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id))
  (let loop ((server-dat (tasks:get-server (db:delay-if-busy tdbdat area-dat) run-id))
	     (delay-time 0))
      (if (and (not server-dat)
	       (< delay-time delay-max-tries))
	  (begin
	    (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id)
		(debug:print 0 "Try starting server for run-id " run-id))
	    (thread-sleep! (/ (random 2000) 1000))
	    (server:kind-run run-id)
	    (thread-sleep! (min delay-time 1))
	    (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1))))))
	    (loop (tasks:get-server (db:delay-if-busy tdbdat area-dat) run-id)(+ delay-time 1))))))

(define (tasks:get-all-servers mdb)
  (let ((res '()))
    (sqlite3:for-each-row
     (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id)
       ;;                       0  1     2         3      4     5          6        7     8          9          10        11     12
       (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res)))
428
429
430
431
432
433
434
435

436
437
438
439
440

441
442
443

444
445
446
447
448
449
450
428
429
430
431
432
433
434

435
436
437
438
439

440
441
442

443
444
445
446
447
448
449
450







-
+




-
+


-
+







  (unsetenv "TARGETHOST_LOGF")
  (unsetenv "TARGETHOST"))
 
;; look up a server by run-id and send it a kill, also delete the record for that server
;;
(define (tasks:kill-server-run-id run-id area-dat #!key (tag "default"))
  (let* ((tdbdat  (tasks:open-db area-dat))
	 (sdat    (tasks:get-server (db:delay-if-busy tdbdat) run-id)))
	 (sdat    (tasks:get-server (db:delay-if-busy tdbdat area-dat) run-id)))
    (if sdat
	(let ((hostname (vector-ref sdat 6))
	      (pid      (vector-ref sdat 5))
	      (server-id (vector-ref sdat 0)))
	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed")
	  (tasks:server-set-state! (db:delay-if-busy tdbdat area-dat) server-id "killed")
	  (debug:print-info 0 "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)
	  (tasks:kill-server hostname pid)
	  (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) )
	  (tasks:server-delete-record (db:delay-if-busy tdbdat area-dat) server-id tag) )
	(debug:print-info 0 "No server found for run-id " run-id ", nothing to kill"))
    ;; (sqlite3:finalize! tdb)
    ))
    
;;======================================================================
;; M O N I T O R S
;;======================================================================
696
697
698
699
700
701
702
703

704
705
706
707
708

709
710
711
712
713
714
715
696
697
698
699
700
701
702

703
704
705
706
707

708
709
710
711
712
713
714
715







-
+




-
+







      exn
      '()
      (sqlite3:first-row db "SELECT id,action,owner,state,target,name,testpatt,keylock,params WHERE
                               params LIKE ? AND state LIKE ? AND action LIKE ? AND testpatt LIKE ?;"
			 param-key state-patt action-patt test-patt)))))


(define (tasks:find-task-queue-records dbstruct target run-name test-patt state-patt action-patt)
(define (tasks:find-task-queue-records dbstruct area-dat target run-name test-patt state-patt action-patt)
  ;; (handle-exceptions
  ;;  exn
  ;;  '()
  ;;  (sqlite3:first-row
  (let ((db (db:delay-if-busy (db:get-db dbstruct #f)))
  (let ((db (db:delay-if-busy (db:get-db dbstruct #f) area-dat))
	(res '()))
    (sqlite3:for-each-row 
     (lambda (a . b)
       (set! res (cons (cons a b) res)))
     db "SELECT id,action,owner,state,target,name,testpatt,keylock,params FROM tasks_queue 
           WHERE
              target = ? AND name = ? AND state LIKE ? AND action LIKE ? AND testpatt LIKE ?;"