Megatest

Changes On Branch ccccfdb50a36f1c8
Login

Changes In Branch v1.80-disconnect-tests Excluding Merge-Ins

This is equivalent to a diff from d310aee913 to ccccfdb50a

2023-01-16
03:32
Isolated fix from v1.80-disconnected-tests for TCP ports issue (not tested yet) check-in: da2bac2043 user: matt tags: v1.80
03:21
Merged v1.80 Leaf check-in: ccccfdb50a user: matt tags: v1.80-disconnect-tests
2023-01-15
21:45
Appear to have fixed the growing connections issue check-in: f78750d901 user: matt tags: v1.80-disconnect-tests
2023-01-13
13:37
I guess close-idle-connections is not available? don't get it but getting rid of it... check-in: d310aee913 user: mrwellan tags: v1.80, v1.8005
08:22
Yet another db in transaction problem fixed. check-in: 95b6039c92 user: matt tags: v1.80

Modified db.scm from [1553c753c1] to [ee9b271883].

90
91
92
93
94
95
96
97

98
99
100
101
102
103
104
90
91
92
93
94
95
96

97
98
99
100
101
102
103
104







-
+







	  (db:hoh-set! dat key1 key2 val)))))

;; Two-level hash lookup: fetch the table stored under key1 in dat, then
;; fetch key2 from that inner table. Returns #f when either level has no
;; entry.
(define (db:hoh-get dat key1 key2)
  (let ((inner (hash-table-ref/default dat key1 #f)))
    (if inner
        (hash-table-ref/default inner key2 #f)
        #f)))

;; Return a prepared statement handle for the SQL text stmt on db, reusing
;; a previously prepared handle when one is cached.
;;
;; The cache is obtained from dbdat with dbr:dbdat-stmt-cache and is
;; consulted with db:hoh-get using db as the first key and stmt as the
;; second. On a miss the statement is prepared with sqlite3:prepare, stored
;; in the cache with db:hoh-set! and returned.
(define (db:get-cache-stmth dbdat run-id db stmt)
(define (db:get-cache-stmth dbdat db stmt)
  (let* (;; (dbdat       (dbfile:get-dbdat dbstruct run-id))
	 (stmt-cache  (dbr:dbdat-stmt-cache dbdat))
	 (stmth       (db:hoh-get stmt-cache db stmt)))
    (or stmth
	(let* ((newstmth (sqlite3:prepare db stmt)))
	  (db:hoh-set! stmt-cache db stmt newstmth)
	  newstmth))))
1111
1112
1113
1114
1115
1116
1117
1118

1119
1120
1121
1122
1123

1124
1125
1126
1127
1128

1129
1130
1131
1132
1133
1134
1135
1111
1112
1113
1114
1115
1116
1117

1118
1119
1120
1121
1122

1123
1124
1125
1126
1127

1128
1129
1130
1131
1132
1133
1134
1135







-
+




-
+




-
+







    (debug:print-info 4  *default-log-port* "running-deadtime = " running-deadtime)
    (debug:print-info 4  *default-log-port* "deadtime-trim = " deadtime-trim)

    (db:with-db 
     dbstruct run-id #f
     (lambda (dbdat db)
       (let* ((stmth1 (db:get-cache-stmth
		       dbdat run-id db
		       dbdat db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('RUNNING');"))
	      (stmth2 (db:get-cache-stmth
		       dbdat run-id db
		       dbdat db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('REMOTEHOSTSTART');"))
	      (stmth3 (db:get-cache-stmth
		       dbdat run-id db
		       dbdat db
		       "SELECT id,rundir,uname,testname,item_path FROM tests
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400
                                          AND state IN ('LAUNCHED');")))
	 ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
	 ;;
	 ;; HOWEVER: this code in run:test seems to work fine
	 ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
1357
1358
1359
1360
1361
1362
1363
1364
1365

1366
1367
1368
1369
1370
1371
1372
1357
1358
1359
1360
1361
1362
1363


1364
1365
1366
1367
1368
1369
1370
1371







-
-
+







  (let* ((res      #f))
    (db:with-db
     dbstruct #f #f  ;; for the moment vars are only stored in main.db
     (lambda (dbdat db)
       (sqlite3:for-each-row
        (lambda (val)
          (set! res val))
        db
        "SELECT val FROM metadat WHERE var=?;" var)
        (db:get-cache-stmth dbdat db "SELECT val FROM metadat WHERE var=?;") var)
       ;; convert to number if can
       (if (string? res)
           (let ((valnum (string->number res)))
             (if valnum (set! res valnum))))
       res))))

(define (db:inc-var dbstruct var)
1390
1391
1392
1393
1394
1395
1396
1397


1398
1399
1400
1401
1402
1403
1404
1389
1390
1391
1392
1393
1394
1395

1396
1397
1398
1399
1400
1401
1402
1403
1404







-
+
+







;; 	(begin
;; 	  (debug:print-info 4 *default-log-port* "launch throttle factor=" *global-delta*)
;; 	  (set! *last-global-delta-printed* *global-delta*)))

(define (db:set-var dbstruct var val)
  (db:with-db dbstruct #f #t 
	      (lambda (dbdat db)
		(sqlite3:execute db "INSERT OR REPLACE INTO metadat (var,val) VALUES (?,?);" var val))))
		(sqlite3:execute (db:get-cache-stmth dbdat db "INSERT OR REPLACE INTO metadat (var,val) VALUES (?,?);")
				 var val))))

;; Add val to the value currently stored for var in the metadat table.
;; Only an UPDATE is issued, so rows are changed only where var matches.
(define (db:add-var dbstruct var val)
  (let ((qry "UPDATE metadat SET val=val+? WHERE var=?;"))
    (db:with-db
     dbstruct #f #t
     (lambda (dbdat db)
       (sqlite3:execute db qry val var)))))

(define (db:del-var dbstruct var)
1812
1813
1814
1815
1816
1817
1818
1819
1820


1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831


1832
1833
1834
1835
1836
1837
1838
1812
1813
1814
1815
1816
1817
1818


1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829


1830
1831
1832
1833
1834
1835
1836
1837
1838







-
-
+
+









-
-
+
+







   dbstruct
   #f
   #f

   (lambda (dbdat db)
     ;; remove previous data
     
     (let* ((stmt1 (sqlite3:prepare db "DELETE FROM run_stats WHERE run_id=? AND state=? AND status=?;"))
	    (stmt2 (sqlite3:prepare db "INSERT INTO run_stats (run_id,state,status,count) VALUES (?,?,?,?);"))
     (let* ((stmt1 (db:get-cache-stmth dbdat db "DELETE FROM run_stats WHERE run_id=? AND state=? AND status=?;"))
	    (stmt2 (db:get-cache-stmth dbdat db "INSERT INTO run_stats (run_id,state,status,count) VALUES (?,?,?,?);"))
	    (res
	     (sqlite3:with-transaction
	      db
	      (lambda ()
		(for-each
		 (lambda (dat)
		   (sqlite3:execute stmt1 run-id (car dat)(cadr dat))
		   (apply sqlite3:execute stmt2 run-id dat))
		 stats)))))
       (sqlite3:finalize! stmt1)
       (sqlite3:finalize! stmt2)
       ;; (sqlite3:finalize! stmt1)
       ;; (sqlite3:finalize! stmt2)
       ;; (mutex-unlock! *db-transaction-mutex*)
       res))))

(define (db:get-main-run-stats dbstruct run-id)
  (db:with-db
   dbstruct
   #f ;; this data comes from main
2430
2431
2432
2433
2434
2435
2436
2437

2438

2439

2440
2441
2442

2443
2444
2445

2446
2447
2448
2449



2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465

2466
2467
2468
2469
2470
2471
2472
2430
2431
2432
2433
2434
2435
2436

2437
2438
2439

2440
2441
2442

2443
2444
2445

2446
2447



2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465

2466
2467
2468
2469
2470
2471
2472
2473







-
+

+
-
+


-
+


-
+

-
-
-
+
+
+















-
+







;;      NOTE: run-id is not used
;; ;;
(define (db:test-set-state-status dbstruct run-id test-id newstate newstatus newcomment)
  (db:with-db
   dbstruct
   run-id #f
   (lambda (dbdat db)
     (db:test-set-state-status-db db run-id test-id newstate newstatus newcomment))))
     (db:test-set-state-status-db dbdat db run-id test-id newstate newstatus newcomment))))

;; dbdat needed for cached prepared statements
(define (db:test-set-state-status-db db run-id test-id newstate newstatus newcomment)
(define (db:test-set-state-status-db dbdat db run-id test-id newstate newstatus newcomment)
  (cond
   ((and newstate newstatus newcomment)
    (sqlite3:execute db "UPDATE tests SET state=?,status=?,comment=? WHERE id=?;" newstate newstatus newcomment ;; (sdb:qry 'getid newcomment)
    (sqlite3:execute (db:get-cache-stmth dbdat db "UPDATE tests SET state=?,status=?,comment=? WHERE id=?;") newstate newstatus newcomment ;; (sdb:qry 'getid newcomment)
		     test-id))
   ((and newstate newstatus)
    (sqlite3:execute db "UPDATE tests SET state=?,status=? WHERE id=?;" newstate newstatus test-id))
    (sqlite3:execute (db:get-cache-stmth dbdat db "UPDATE tests SET state=?,status=? WHERE id=?;") newstate newstatus test-id))
   (else
    (if newstate   (sqlite3:execute db "UPDATE tests SET state=?   WHERE id=?;" newstate   test-id))
    (if newstatus  (sqlite3:execute db "UPDATE tests SET status=?  WHERE id=?;" newstatus  test-id))
    (if newcomment (sqlite3:execute db "UPDATE tests SET comment=? WHERE id=?;" newcomment ;; (sdb:qry 'getid newcomment)
    (if newstate   (sqlite3:execute (db:get-cache-stmth dbdat db "UPDATE tests SET state=?   WHERE id=?;") newstate   test-id))
    (if newstatus  (sqlite3:execute (db:get-cache-stmth dbdat db "UPDATE tests SET status=?  WHERE id=?;") newstatus  test-id))
    (if newcomment (sqlite3:execute (db:get-cache-stmth dbdat db "UPDATE tests SET comment=? WHERE id=?;") newcomment ;; (sdb:qry 'getid newcomment)
				    test-id))))
  ;; (mt:process-triggers dbstruct run-id test-id newstate newstatus)) ;; NOTE: Moved into calling function
  )

;; NEW BEHAVIOR: Count tests running in all runs!
;;
(define (db:get-count-tests-running dbstruct run-id) ;; fastmode)
  (let* ((qry ;; (if fastmode
		;;   "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '') LIMIT 1;"
		  "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '');")) ;; )
  (db:with-db
   dbstruct
   run-id
   #f
   (lambda (dbdat db)
     (let* ((stmth (db:get-cache-stmth dbdat run-id db qry)))
     (let* ((stmth (db:get-cache-stmth dbdat db qry)))
       (sqlite3:first-result stmth))))))

;; NEW BEHAVIOR: Count tests running in only one run!
;;
(define (db:get-count-tests-actually-running dbstruct run-id)
  (db:with-db
   dbstruct
2488
2489
2490
2491
2492
2493
2494
2495

2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508

2509
2510
2511
2512
2513
2514
2515
2489
2490
2491
2492
2493
2494
2495

2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508

2509
2510
2511
2512
2513
2514
2515
2516







-
+












-
+







		 ;;  "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? LIMIT 1;"
		  "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;")) ;; )
    (db:with-db
     dbstruct
     run-id
     #f
     (lambda (dbdat db)
       (let* ((stmth (db:get-cache-stmth dbdat run-id db qry)))
       (let* ((stmth (db:get-cache-stmth dbdat db qry)))
	 (sqlite3:first-result stmth run-id))))))

;; For a given testname how many items are running? Used to determine
;; probability for regenerating html
;;
(define (db:get-count-tests-running-for-testname dbstruct run-id testname)
  (db:with-db
   dbstruct
   run-id
   #f
   (lambda (dbdat db)
     (let* ((stmt "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? AND NOT (uname = 'n/a' AND item_path = '') AND testname=?;")
	    (stmth (db:get-cache-stmth dbdat run-id db stmt)))
	    (stmth (db:get-cache-stmth dbdat db stmt)))
       (sqlite3:first-result
	stmth run-id testname)))))

(define (db:get-not-completed-cnt dbstruct run-id)
(db:with-db
   dbstruct
   run-id
2797
2798
2799
2800
2801
2802
2803
2804
2805


2806
2807
2808
2809
2810
2811
2812
2798
2799
2800
2801
2802
2803
2804


2805
2806
2807
2808
2809
2810
2811
2812
2813







-
-
+
+







(define (db:teststep-set-status! dbstruct run-id test-id teststep-name state-in status-in comment logfile)
  (db:with-db
   dbstruct
   run-id
   #t
   (lambda (dbdat db)
     (sqlite3:execute 
      db
      "INSERT OR REPLACE into test_steps (test_id,stepname,state,status,event_time,comment,logfile) VALUES(?,?,?,?,?,?,?);"
      (db:get-cache-stmth dbdat db
			  "INSERT OR REPLACE into test_steps (test_id,stepname,state,status,event_time,comment,logfile) VALUES(?,?,?,?,?,?,?);")
      test-id teststep-name state-in status-in (current-seconds)
      (if comment comment "")
      (if logfile logfile "")))))



(define (db:delete-steps-for-test! dbstruct run-id test-id)
2875
2876
2877
2878
2879
2880
2881
2882

2883
2884
2885
2886
2887
2888
2889
2876
2877
2878
2879
2880
2881
2882

2883
2884
2885
2886
2887
2888
2889
2890







-
+







(define (db:get-data-info-by-id dbstruct run-id test-data-id)
  (let* ((stmt        "SELECT id,test_id, category, variable, value, expected, tol, units, comment, status, type, last_update FROM test_data WHERE id=? ORDER BY id ASC;")) ;; event_time DESC,id ASC;
    (db:with-db
     dbstruct
     run-id
     #f
     (lambda (dbdat db)
       (let* ((stmth (db:get-cache-stmth dbdat #f db stmt))
       (let* ((stmth (db:get-cache-stmth dbdat db stmt))
	      (res   (sqlite3:fold-row
		      (lambda (res id test-id  category variable value expected tol units comment status type last-update)
			(vector id test-id  category variable value expected tol units comment status type last-update))
		      (vector #f #f #f #f #f #f #f #f #f #f #f #f)
		      stmth
		      test-data-id)))
        res)))))
3223
3224
3225
3226
3227
3228
3229
3230

3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245

3246
3247
3248
3249
3250
3251
3252
3224
3225
3226
3227
3228
3229
3230

3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245

3246
3247
3248
3249
3250
3251
3252
3253







-
+














-
+







     dbstruct run-id #f
     (lambda (dbdat db)
       (let ((tr-res
              (sqlite3:with-transaction
               db
               (lambda ()
                 ;; NB// Pass the db so it is part of the transaction
                 (db:test-set-state-status-db db run-id test-id state status comment) ;; this call sets the item state/status
                 (db:test-set-state-status-db dbdat db run-id test-id state status comment) ;; this call sets the item state/status
                 (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item
                     (let* ((state-status-counts (db:get-all-state-status-counts-for-test db run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test
			    (state-statuses      (db:roll-up-rules state-status-counts state status))
                            (newstate            (car state-statuses))
                            (newstatus           (cadr state-statuses)))
		       (set! new-state-eh newstate)
		       (set! new-status-eh newstatus)
                       (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts)  " state-status-counts: "
				    (apply conc
					   (map (lambda (x)
						  (conc
                     				   (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | "))
						state-status-counts))); end debug:print
		       (if tl-test-id
			   (db:test-set-state-status-db db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct
			   (db:test-set-state-status-db dbdat db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct
		       ))))))
         (mutex-unlock! *db-transaction-mutex*)
         (if (and test-id state status (equal? status "AUTO")) 
             (db:test-data-rollup dbstruct run-id test-id status))
	 (if new-state-eh ;; moved from db:test-set-state-status
	      (mt:process-triggers dbstruct run-id test-id new-state-eh new-status-eh))
         tr-res)))))
3585
3586
3587
3588
3589
3590
3591
3592

3593
3594
3595
3596
3597
3598
3599
3586
3587
3588
3589
3590
3591
3592

3593
3594
3595
3596
3597
3598
3599
3600







-
+







				       (string->symbol stmtname)
				       stmtname)
				   db:queries)))
 		 (if q (car q) #f))))
    (db:with-db
     dbstruct run-id #f
     (lambda (dbdat db)
       (apply sqlite3:execute db query params)
       (apply sqlite3:execute (db:get-cache-stmth dbdat db query) params)
       #t))))

;; get a summary of state and status counts to calculate a rollup
;;
(define (db:get-state-status-summary dbstruct run-id testname)
  (let ((res   '()))
    (db:with-db
4649
4650
4651
4652
4653
4654
4655

4656

4657
4658
4659
4660
4661
4662
4663
4650
4651
4652
4653
4654
4655
4656
4657

4658
4659
4660
4661
4662
4663
4664
4665







+
-
+







					  (sqlite3:finalize! db #t)
					  ;; (vector-set! *task-db* 0 #f)
					  (set! *task-db* #f)))))
			      (if (and (not (args:get-arg "-server"))
				       *runremote*)
				  (begin
				    (debug:print-info 0 *default-log-port* "Closing all client connections...")
				    (http-transport:close-connections *runremote*)
				    (http-client#close-all-connections!)))
				    #;(http-client#close-all-connections!)))
                              ;; (if (and *runremote*
                              ;;          (remote-conndat *runremote*))
                              ;;     (begin
                              ;;       (http-client#close-all-connections!))) ;; for http-client
                              (if (not (eq? *default-log-port* (current-error-port)))
                                  (close-output-port *default-log-port*))
			      (set! *default-log-port* (current-error-port))) "Cleanup db exit thread"))

Modified http-transport.scm from [fc1493c705] to [01e370881e].

294
295
296
297
298
299
300
301


302
303
304
305
306
307
308
294
295
296
297
298
299
300

301
302
303
304
305
306
307
308
309







-
+
+







						;;    (remote-conndat-set! runremote #f))
						;; Killing associated server to allow clean retry.")
						;; (tasks:kill-server-run-id run-id)  ;; better to kill the server in the logic that called this routine?
						(mutex-unlock! *http-mutex*)
						;; (signal (make-composite-condition
						;;          (make-property-condition 'commfail 'message "failed to connect to server")))
						;; "communications failed"
						(close-all-connections!)
						;; (close-all-connections!)
						(close-connection! fullurl)
						(db:obj->string #f))
					      (with-input-from-request ;; was dat
					       fullurl 
					       (list (cons 'key (or server-id   "thekey"))
						     (cons 'cmd cmd)
						     (cons 'params sparams))
					       read-string))
355
356
357
358
359
360
361


362
363

364
365
366
367
368
369
370
356
357
358
359
360
361
362
363
364
365

366
367
368
369
370
371
372
373







+
+

-
+







    (if (vector? server-dat)
	(let ((api-dat (http-transport:server-dat-get-api-uri server-dat)))
	  (handle-exceptions
	    exn
	    (begin
	      (print-call-chain *default-log-port*)
	      (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))
	    (if (args:any-defined? "-server" "-execute" "-run")
		(debug:print-info 0 *default-log-port* "Closing connections to "api-dat))
	    (close-connection! api-dat)
            ;; (close-idle-connections!)
	    (close-connection! (http-transport:server-dat-make-url server-dat))
	    (remote-conndat-set! runremote #f)
	    #t))
	#f)))


;; A server-dat record is a plain vector with six slots; the slots are not
;; initialised here.
(define (make-http-transport:server-dat)
  (make-vector 6))

;; Accessor for slot 0 of a server-dat vector.
(define (http-transport:server-dat-get-iface vec)
  (vector-ref vec 0))

Modified launch.scm from [9881087e2c] to [fc0040c21d].

220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246




247
248
249
250
251
252
253
254
255
256
257

258
259
260
261
262
263
264
265
266
267


268
269

270
271
272
273
274
275
276
277
220
221
222
223
224
225
226

227
228
229
230
231
232
233
234
235
236
237
238
239
240
241




242
243
244
245
246
247

248
249
250
251
252
253
254

255
256
257
258
259
260
261
262
263


264
265


266

267
268
269
270
271
272
273







-















-
-
-
-
+
+
+
+


-







-
+








-
-
+
+
-
-
+
-







    ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)

    (let loop ((minutes   (calc-minutes))
	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
	       (disk-free (get-df (current-directory)))
               (last-sync (current-seconds)))
      ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
      (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
             (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                   (delta (abs (- load cpu-load))))
                              (if (> delta 0.1) ;; don't bother updating with small changes
                                  load
                                  #f)))
             (new-disk-free (let* ((df    (if over-time ;; only get df every 30 seconds
                                              (get-df (current-directory))
                                              disk-free))
                                   (delta (abs (- df disk-free))))
                              (if (and (> df 0)
                                       (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                  df
                                  #f)))
             (do-sync       (or new-cpu-load new-disk-free over-time))

             (test-info   (rmt:get-test-info-by-id run-id test-id))
             (state       (db:test-get-state test-info))
             (status      (db:test-get-status test-info))
	     ;; instead of looking for KILLREQ we are looking for a file KILLREQUEST - see tests.scm test-get-kill-request
             ;; (test-info   (rmt:get-test-info-by-id run-id test-id))
             ;; (state       (db:test-get-state test-info))
             ;; (status      (db:test-get-status test-info))
             (kill-reason  "no kill reason specified")
             (kill-job?    #f))
        ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
        (cond
         ((test-get-kill-request run-id test-id)
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")
         #;((equal? status "DEAD")
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (launch:handle-zombie-tests run-id)
        (when do-sync
          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
          (with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
            (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
          ;; (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          ;; (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))
	  )
        
	(if kill-job? 
	    (begin
              (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
	      (mutex-lock! m)
	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this

Modified rmt.scm from [53cc395259] to [c1074b692d].

40
41
42
43
44
45
46
47

48
49

50
51
52
53
54
55
56
40
41
42
43
44
45
46

47


48
49
50
51
52
53
54
55







-
+
-
-
+







;;======================================================================
;;  S U P P O R T   F U N C T I O N S
;;======================================================================

;; if a server is either running or in the process of starting call client:setup
;; else return #f to let the calling proc know that there is no server available
;;
;; Lookup order, as visible in the body:
;;   1. if runremote satisfies remote? and remote-conndat yields a value,
;;      that existing connection data is returned unchanged;
;;   2. otherwise, if server:check-if-running reports a server for
;;      areapath, the result of (client:setup areapath) is returned;
;;   3. otherwise #f.
;;
(define (rmt:get-connection-info areapath #!key (area-dat #f)) ;; TODO: push areapath down.
(define (rmt:get-connection-info areapath runremote) ;; TODO: push areapath down.
  (let* ((runremote (or area-dat *runremote*))
	 (cinfo     (if (remote? runremote)
  (let* ((cinfo     (if (remote? runremote)
			(remote-conndat runremote)
			#f)))
	  (if cinfo
	      cinfo
	      (if (server:check-if-running areapath)
		  (client:setup areapath)
		  #f))))
260
261
262
263
264
265
266
267

268
269
270
271
272
273
274
259
260
261
262
263
264
265

266
267
268
269
270
271
272
273







-
+







	       (not (remote-conndat runremote)))
	  (and (not (cdr (remote-hh-dat runremote)))        ;; not on a homehost 
	       (not (remote-conndat runremote))))           ;; and no connection
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote))
      (mutex-unlock! *rmt-mutex*)
      (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up?
	  (server:start-and-wait *toppath*))
      (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http
      (remote-conndat-set! runremote (rmt:get-connection-info *toppath* runremote)) ;; calls client:setup which calls client:setup-http
      (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as

     ;;DOT CASE10 [label="on homehost"];
     ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10};
     ;;DOT CASE10 -> "rmt:open-qry-close-locally";
     ;; all set up if get this far, dispatch the query
     ((and (not (remote-force-server runremote))

Modified tests.scm from [5c2006972a] to [20e6047a42].

1964
1965
1966
1967
1968
1969
1970


1971

1972
1973
1974
1975
1976
1977
1978
1964
1965
1966
1967
1968
1969
1970
1971
1972

1973
1974
1975
1976
1977
1978
1979
1980







+
+
-
+







;;======================================================================
;; test steps
;;======================================================================

;; teststep-set-status! used to be here

;; Report whether a kill has been requested for the test.
;;
;; File check: true when a file named KILLREQUEST exists either in
;; $MT_TEST_RUN_DIR or in $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.
;; Database check: fetches the test record with rmt:get-test-info-by-id and
;; tests whether its state is "KILLREQ".
;; NOTE(review): where the let* form is prefixed with #; it is a datum
;; comment, so the database check is skipped and the value of the `or`
;; form is what the procedure returns - confirm this is intended.
(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)
  (or (file-exists? (conc (getenv "MT_TEST_RUN_DIR")"/KILLREQUEST"))
      (file-exists? (conc (getenv "MT_LINKTREE")"/"(getenv "MT_TARGET")"/"(getenv "MT_RUNNAME")"/KILLREQUEST")))
  (let* ((testdat   (rmt:get-test-info-by-id run-id test-id)))
  #;(let* ((testdat   (rmt:get-test-info-by-id run-id test-id)))
    (and testdat
	 (equal? (test:get-state testdat) "KILLREQ"))))

(define (test:tdb-get-rundat-count tdb)
  (if tdb
      (let ((res 0))
	(sqlite3:for-each-row