Megatest

Changes On Branch ccccfdb50a36f1c8
Login

Changes In Branch v1.80-disconnect-tests Excluding Merge-Ins

This is equivalent to a diff from d310aee913 to ccccfdb50a

2023-01-16
03:32
Isolated fix from v1.80-disconnected-tests for TCP ports issue (not tested yet) check-in: da2bac2043 user: matt tags: v1.80
03:21
Merged v1.80 Leaf check-in: ccccfdb50a user: matt tags: v1.80-disconnect-tests
2023-01-15
21:45
Appear to have fixed the growing connections issue check-in: f78750d901 user: matt tags: v1.80-disconnect-tests
2023-01-13
13:37
I guess close-idle-connections is not available? don't get it but getting rid of it... check-in: d310aee913 user: mrwellan tags: v1.80, v1.8005
08:22
Yet another db in transaction problem fixed. check-in: 95b6039c92 user: matt tags: v1.80

Modified db.scm from [1553c753c1] to [ee9b271883].

90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
	  (db:hoh-set! dat key1 key2 val)))))

(define (db:hoh-get dat key1 key2)
  (let* ((subhash (hash-table-ref/default dat key1 #f)))
    (and subhash
	 (hash-table-ref/default subhash key2 #f))))

(define (db:get-cache-stmth dbdat run-id db stmt)
  (let* (;; (dbdat       (dbfile:get-dbdat dbstruct run-id))
	 (stmt-cache  (dbr:dbdat-stmt-cache dbdat))
	 (stmth       (db:hoh-get stmt-cache db stmt)))
    (or stmth
	(let* ((newstmth (sqlite3:prepare db stmt)))
	  (db:hoh-set! stmt-cache db stmt newstmth)
	  newstmth))))







|







90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
	  (db:hoh-set! dat key1 key2 val)))))

(define (db:hoh-get dat key1 key2)
  (let* ((subhash (hash-table-ref/default dat key1 #f)))
    (and subhash
	 (hash-table-ref/default subhash key2 #f))))

(define (db:get-cache-stmth dbdat db stmt)
  (let* (;; (dbdat       (dbfile:get-dbdat dbstruct run-id))
	 (stmt-cache  (dbr:dbdat-stmt-cache dbdat))
	 (stmth       (db:hoh-get stmt-cache db stmt)))
    (or stmth
	(let* ((newstmth (sqlite3:prepare db stmt)))
	  (db:hoh-set! stmt-cache db stmt newstmth)
	  newstmth))))
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
    (debug:print-info 4  *default-log-port* "running-deadtime = " running-deadtime)
    (debug:print-info 4  *default-log-port* "deadtime-trim = " deadtime-trim)

    (db:with-db 
     dbstruct run-id #f
     (lambda (dbdat db)
       (let* ((stmth1 (db:get-cache-stmth
		       dbdat run-id db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('RUNNING');"))
	      (stmth2 (db:get-cache-stmth
		       dbdat run-id db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('REMOTEHOSTSTART');"))
	      (stmth3 (db:get-cache-stmth
		       dbdat run-id db
		       "SELECT id,rundir,uname,testname,item_path FROM tests
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400
                                          AND state IN ('LAUNCHED');")))
	 ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
	 ;;
	 ;; HOWEVER: this code in run:test seems to work fine
	 ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)







|




|




|







1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
    (debug:print-info 4  *default-log-port* "running-deadtime = " running-deadtime)
    (debug:print-info 4  *default-log-port* "deadtime-trim = " deadtime-trim)

    (db:with-db 
     dbstruct run-id #f
     (lambda (dbdat db)
       (let* ((stmth1 (db:get-cache-stmth
		       dbdat db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('RUNNING');"))
	      (stmth2 (db:get-cache-stmth
		       dbdat db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('REMOTEHOSTSTART');"))
	      (stmth3 (db:get-cache-stmth
		       dbdat db
		       "SELECT id,rundir,uname,testname,item_path FROM tests
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400
                                          AND state IN ('LAUNCHED');")))
	 ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
	 ;;
	 ;; HOWEVER: this code in run:test seems to work fine
	 ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
  (let* ((res      #f))
    (db:with-db
     dbstruct #f #f  ;; for the moment vars are only stored in main.db
     (lambda (dbdat db)
       (sqlite3:for-each-row
        (lambda (val)
          (set! res val))
        db
        "SELECT val FROM metadat WHERE var=?;" var)
       ;; convert to number if can
       (if (string? res)
           (let ((valnum (string->number res)))
             (if valnum (set! res valnum))))
       res))))

(define (db:inc-var dbstruct var)







<
|







1357
1358
1359
1360
1361
1362
1363

1364
1365
1366
1367
1368
1369
1370
1371
  (let* ((res      #f))
    (db:with-db
     dbstruct #f #f  ;; for the moment vars are only stored in main.db
     (lambda (dbdat db)
       (sqlite3:for-each-row
        (lambda (val)
          (set! res val))

        (db:get-cache-stmth dbdat db "SELECT val FROM metadat WHERE var=?;") var)
       ;; convert to number if can
       (if (string? res)
           (let ((valnum (string->number res)))
             (if valnum (set! res valnum))))
       res))))

(define (db:inc-var dbstruct var)
1390
1391
1392
1393
1394
1395
1396
1397

1398
1399
1400
1401
1402
1403
1404
;; 	(begin
;; 	  (debug:print-info 4 *default-log-port* "launch throttle factor=" *global-delta*)
;; 	  (set! *last-global-delta-printed* *global-delta*)))

(define (db:set-var dbstruct var val)
  (db:with-db dbstruct #f #t 
	      (lambda (dbdat db)
		(sqlite3:execute db "INSERT OR REPLACE INTO metadat (var,val) VALUES (?,?);" var val))))


(define (db:add-var dbstruct var val)
  (db:with-db dbstruct #f #t 
	      (lambda (dbdat db)
		(sqlite3:execute db "UPDATE metadat SET val=val+? WHERE var=?;" val var))))

(define (db:del-var dbstruct var)







|
>







1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
;; 	(begin
;; 	  (debug:print-info 4 *default-log-port* "launch throttle factor=" *global-delta*)
;; 	  (set! *last-global-delta-printed* *global-delta*)))

(define (db:set-var dbstruct var val)
  (db:with-db dbstruct #f #t 
	      (lambda (dbdat db)
		(sqlite3:execute (db:get-cache-stmth dbdat db "INSERT OR REPLACE INTO metadat (var,val) VALUES (?,?);")
				 var val))))

(define (db:add-var dbstruct var val)
  (db:with-db dbstruct #f #t 
	      (lambda (dbdat db)
		(sqlite3:execute db "UPDATE metadat SET val=val+? WHERE var=?;" val var))))

(define (db:del-var dbstruct var)
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
   dbstruct
   #f
   #f

   (lambda (dbdat db)
     ;; remove previous data
     
     (let* ((stmt1 (sqlite3:prepare db "DELETE FROM run_stats WHERE run_id=? AND state=? AND status=?;"))
	    (stmt2 (sqlite3:prepare db "INSERT INTO run_stats (run_id,state,status,count) VALUES (?,?,?,?);"))
	    (res
	     (sqlite3:with-transaction
	      db
	      (lambda ()
		(for-each
		 (lambda (dat)
		   (sqlite3:execute stmt1 run-id (car dat)(cadr dat))
		   (apply sqlite3:execute stmt2 run-id dat))
		 stats)))))
       (sqlite3:finalize! stmt1)
       (sqlite3:finalize! stmt2)
       ;; (mutex-unlock! *db-transaction-mutex*)
       res))))

(define (db:get-main-run-stats dbstruct run-id)
  (db:with-db
   dbstruct
   #f ;; this data comes from main







|
|









|
|







1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
   dbstruct
   #f
   #f

   (lambda (dbdat db)
     ;; remove previous data
     
     (let* ((stmt1 (db:get-cache-stmth dbdat db "DELETE FROM run_stats WHERE run_id=? AND state=? AND status=?;"))
	    (stmt2 (db:get-cache-stmth dbdat db "INSERT INTO run_stats (run_id,state,status,count) VALUES (?,?,?,?);"))
	    (res
	     (sqlite3:with-transaction
	      db
	      (lambda ()
		(for-each
		 (lambda (dat)
		   (sqlite3:execute stmt1 run-id (car dat)(cadr dat))
		   (apply sqlite3:execute stmt2 run-id dat))
		 stats)))))
       ;; (sqlite3:finalize! stmt1)
       ;; (sqlite3:finalize! stmt2)
       ;; (mutex-unlock! *db-transaction-mutex*)
       res))))

(define (db:get-main-run-stats dbstruct run-id)
  (db:with-db
   dbstruct
   #f ;; this data comes from main
2430
2431
2432
2433
2434
2435
2436
2437
2438

2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
;;      NOTE: run-id is not used
;; ;;
(define (db:test-set-state-status dbstruct run-id test-id newstate newstatus newcomment)
  (db:with-db
   dbstruct
   run-id #f
   (lambda (dbdat db)
     (db:test-set-state-status-db db run-id test-id newstate newstatus newcomment))))


(define (db:test-set-state-status-db db run-id test-id newstate newstatus newcomment)
  (cond
   ((and newstate newstatus newcomment)
    (sqlite3:execute db "UPDATE tests SET state=?,status=?,comment=? WHERE id=?;" newstate newstatus newcomment ;; (sdb:qry 'getid newcomment)
		     test-id))
   ((and newstate newstatus)
    (sqlite3:execute db "UPDATE tests SET state=?,status=? WHERE id=?;" newstate newstatus test-id))
   (else
    (if newstate   (sqlite3:execute db "UPDATE tests SET state=?   WHERE id=?;" newstate   test-id))
    (if newstatus  (sqlite3:execute db "UPDATE tests SET status=?  WHERE id=?;" newstatus  test-id))
    (if newcomment (sqlite3:execute db "UPDATE tests SET comment=? WHERE id=?;" newcomment ;; (sdb:qry 'getid newcomment)
				    test-id))))
  ;; (mt:process-triggers dbstruct run-id test-id newstate newstatus)) ;; NOTE: Moved into calling function
  )

;; NEW BEHAVIOR: Count tests running in all runs!
;;
(define (db:get-count-tests-running dbstruct run-id) ;; fastmode)
  (let* ((qry ;; (if fastmode
		;;   "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '') LIMIT 1;"
		  "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '');")) ;; )
  (db:with-db
   dbstruct
   run-id
   #f
   (lambda (dbdat db)
     (let* ((stmth (db:get-cache-stmth dbdat run-id db qry)))
       (sqlite3:first-result stmth))))))

;; NEW BEHAVIOR: Count tests running in only one run!
;;
(define (db:get-count-tests-actually-running dbstruct run-id)
  (db:with-db
   dbstruct







|

>
|


|


|

|
|
|















|







2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
;;      NOTE: run-id is not used
;; ;;
(define (db:test-set-state-status dbstruct run-id test-id newstate newstatus newcomment)
  (db:with-db
   dbstruct
   run-id #f
   (lambda (dbdat db)
     (db:test-set-state-status-db dbdat db run-id test-id newstate newstatus newcomment))))

;; dbdat needed for cached prepared statements
(define (db:test-set-state-status-db dbdat db run-id test-id newstate newstatus newcomment)
  (cond
   ((and newstate newstatus newcomment)
    (sqlite3:execute (db:get-cache-stmth dbdat db "UPDATE tests SET state=?,status=?,comment=? WHERE id=?;") newstate newstatus newcomment ;; (sdb:qry 'getid newcomment)
		     test-id))
   ((and newstate newstatus)
    (sqlite3:execute (db:get-cache-stmth dbdat db "UPDATE tests SET state=?,status=? WHERE id=?;") newstate newstatus test-id))
   (else
    (if newstate   (sqlite3:execute (db:get-cache-stmth dbdat db "UPDATE tests SET state=?   WHERE id=?;") newstate   test-id))
    (if newstatus  (sqlite3:execute (db:get-cache-stmth dbdat db "UPDATE tests SET status=?  WHERE id=?;") newstatus  test-id))
    (if newcomment (sqlite3:execute (db:get-cache-stmth dbdat db "UPDATE tests SET comment=? WHERE id=?;") newcomment ;; (sdb:qry 'getid newcomment)
				    test-id))))
  ;; (mt:process-triggers dbstruct run-id test-id newstate newstatus)) ;; NOTE: Moved into calling function
  )

;; NEW BEHAVIOR: Count tests running in all runs!
;;
(define (db:get-count-tests-running dbstruct run-id) ;; fastmode)
  (let* ((qry ;; (if fastmode
		;;   "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '') LIMIT 1;"
		  "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '');")) ;; )
  (db:with-db
   dbstruct
   run-id
   #f
   (lambda (dbdat db)
     (let* ((stmth (db:get-cache-stmth dbdat db qry)))
       (sqlite3:first-result stmth))))))

;; NEW BEHAVIOR: Count tests running in only one run!
;;
(define (db:get-count-tests-actually-running dbstruct run-id)
  (db:with-db
   dbstruct
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
		 ;;  "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? LIMIT 1;"
		  "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;")) ;; )
    (db:with-db
     dbstruct
     run-id
     #f
     (lambda (dbdat db)
       (let* ((stmth (db:get-cache-stmth dbdat run-id db qry)))
	 (sqlite3:first-result stmth run-id))))))

;; For a given testname how many items are running? Used to determine
;; probability for regenerating html
;;
(define (db:get-count-tests-running-for-testname dbstruct run-id testname)
  (db:with-db
   dbstruct
   run-id
   #f
   (lambda (dbdat db)
     (let* ((stmt "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? AND NOT (uname = 'n/a' AND item_path = '') AND testname=?;")
	    (stmth (db:get-cache-stmth dbdat run-id db stmt)))
       (sqlite3:first-result
	stmth run-id testname)))))

(define (db:get-not-completed-cnt dbstruct run-id)
(db:with-db
   dbstruct
   run-id







|












|







2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
		 ;;  "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? LIMIT 1;"
		  "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;")) ;; )
    (db:with-db
     dbstruct
     run-id
     #f
     (lambda (dbdat db)
       (let* ((stmth (db:get-cache-stmth dbdat db qry)))
	 (sqlite3:first-result stmth run-id))))))

;; For a given testname how many items are running? Used to determine
;; probability for regenerating html
;;
(define (db:get-count-tests-running-for-testname dbstruct run-id testname)
  (db:with-db
   dbstruct
   run-id
   #f
   (lambda (dbdat db)
     (let* ((stmt "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? AND NOT (uname = 'n/a' AND item_path = '') AND testname=?;")
	    (stmth (db:get-cache-stmth dbdat db stmt)))
       (sqlite3:first-result
	stmth run-id testname)))))

(define (db:get-not-completed-cnt dbstruct run-id)
(db:with-db
   dbstruct
   run-id
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
(define (db:teststep-set-status! dbstruct run-id test-id teststep-name state-in status-in comment logfile)
  (db:with-db
   dbstruct
   run-id
   #t
   (lambda (dbdat db)
     (sqlite3:execute 
      db
      "INSERT OR REPLACE into test_steps (test_id,stepname,state,status,event_time,comment,logfile) VALUES(?,?,?,?,?,?,?);"
      test-id teststep-name state-in status-in (current-seconds)
      (if comment comment "")
      (if logfile logfile "")))))



(define (db:delete-steps-for-test! dbstruct run-id test-id)







|
|







2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
(define (db:teststep-set-status! dbstruct run-id test-id teststep-name state-in status-in comment logfile)
  (db:with-db
   dbstruct
   run-id
   #t
   (lambda (dbdat db)
     (sqlite3:execute 
      (db:get-cache-stmth dbdat db
			  "INSERT OR REPLACE into test_steps (test_id,stepname,state,status,event_time,comment,logfile) VALUES(?,?,?,?,?,?,?);")
      test-id teststep-name state-in status-in (current-seconds)
      (if comment comment "")
      (if logfile logfile "")))))



(define (db:delete-steps-for-test! dbstruct run-id test-id)
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
(define (db:get-data-info-by-id dbstruct run-id test-data-id)
  (let* ((stmt        "SELECT id,test_id, category, variable, value, expected, tol, units, comment, status, type, last_update FROM test_data WHERE id=? ORDER BY id ASC;")) ;; event_time DESC,id ASC;
    (db:with-db
     dbstruct
     run-id
     #f
     (lambda (dbdat db)
       (let* ((stmth (db:get-cache-stmth dbdat #f db stmt))
	      (res   (sqlite3:fold-row
		      (lambda (res id test-id  category variable value expected tol units comment status type last-update)
			(vector id test-id  category variable value expected tol units comment status type last-update))
		      (vector #f #f #f #f #f #f #f #f #f #f #f #f)
		      stmth
		      test-data-id)))
        res)))))







|







2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
(define (db:get-data-info-by-id dbstruct run-id test-data-id)
  (let* ((stmt        "SELECT id,test_id, category, variable, value, expected, tol, units, comment, status, type, last_update FROM test_data WHERE id=? ORDER BY id ASC;")) ;; event_time DESC,id ASC;
    (db:with-db
     dbstruct
     run-id
     #f
     (lambda (dbdat db)
       (let* ((stmth (db:get-cache-stmth dbdat db stmt))
	      (res   (sqlite3:fold-row
		      (lambda (res id test-id  category variable value expected tol units comment status type last-update)
			(vector id test-id  category variable value expected tol units comment status type last-update))
		      (vector #f #f #f #f #f #f #f #f #f #f #f #f)
		      stmth
		      test-data-id)))
        res)))))
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
     dbstruct run-id #f
     (lambda (dbdat db)
       (let ((tr-res
              (sqlite3:with-transaction
               db
               (lambda ()
                 ;; NB// Pass the db so it is part fo the transaction
                 (db:test-set-state-status-db db run-id test-id state status comment) ;; this call sets the item state/status
                 (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item
                     (let* ((state-status-counts (db:get-all-state-status-counts-for-test db run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test
			    (state-statuses      (db:roll-up-rules state-status-counts state status))
                            (newstate            (car state-statuses))
                            (newstatus           (cadr state-statuses)))
		       (set! new-state-eh newstate)
		       (set! new-status-eh newstatus)
                       (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts)  " state-status-counts: "
				    (apply conc
					   (map (lambda (x)
						  (conc
                     				   (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | "))
						state-status-counts))); end debug:print
		       (if tl-test-id
			   (db:test-set-state-status-db db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct
		       ))))))
         (mutex-unlock! *db-transaction-mutex*)
         (if (and test-id state status (equal? status "AUTO")) 
             (db:test-data-rollup dbstruct run-id test-id status))
	 (if new-state-eh ;; moved from db:test-set-state-status
	      (mt:process-triggers dbstruct run-id test-id new-state-eh new-status-eh))
         tr-res)))))







|














|







3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
     dbstruct run-id #f
     (lambda (dbdat db)
       (let ((tr-res
              (sqlite3:with-transaction
               db
               (lambda ()
                 ;; NB// Pass the db so it is part fo the transaction
                 (db:test-set-state-status-db dbdat db run-id test-id state status comment) ;; this call sets the item state/status
                 (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item
                     (let* ((state-status-counts (db:get-all-state-status-counts-for-test db run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test
			    (state-statuses      (db:roll-up-rules state-status-counts state status))
                            (newstate            (car state-statuses))
                            (newstatus           (cadr state-statuses)))
		       (set! new-state-eh newstate)
		       (set! new-status-eh newstatus)
                       (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts)  " state-status-counts: "
				    (apply conc
					   (map (lambda (x)
						  (conc
                     				   (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | "))
						state-status-counts))); end debug:print
		       (if tl-test-id
			   (db:test-set-state-status-db dbdat db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct
		       ))))))
         (mutex-unlock! *db-transaction-mutex*)
         (if (and test-id state status (equal? status "AUTO")) 
             (db:test-data-rollup dbstruct run-id test-id status))
	 (if new-state-eh ;; moved from db:test-set-state-status
	      (mt:process-triggers dbstruct run-id test-id new-state-eh new-status-eh))
         tr-res)))))
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
				       (string->symbol stmtname)
				       stmtname)
				   db:queries)))
 		 (if q (car q) #f))))
    (db:with-db
     dbstruct run-id #f
     (lambda (dbdat db)
       (apply sqlite3:execute db query params)
       #t))))

;; get a summary of state and status counts to calculate a rollup
;;
(define (db:get-state-status-summary dbstruct run-id testname)
  (let ((res   '()))
    (db:with-db







|







3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
				       (string->symbol stmtname)
				       stmtname)
				   db:queries)))
 		 (if q (car q) #f))))
    (db:with-db
     dbstruct run-id #f
     (lambda (dbdat db)
       (apply sqlite3:execute (db:get-cache-stmth dbdat db query) params)
       #t))))

;; get a summary of state and status counts to calculate a rollup
;;
(define (db:get-state-status-summary dbstruct run-id testname)
  (let ((res   '()))
    (db:with-db
4649
4650
4651
4652
4653
4654
4655

4656
4657
4658
4659
4660
4661
4662
4663
					  (sqlite3:finalize! db #t)
					  ;; (vector-set! *task-db* 0 #f)
					  (set! *task-db* #f)))))
			      (if (and (not (args:get-arg "-server"))
				       *runremote*)
				  (begin
				    (debug:print-info 0 *default-log-port* "Closing all client connections...")

				    (http-client#close-all-connections!)))
                              ;; (if (and *runremote*
                              ;;          (remote-conndat *runremote*))
                              ;;     (begin
                              ;;       (http-client#close-all-connections!))) ;; for http-client
                              (if (not (eq? *default-log-port* (current-error-port)))
                                  (close-output-port *default-log-port*))
			      (set! *default-log-port* (current-error-port))) "Cleanup db exit thread"))







>
|







4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
					  (sqlite3:finalize! db #t)
					  ;; (vector-set! *task-db* 0 #f)
					  (set! *task-db* #f)))))
			      (if (and (not (args:get-arg "-server"))
				       *runremote*)
				  (begin
				    (debug:print-info 0 *default-log-port* "Closing all client connections...")
				    (http-transport:close-connections *runremote*)
				    #;(http-client#close-all-connections!)))
                              ;; (if (and *runremote*
                              ;;          (remote-conndat *runremote*))
                              ;;     (begin
                              ;;       (http-client#close-all-connections!))) ;; for http-client
                              (if (not (eq? *default-log-port* (current-error-port)))
                                  (close-output-port *default-log-port*))
			      (set! *default-log-port* (current-error-port))) "Cleanup db exit thread"))

Modified http-transport.scm from [fc1493c705] to [01e370881e].

294
295
296
297
298
299
300
301

302
303
304
305
306
307
308
						;;    (remote-conndat-set! runremote #f))
						;; Killing associated server to allow clean retry.")
						;; (tasks:kill-server-run-id run-id)  ;; better to kill the server in the logic that called this routine?
						(mutex-unlock! *http-mutex*)
						;; (signal (make-composite-condition
						;;          (make-property-condition 'commfail 'message "failed to connect to server")))
						;; "communications failed"
						(close-all-connections!)

						(db:obj->string #f))
					      (with-input-from-request ;; was dat
					       fullurl 
					       (list (cons 'key (or server-id   "thekey"))
						     (cons 'cmd cmd)
						     (cons 'params sparams))
					       read-string))







|
>







294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
						;;    (remote-conndat-set! runremote #f))
						;; Killing associated server to allow clean retry.")
						;; (tasks:kill-server-run-id run-id)  ;; better to kill the server in the logic that called this routine?
						(mutex-unlock! *http-mutex*)
						;; (signal (make-composite-condition
						;;          (make-property-condition 'commfail 'message "failed to connect to server")))
						;; "communications failed"
						;; (close-all-connections!)
						(close-connection! fullurl)
						(db:obj->string #f))
					      (with-input-from-request ;; was dat
					       fullurl 
					       (list (cons 'key (or server-id   "thekey"))
						     (cons 'cmd cmd)
						     (cons 'params sparams))
					       read-string))
355
356
357
358
359
360
361


362
363
364
365
366
367
368
369
370
    (if (vector? server-dat)
	(let ((api-dat (http-transport:server-dat-get-api-uri server-dat)))
	  (handle-exceptions
	    exn
	    (begin
	      (print-call-chain *default-log-port*)
	      (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))


	    (close-connection! api-dat)
            ;; (close-idle-connections!)
	    (remote-conndat-set! runremote #f)
	    #t))
	#f)))


(define (make-http-transport:server-dat)(make-vector 6))
(define (http-transport:server-dat-get-iface         vec)    (vector-ref  vec 0))







>
>

|







356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
    (if (vector? server-dat)
	(let ((api-dat (http-transport:server-dat-get-api-uri server-dat)))
	  (handle-exceptions
	    exn
	    (begin
	      (print-call-chain *default-log-port*)
	      (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))
	    (if (args:any-defined? "-server" "-execute" "-run")
		(debug:print-info 0 *default-log-port* "Closing connections to "api-dat))
	    (close-connection! api-dat)
	    (close-connection! (http-transport:server-dat-make-url server-dat))
	    (remote-conndat-set! runremote #f)
	    #t))
	#f)))


(define (make-http-transport:server-dat)(make-vector 6))
(define (http-transport:server-dat-get-iface         vec)    (vector-ref  vec 0))

Modified launch.scm from [9881087e2c] to [fc0040c21d].

220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
    ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)

    (let loop ((minutes   (calc-minutes))
	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
	       (disk-free (get-df (current-directory)))
               (last-sync (current-seconds)))
      ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
      (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
             (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                   (delta (abs (- load cpu-load))))
                              (if (> delta 0.1) ;; don't bother updating with small changes
                                  load
                                  #f)))
             (new-disk-free (let* ((df    (if over-time ;; only get df every 30 seconds
                                              (get-df (current-directory))
                                              disk-free))
                                   (delta (abs (- df disk-free))))
                              (if (and (> df 0)
                                       (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                  df
                                  #f)))
             (do-sync       (or new-cpu-load new-disk-free over-time))

             (test-info   (rmt:get-test-info-by-id run-id test-id))
             (state       (db:test-get-state test-info))
             (status      (db:test-get-status test-info))
             (kill-reason  "no kill reason specified")
             (kill-job?    #f))
        ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
        (cond
         ((test-get-kill-request run-id test-id)
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         ((equal? status "DEAD")
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (launch:handle-zombie-tests run-id)
        (when do-sync
          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
          ;; (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))
	  )
        
	(if kill-job? 
	    (begin
              (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
	      (mutex-lock! m)
	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this







<















|
|
|
|


<







|








|
|
<
|
<







220
221
222
223
224
225
226

227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247

248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265

266

267
268
269
270
271
272
273
    ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)

    (let loop ((minutes   (calc-minutes))
	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
	       (disk-free (get-df (current-directory)))
               (last-sync (current-seconds)))

      (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
             (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                   (delta (abs (- load cpu-load))))
                              (if (> delta 0.1) ;; don't bother updating with small changes
                                  load
                                  #f)))
             (new-disk-free (let* ((df    (if over-time ;; only get df every 30 seconds
                                              (get-df (current-directory))
                                              disk-free))
                                   (delta (abs (- df disk-free))))
                              (if (and (> df 0)
                                       (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                  df
                                  #f)))
             (do-sync       (or new-cpu-load new-disk-free over-time))
	     ;; instead of looking for KILLREQ we are looking for a file KILLREQUEST - see tests.scm test-get-kill-request
             ;; (test-info   (rmt:get-test-info-by-id run-id test-id))
             ;; (state       (db:test-get-state test-info))
             ;; (status      (db:test-get-status test-info))
             (kill-reason  "no kill reason specified")
             (kill-job?    #f))

        (cond
         ((test-get-kill-request run-id test-id)
          (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
          (set! kill-job? #t))
         ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
          (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
          (set! kill-job? #t))
         #;((equal? status "DEAD")
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (launch:handle-zombie-tests run-id)
        (when do-sync
          (with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
            (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))

          ;; (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)

	  )
        
	(if kill-job? 
	    (begin
              (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
	      (mutex-lock! m)
	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this

Modified rmt.scm from [53cc395259] to [c1074b692d].

40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
;;======================================================================
;;  S U P P O R T   F U N C T I O N S
;;======================================================================

;; if a server is either running or in the process of starting call client:setup
;; else return #f to let the calling proc know that there is no server available
;;
(define (rmt:get-connection-info areapath #!key (area-dat #f)) ;; TODO: push areapath down.
  (let* ((runremote (or area-dat *runremote*))
	 (cinfo     (if (remote? runremote)
			(remote-conndat runremote)
			#f)))
	  (if cinfo
	      cinfo
	      (if (server:check-if-running areapath)
		  (client:setup areapath)
		  #f))))







|
<
|







40
41
42
43
44
45
46
47

48
49
50
51
52
53
54
55
;;======================================================================
;;  S U P P O R T   F U N C T I O N S
;;======================================================================

;; if a server is either running or in the process of starting call client:setup
;; else return #f to let the calling proc know that there is no server available
;;
(define (rmt:get-connection-info areapath runremote) ;; TODO: push areapath down.

  (let* ((cinfo     (if (remote? runremote)
			(remote-conndat runremote)
			#f)))
	  (if cinfo
	      cinfo
	      (if (server:check-if-running areapath)
		  (client:setup areapath)
		  #f))))
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
	       (not (remote-conndat runremote)))
	  (and (not (cdr (remote-hh-dat runremote)))        ;; not on a homehost 
	       (not (remote-conndat runremote))))           ;; and no connection
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote))
      (mutex-unlock! *rmt-mutex*)
      (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up?
	  (server:start-and-wait *toppath*))
      (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http
      (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as

     ;;DOT CASE10 [label="on homehost"];
     ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10};
     ;;DOT CASE10 -> "rmt:open-qry-close-locally";
     ;; all set up if get this far, dispatch the query
     ((and (not (remote-force-server runremote))







|







259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
	       (not (remote-conndat runremote)))
	  (and (not (cdr (remote-hh-dat runremote)))        ;; not on a homehost 
	       (not (remote-conndat runremote))))           ;; and no connection
      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote))
      (mutex-unlock! *rmt-mutex*)
      (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up?
	  (server:start-and-wait *toppath*))
      (remote-conndat-set! runremote (rmt:get-connection-info *toppath* runremote)) ;; calls client:setup which calls client:setup-http
      (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as

     ;;DOT CASE10 [label="on homehost"];
     ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10};
     ;;DOT CASE10 -> "rmt:open-qry-close-locally";
     ;; all set up if get this far, dispatch the query
     ((and (not (remote-force-server runremote))

Modified tests.scm from [5c2006972a] to [20e6047a42].

1964
1965
1966
1967
1968
1969
1970


1971
1972
1973
1974
1975
1976
1977
1978
;;======================================================================
;; test steps
;;======================================================================

;; teststep-set-status! used to be here

(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)


  (let* ((testdat   (rmt:get-test-info-by-id run-id test-id)))
    (and testdat
	 (equal? (test:get-state testdat) "KILLREQ"))))

(define (test:tdb-get-rundat-count tdb)
  (if tdb
      (let ((res 0))
	(sqlite3:for-each-row







>
>
|







1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
;;======================================================================
;; test steps
;;======================================================================

;; teststep-set-status! used to be here

(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)
  (or (file-exists? (conc (getenv "MT_TEST_RUN_DIR")"/KILLREQUEST"))
      (file-exists? (conc (getenv "MT_LINKTREE")"/"(getenv "MT_TARGET")"/"(getenv "MT_RUNNAME")"/KILLREQUEST")))
  #;(let* ((testdat   (rmt:get-test-info-by-id run-id test-id)))
    (and testdat
	 (equal? (test:get-state testdat) "KILLREQ"))))

(define (test:tdb-get-rundat-count tdb)
  (if tdb
      (let ((res 0))
	(sqlite3:for-each-row