Megatest

Check-in [b5df408582]
Login
Overview
Comment:Throttle writes more than reads using mutex, avoid doing sync when writes are happening using mutex. This is a monte carlo approach. Might need tweaking.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.80
Files: files | file ages | folders
SHA1: b5df40858288274a70f7f68111b2ebde503ae2a8
User & Date: matt on 2023-04-11 21:32:21
Other Links: branch diff | manifest | tags
Context
2023-04-12
06:38
Refactored to move most of the work to scan and recover dead tests out of the servers. This commit also fixes dependencies in the Makefile. check-in: c75b58ded7 user: matt tags: v1.80
2023-04-11
21:32
Throttle writes more than reads using mutex, avoid doing sync when writes are happening using mutex. This is a monte carlo approach. Might need tweaking. check-in: b5df408582 user: matt tags: v1.80
18:25
Switch default cache db to inmem from tmp check-in: aadd589324 user: matt tags: v1.80
Changes

Modified db.scm from [2ad1867042] to [18cf960c23].

142
143
144
145
146
147
148
149

150
151
152
153
154
155
156
157
158
159
160

161
162
163
164
165
166
167
142
143
144
145
146
147
148

149
150
151
152
153
154
155
156
157
158
159

160
161
162
163
164
165
166
167







-
+










-
+








;; moved from dbfile
;;
;; ADD run-id SUPPORT
;;
;; Hand the db handle obtained through db:with-db to db:create-triggers.
;;
;; NOTE(review): this text was captured from a side-by-side diff. The two
;; consecutive "dbstruct #f ..." lines appear to be the removed (#f) and the
;; added (#t) version of the same argument line - confirm before treating
;; both as live arguments.
(define (db:create-all-triggers dbstruct)
  (db:with-db
   dbstruct #f #f
   dbstruct #f #t
   (lambda (dbdat db)
     (db:create-triggers db))))

;; Execute, in list order, the SQL found in the second element of every
;; entry of db:trigger-list against the sqlite3 handle db.
(define (db:create-triggers db)
  (let ((run-trigger-sql (lambda (entry)
                           (let ((sql (cadr entry)))
                             (sqlite3:execute db sql)))))
    (for-each run-trigger-sql db:trigger-list)))

;; Hand the db handle obtained through db:with-db to db:drop-triggers.
;;
;; NOTE(review): captured from a side-by-side diff; the two consecutive
;; "dbstruct #f ..." lines appear to be the removed (#f) and added (#t)
;; version of the same argument line - confirm.
(define (db:drop-all-triggers dbstruct)
  (db:with-db
   dbstruct #f #f
   dbstruct #f #t
   (lambda (dbdat db)
     (db:drop-triggers db))))

(define (db:have-incompletes? dbstruct run-id ovr-deadtime)
  (let* ((incompleted '())
	 (oldlaunched '())
	 (toplevels   '())
1016
1017
1018
1019
1020
1021
1022
1023

1024
1025
1026
1027
1028
1029
1030
1016
1017
1018
1019
1020
1021
1022

1023
1024
1025
1026
1027
1028
1029
1030







-
+








;; The "archived" field in tests is overloaded; 0 = not archived, > 0 archived in block with given id
;;
;; Store archive-block-id in the "archived" column of the tests row whose
;; id is test-id.
;;
;; NOTE(review): captured from a side-by-side diff; the stacked #f / #t
;; lines appear to be the removed and added value of the same db:with-db
;; flag argument - confirm.
(define (db:test-set-archive-block-id dbstruct run-id test-id archive-block-id)
  (db:with-db
   dbstruct
   run-id
   #f
   #t
   (lambda (dbdat db)
     (sqlite3:execute db "UPDATE tests SET archived=? WHERE id=?;"
		      archive-block-id test-id))))
 
;; Look up the archive block info given a block-id
;;
(define (db:test-get-archive-block-info dbstruct archive-block-id)
1143
1144
1145
1146
1147
1148
1149
1150
1151


1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195


1196
1197
1198
1199
1200
1201
1202

1203
1204
1205
1206
1207
1208

1209
1210
1211

1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
















1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272




























1273
1274
1275
1276
1277
1278
1279
1143
1144
1145
1146
1147
1148
1149


1150
1151
1152
1153
1154









































1155
1156







1157






1158



1159
































1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175





























1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210







-
-
+
+



-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
-
-
-
-
-
-
-
+
-
-
-
-
-
-
+
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







         (server-overloaded-budget 200)
         (launch-monitor-off-time (or (configf:lookup-number *configdat* "setup" "test-stats-update-period") 30))
         (launch-monitor-on-time-budget 30)
         (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget))
         (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30))
         (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default))
         (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))
         (running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period)
         )
         (running-deadtime (or deadtime-trim running-deadtime-default))) ;; two minutes (30 seconds between updates, this leaves 3x grace period)

    (debug:print-info 4  *default-log-port* "running-deadtime = " running-deadtime)
    (debug:print-info 4  *default-log-port* "deadtime-trim = " deadtime-trim)

    (db:with-db 
     dbstruct run-id #f
     (lambda (dbdat db)
       (let* ((stmth1 (db:get-cache-stmth
		       dbdat db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('RUNNING');"))
	      (stmth2 (db:get-cache-stmth
		       dbdat db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('REMOTEHOSTSTART');"))
	      (stmth3 (db:get-cache-stmth
		       dbdat db
		       "SELECT id,rundir,uname,testname,item_path FROM tests
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400
                                          AND state IN ('LAUNCHED');")))
	 ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
	 ;;
	 ;; HOWEVER: this code in run:test seems to work fine
	 ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
	 ;;                     (db:test-get-run_duration testdat)))
	 ;;                    600) 
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		(begin
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
		  (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
				    test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
				    " event-time="event-time" run-duration="run-duration))))
	  stmth1
	  run-id running-deadtime) ;; default time 720 seconds
       
    (let* ((dat (db:get-toplevels-and-incompletes dbstruct run-id running-deadtime remotehoststart-deadtime)))
      (set! oldlaunched (list-ref dat 1))
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
      (set! toplevels   (list-ref dat 2))
		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		(begin
		  (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
				    " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
				    " run-duration="run-duration)
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
      (set! incompleted (list-ref dat 0)))
	  stmth2
	  run-id remotehoststart-deadtime) ;; default time 230 seconds
	 

	 ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
	 ;;
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row
	  (lambda (test-id run-dir uname testname item-path)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		(begin
		  (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id
				    " 1 day since event_time marked")
                (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))))
	  stmth3
	  run-id)
       
	 (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, "
			   (length toplevels) " old LAUNCHED toplevel tests and "
			   (length incompleted) " tests marked RUNNING but apparently dead."))

	 ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
       ;;
	 ;; (db:delay-if-busy dbdat)
	 (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all
		(all-ids             (append min-incompleted-ids (map car oldlaunched))))
	   (if (> (length all-ids) 0)
	       (begin
		 ;; (launch:is-test-alive "localhost" 435)
		 (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ")
			      " as DEAD")
		 (for-each
                  (lambda (test-id)
    (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, "
		      (length toplevels) " old LAUNCHED toplevel tests and "
		      (length incompleted) " tests marked RUNNING but apparently dead.")
  
    ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
    ;;
    ;; (db:delay-if-busy dbdat)
    (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all
	   (all-ids             (append min-incompleted-ids (map car oldlaunched))))
      (if (> (length all-ids) 0)
	  (begin
	    ;; (launch:is-test-alive "localhost" 435)
	    (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ")
			 " as DEAD")
	    (for-each
             (lambda (test-id)
                    (let* (;; (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id))
			   (tinfo   (db:get-test-info-by-id dbstruct run-id test-id))
			   (run-dir (db:test-get-rundir     tinfo))
			   (host    (db:test-get-host       tinfo))
			   (pid     (db:test-get-process_id tinfo))
			   (result (db:get-status-from-final-status-file run-dir)))
		      (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) 
			  (begin
			    (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD")
			    (db:set-state-status-and-roll-up-items
			     dbstruct run-id test-id 'foo "COMPLETED" "PASS"
			     "Test stopped responding but it has PASSED; marking it PASS in the DB."))
			  (let ((is-alive (and (not (eq? pid 0))  ;; 0 is default in re-used field "attemptnum" where pid stored.
					       (launch:is-test-alive host pid))))
			    (if is-alive
				(debug:print 0 *default-log-port* "INFO: test " test-id " on host " host
					     " has a process on pid " pid ", NOT setting to DEAD.")
				(begin
				  (debug:print 0 *default-log-port* "INFO: test " test-id
					       " final state/status is not COMPLETED/PASS. It is " result)
				  (db:set-state-status-and-roll-up-items
				   dbstruct run-id test-id 'foo "COMPLETED" "DEAD"
				   "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
		  ;; call end of eud of run detection for posthook - from merge, is it needed?
		  ;; (launch:end-of-run-check run-id)
		  all-ids)
		 ;;call end of eud of run detection for posthook
		 (launch:end-of-run-check run-id)
		 )))))))
               (let* ((tinfo   (db:get-test-info-by-id dbstruct run-id test-id))
		      (run-dir (db:test-get-rundir     tinfo))
		      (host    (db:test-get-host       tinfo))
		      (pid     (db:test-get-process_id tinfo))
		      (result (db:get-status-from-final-status-file run-dir)))
		 (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) 
		     (begin
		       (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD")
		       (db:set-state-status-and-roll-up-items
			dbstruct run-id test-id 'foo "COMPLETED" "PASS"
			"Test stopped responding but it has PASSED; marking it PASS in the DB."))
		     (let ((is-alive (and (not (eq? pid 0))  ;; 0 is default in re-used field "attemptnum" where pid stored.
					  (launch:is-test-alive host pid))))
		       (if is-alive
			   (debug:print 0 *default-log-port* "INFO: test " test-id " on host " host
					" has a process on pid " pid ", NOT setting to DEAD.")
			   (begin
			     (debug:print 0 *default-log-port* "INFO: test " test-id
					  " final state/status is not COMPLETED/PASS. It is " result)
			     (db:set-state-status-and-roll-up-items
			      dbstruct run-id test-id 'foo "COMPLETED" "DEAD"
			      "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
	     ;; call end of eud of run detection for posthook - from merge, is it needed?
	     ;; (launch:end-of-run-check run-id)
	     all-ids)
	    ;;call end of eud of run detection for posthook
	    (launch:end-of-run-check run-id)
	    )))))

;; BUG: Probably broken - does not explicitly use run-id in the query
;;
(define (db:top-test-set-per-pf-counts dbstruct run-id test-name)
  ;; The stored 'top-test-set-per-pf-counts statement is given the same
  ;; test name for every one of its parameter slots.
  (let ((stmt-key  'top-test-set-per-pf-counts)
        (bind-vals (list test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name)))
    (db:general-call dbstruct run-id stmt-key bind-vals)))

;; Clean out old junk and vacuum the database
1525
1526
1527
1528
1529
1530
1531
1532

1533
1534
1535
1536
1537
1538
1539
1456
1457
1458
1459
1460
1461
1462

1463
1464
1465
1466
1467
1468
1469
1470







-
+







	 (qryvals   (append (list runname) (map cadr keyvals)))
	 (key=?str  (string-intersperse (map (lambda (k)(conc k "=?")) keys) " AND ")))
    ;; (debug:print 0 *default-log-port* "Got here 0.")
    (debug:print 3 *default-log-port* "keys: " keys " allvals: " allvals " keyvals: " keyvals " key=?str is " key=?str)
    (debug:print 2 *default-log-port* "NOTE: using target " (string-intersperse (map cadr keyvals) "/") " for this run")
    (if (and runname (null? (filter (lambda (x)(not x)) keyvals))) ;; there must be a better way to "apply and"
	(db:with-db
	 dbstruct #f #f
	 dbstruct #f #t
	 (lambda (dbdat db)
	   ;; (debug:print 0 *default-log-port* "Got here 1.")
	   (let ((res #f))
	     (apply sqlite3:execute db
		    (conc "INSERT OR IGNORE INTO runs (runname,state,status,owner,event_time,contour"
			  comma keystr ") VALUES (?,?,?,?,strftime('%s','now'),?" comma valslots ");")
		    allvals)
1574
1575
1576
1577
1578
1579
1580
1581

1582
1583
1584
1585
1586
1587
1588
1505
1506
1507
1508
1509
1510
1511

1512
1513
1514
1515
1516
1517
1518
1519







-
+







			  db qrystr runname)
			 res))))
      (if (null? runs)
	  (db:create-initial-run-record dbstruct runname target))
      (let* ((run-id (db:get-run-id dbstruct runname target)))
	(db:with-db
	 dbstruct
	 #f #f
	 #f #t
	 (lambda (dbdat db)
	   (for-each
	    (lambda (keyval)
	      (let* ((fieldname (car keyval))
		     (getqry    (conc "SELECT "fieldname" FROM runs WHERE id=?;"))
		     (setqry    (conc "UPDATE runs SET "fieldname"=? WHERE id=?;"))
		     (val       (cdr keyval))
1603
1604
1605
1606
1607
1608
1609
1610

1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629

1630
1631
1632
1633
1634
1635
1636
1534
1535
1536
1537
1538
1539
1540

1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559

1560
1561
1562
1563
1564
1565
1566
1567







-
+


















-
+







;; Insert a row into runs for runname. The key column names come from
;; db:get-keys and their values from splitting target on "/".
;;
;; NOTE(review): captured from a side-by-side diff; the two consecutive
;; "dbstruct #f ..." lines appear to be the removed (#f) and added (#t)
;; version of the same argument line - confirm.
(define (db:create-initial-run-record dbstruct runname target)	  
  (let* ((keys     (db:get-keys dbstruct))
     	 (targvals (string-split target "/"))
	 (keystr   (string-intersperse keys ","))
	 ;; one "?" placeholder per target piece; the column list is built from
	 ;; keys, so the two counts are presumably expected to match - verify
	 (key?str  (string-intersperse (make-list (length targvals) "?") ","))
	 (qrystr   (conc "INSERT INTO runs (runname,"keystr") VALUES (?,"key?str")")))
    (db:with-db
     dbstruct #f #f
     dbstruct #f #t
     (lambda (dbdat db)
       (apply sqlite3:execute db qrystr runname targvals)))))

;; Write test-rec (an alist keyed by column-name strings) into tests.
;; If db:get-test-id finds a row for the record's testname/item_path the
;; row is UPDATEd, otherwise a new row is INSERTed.
;;
;; NOTE(review): captured from a side-by-side diff; the two consecutive
;; "run-id #..." lines appear to be the removed (#f) and added (#t) version
;; of the same argument line - confirm.
(define (db:insert-test dbstruct run-id test-rec)
  (let* ((testname  (alist-ref "testname" test-rec equal?))
	 (item-path (alist-ref "item_path" test-rec equal?))
	 (id        (db:get-test-id dbstruct run-id testname item-path))
	 ;; "id" and "last_update" are left out of the columns that get written
	 (fieldvals (filter (lambda (x)(not (member (car x) '("id" "last_update")))) test-rec))
	 (setqry    (conc "UPDATE tests SET "(string-intersperse
					      (map (lambda (dat)
						     (conc (car dat)"=?"))
						   fieldvals)
					      ",")" WHERE id=?;"))
	 (insqry   (conc "INSERT INTO tests ("(string-intersperse (map (lambda (x) (car x)) fieldvals) ",")
			 ") VALUES ("(string-intersperse (make-list (length fieldvals) "?") ",")");")))
    ;; printed at debug level 0: the resolved id and both generated statements
    (debug:print 0 *default-log-port* "id: "id"\nset: "setqry"\ninsqry: "insqry)
    (db:with-db
     dbstruct
     run-id #f
     run-id #t
     (lambda (dbdat db)
       (if id
	   (apply sqlite3:execute db setqry (append (map cdr fieldvals) (list id)))
	   (apply sqlite3:execute db insqry (map cdr fieldvals)))))))

;; replace header and keystr with a call to runs:get-std-run-fields
;;
1834
1835
1836
1837
1838
1839
1840
1841

1842
1843
1844
1845
1846
1847
1848
1849
1765
1766
1767
1768
1769
1770
1771

1772

1773
1774
1775
1776
1777
1778
1779







-
+
-







;; input data is a list (state status count)
;;
(define (db:update-run-stats dbstruct run-id stats)
  ;; (mutex-lock! *db-transaction-mutex*)
  (db:with-db
   dbstruct
   #f
   #f
   #t

   (lambda (dbdat db)
     ;; remove previous data
     
     (let* ((stmt1 (sqlite3:prepare db "DELETE FROM run_stats WHERE run_id=? AND state=? AND status=?;"))
	    (stmt2 (sqlite3:prepare db "INSERT INTO run_stats (run_id,state,status,count) VALUES (?,?,?,?);"))
	    (res
	     (sqlite3:with-transaction
2044
2045
2046
2047
2048
2049
2050
2051

2052
2053
2054
2055
2056
2057
2058
1974
1975
1976
1977
1978
1979
1980

1981
1982
1983
1984
1985
1986
1987
1988







-
+







   (lambda (dbdat db)
     (sqlite3:execute db "UPDATE runs SET comment=? WHERE id=?;" comment ;; (sdb:qry 'getid comment)
		      run-id))))

;; does not (obviously!) remove dependent data. But why not!!?
(define (db:delete-run dbstruct run-id)
  (db:with-db
   dbstruct #f #f
   dbstruct #f #t
   (lambda (dbdat db)
     (sqlite3:with-transaction
      db
      (lambda ()
        (sqlite3:execute db "DELETE FROM test_steps WHERE test_id IN (SELECT id FROM tests WHERE run_id=?);" run-id)
        (sqlite3:execute db "DELETE FROM test_data WHERE test_id IN (SELECT id FROM tests WHERE run_id=?);"  run-id)
        (sqlite3:execute db "UPDATE tests SET state='DELETED',comment='' WHERE run_id=?;" run-id)
2076
2077
2078
2079
2080
2081
2082
2083

2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096

2097
2098
2099
2100
2101
2102
2103
2006
2007
2008
2009
2010
2011
2012

2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025

2026
2027
2028
2029
2030
2031
2032
2033







-
+












-
+







       (sqlite3:execute db "UPDATE runs SET state=? WHERE id=?;" newlockval run-id)
       (sqlite3:execute db "INSERT INTO access_log (user,accessed,args) VALUES(?,strftime('%s','now'),?);"
			user (conc newlockval " " run-id))
       (debug:print-info 1 *default-log-port* "" newlockval " run number " run-id)))))

;; Set the status of run run-id; when msg is not #f the run's comment is
;; set to msg in the same UPDATE.
;;
;; NOTE(review): captured from a side-by-side diff; the two consecutive
;; "dbstruct #f ..." lines appear to be the removed (#f) and added (#t)
;; version of the same argument line - confirm.
(define (db:set-run-status dbstruct run-id status msg)
  (db:with-db
   dbstruct #f #f
   dbstruct #f #t
   (lambda (dbdat db)
       (if msg
         (sqlite3:execute db "UPDATE runs SET status=?,comment=? WHERE id=?;" status msg run-id)
         (sqlite3:execute db "UPDATE runs SET status=? WHERE id=?;" status run-id)))))

;; Update both status and state of the runs row with id run-id, using the
;; statement handle supplied by db:get-cache-stmth for dbdat/db.
(define (db:set-run-state-status-db dbdat db run-id state status)
  (let ((update-stmth (db:get-cache-stmth
                       dbdat db "UPDATE runs SET status=?,state=? WHERE id=?;")))
    ;; bind order follows the SQL text: status, then state, then the row id
    (sqlite3:execute update-stmth status state run-id)))

;; dbstruct-level wrapper: obtains dbdat/db through db:with-db and passes
;; them to db:set-run-state-status-db.
;;
;; NOTE(review): captured from a side-by-side diff; the two consecutive
;; "dbstruct #f ..." lines appear to be the removed (#f) and added (#t)
;; version of the same argument line - confirm.
(define (db:set-run-state-status dbstruct run-id state status )
  (db:with-db
   dbstruct #f #f
   dbstruct #f #t
   (lambda (dbdat db)
     (db:set-run-state-status-db dbdat db run-id state status))))
     
(define (db:get-run-status dbstruct run-id)
  (let ((res "n/a"))
    (db:with-db
     dbstruct #f #f
2410
2411
2412
2413
2414
2415
2416
2417

2418
2419
2420
2421
2422
2423
2424
2340
2341
2342
2343
2344
2345
2346

2347
2348
2349
2350
2351
2352
2353
2354







-
+







;; Convert calling routines to get list of run-ids and loop, do not use the get-tests-for-runs
;;

;; Remove a test's dependent records and mark the test itself as deleted:
;; runs the 'delete-test-step-records and 'delete-test-data-records stored
;; statements for test-id, then sets state='DELETED', status='n/a' and an
;; empty comment on the tests row (the row is not removed here).
;;
;; NOTE(review): captured from a side-by-side diff; the two consecutive
;; "dbstruct run-id ..." lines appear to be the removed (#f) and added (#t)
;; version of the same argument line - confirm.
(define (db:delete-test-records dbstruct run-id test-id)
  (db:general-call dbstruct run-id 'delete-test-step-records (list test-id))
  (db:general-call dbstruct run-id 'delete-test-data-records (list test-id))
  (db:with-db
   dbstruct run-id #f
   dbstruct run-id #t
   (lambda (dbdat db)
     (sqlite3:execute db "UPDATE tests SET state='DELETED',status='n/a',comment='' WHERE id=?;" test-id))))

;; 
(define (db:delete-old-deleted-test-records dbstruct)
  (let ((targtime (- (current-seconds)
		     (or (configf:lookup-number *configdat* "setup" "keep-deleted-records")
2474
2475
2476
2477
2478
2479
2480
2481

2482
2483
2484
2485
2486
2487
2488
2404
2405
2406
2407
2408
2409
2410

2411
2412
2413
2414
2415
2416
2417
2418







-
+







;; ;; NB// Ultimately this will be deprecated in favor of mt:test-set-state-status-by-id
;;
;;      NOTE: run-id is not used
;; ;;
;; dbstruct-level wrapper: obtains the db handle through db:with-db and
;; passes it, with the remaining arguments, to db:test-set-state-status-db.
;;
;; NOTE(review): captured from a side-by-side diff; the two consecutive
;; "run-id #..." lines appear to be the removed (#f) and added (#t) version
;; of the same argument line - confirm.
(define (db:test-set-state-status dbstruct run-id test-id newstate newstatus newcomment)
  (db:with-db
   dbstruct
   run-id #f
   run-id #t
   (lambda (dbdat db)
     (db:test-set-state-status-db db run-id test-id newstate newstatus newcomment))))

(define (db:test-set-state-status-db db run-id test-id newstate newstatus newcomment)
  (cond
   ((and newstate newstatus newcomment)
    (sqlite3:execute db "UPDATE tests SET state=?,status=?,comment=? WHERE id=?;" newstate newstatus newcomment ;; (sdb:qry 'getid newcomment)
2627
2628
2629
2630
2631
2632
2633
2634

2635
2636
2637
2638
2639
2640
2641
2557
2558
2559
2560
2561
2562
2563

2564
2565
2566
2567
2568
2569
2570
2571







-
+







;; overload the unused attemptnum field for the process id of the runscript or 
;; ezsteps step script in progress
;;
;; Store pid in the attemptnum column of the tests row whose id is test-id.
;;
;; NOTE(review): captured from a side-by-side diff; the stacked #f / #t
;; lines appear to be the removed and added value of the same db:with-db
;; flag argument - confirm.
(define (db:test-set-top-process-pid dbstruct run-id test-id pid)
  (db:with-db
   dbstruct
   run-id
   #f
   #t
   (lambda (dbdat db)
     (sqlite3:execute db "UPDATE tests SET attemptnum=? WHERE id=?;"
		      pid test-id))))

(define (db:test-get-top-process-pid dbstruct run-id test-id)
  (db:with-db
   dbstruct
2666
2667
2668
2669
2670
2671
2672
2673

2674
2675
2676
2677
2678
2679
2680
2596
2597
2598
2599
2600
2601
2602

2603
2604
2605
2606
2607
2608
2609
2610







-
+







		#f
		(loop (car tal)(cdr tal)(+ indx 1)))))))

;; Comma-joined form of db:test-record-fields.
(define db:test-record-qry-selector
  (let ((separator ","))
    (string-intersperse db:test-record-fields separator)))

;; Replace every occurrence of old-lt with new-lt in the rundir and shortdir
;; columns of tests. The UPDATE has no WHERE clause, so it is applied to
;; all rows.
;;
;; NOTE(review): captured from a side-by-side diff; the two consecutive
;; "dbstruct   #f ..." lines appear to be the removed (#f) and added (#t)
;; version of the same argument line - confirm.
(define (db:update-tesdata-on-repilcate-db dbstruct old-lt new-lt)
  (db:with-db
   dbstruct   #f   #f
   dbstruct   #f   #t
   (lambda (dbdat db)
     (sqlite3:execute db "UPDATE tests SET rundir= replace(rundir,?,?), shortdir=replace(shortdir,?,?);"
		      old-lt new-lt  old-lt new-lt))))

;; NOTE: Use db:test-get* to access records
;; NOTE: This needs rundir decoding? Decide, decode here or where used? For the moment decode where used.
(define (db:get-all-tests-info-by-run-id dbstruct run-id)
2974
2975
2976
2977
2978
2979
2980
2981

2982
2983
2984
2985
2986
2987
2988
2904
2905
2906
2907
2908
2909
2910

2911
2912
2913
2914
2915
2916
2917
2918







-
+







;; look at the test_data status field, 
;;    if all are pass (any case) and the test status is PASS or NULL or '' then set test status to PASS.
;;    if one or more are fail (any case) then set test status to PASS, non "pass" or "fail" are ignored
(define (db:test-data-rollup dbstruct run-id test-id status)
  (let* ((fail-count 0)
	 (pass-count 0))
    (db:with-db
     dbstruct run-id #f
     dbstruct run-id #t
     (lambda (dbdat db)
       (sqlite3:for-each-row
	(lambda (fcount pcount)
	  (set! fail-count fcount)
	  (set! pass-count pcount))
	db 
	"SELECT (SELECT count(id) FROM test_data WHERE test_id=? AND status like 'fail') AS fail_count,
3073
3074
3075
3076
3077
3078
3079
3080

3081
3082
3083
3084
3085
3086
3087
3003
3004
3005
3006
3007
3008
3009

3010
3011
3012
3013
3014
3015
3016
3017







-
+







;; foo,bra,   1.2, pass, silly stuff
;; faz,bar,    10,  8mA,     ,     ,"this is a comment"
;; EOF

(define (db:csv->test-data dbstruct run-id test-id csvdata)
  (debug:print 4 *default-log-port* "test-id " test-id ", csvdata: " csvdata)
  (db:with-db
   dbstruct #f #f
   dbstruct #f #t
   (lambda (dbdat db)
     (let* ((csvlist (csv->list (make-csv-reader
				 (open-input-string csvdata)
				 '((strip-leading-whitespace? #t)
				   (strip-trailing-whitespace? #t)))))) ;; (csv->list csvdata)))
       (for-each
	(lambda (csvrow)
3300
3301
3302
3303
3304
3305
3306
3307

3308
3309
3310
3311
3312
3313
3314
3230
3231
3232
3233
3234
3235
3236

3237
3238
3239
3240
3241
3242
3243
3244







-
+







			   #f))
	 (new-state-eh #f)
	 (new-status-eh #f))
    (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) 
	(db:general-call dbstruct run-id 'set-test-start-time (list test-id)))
    (mutex-lock! *db-transaction-mutex*)
    (db:with-db
     dbstruct run-id #f
     dbstruct run-id #t
     (lambda (dbdat db)
       (let ((tr-res
              (sqlite3:with-transaction
               db
               (lambda ()
                 ;; NB// Pass the db so it is part fo the transaction
                 (db:test-set-state-status-db db run-id test-id state status comment) ;; this call sets the item state/status
3392
3393
3394
3395
3396
3397
3398
3399

3400
3401
3402
3403
3404
3405
3406
3322
3323
3324
3325
3326
3327
3328

3329
3330
3331
3332
3333
3334
3335
3336







-
+







	
	;; NB// Pass the db so it is part of the transaction
	(list newstate newstatus))))

(define (db:set-state-status-and-roll-up-run dbstruct run-id curr-state curr-status)
    (mutex-lock! *db-transaction-mutex*)
    (db:with-db
     dbstruct run-id #f
     dbstruct run-id #t
     (lambda (dbdat db)
       (let ((tr-res
              (sqlite3:with-transaction
               db
               (lambda ()
                   (let* ((state-status-counts (db:get-all-state-status-counts-for-run-db dbdat db run-id))
			  (state-statuses      (db:roll-up-rules state-status-counts #f #f ))
3662
3663
3664
3665
3666
3667
3668


3669
3670
3671
3672
3673
3674
3675
3676
3677

3678
3679
3680
3681
3682
3683
3684
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608

3609
3610
3611
3612
3613
3614
3615
3616







+
+








-
+







   ((not (equal? megatest-version calling-version))
    (list #t (conc "Login warning due to mismatch megatest version: " calling-version ", " megatest-version)))
   
    (else
    (hash-table-set! *logged-in-clients* client-signature (current-seconds))
    '(#t "successful login"))))

;; NO WAY TO KNOW IF IT MODIFIES THE DB BUT NEARLY ALL ARE UPDATES/INSERTS
;;
;; Look up the SQL registered under stmtname (a symbol, or a string that is
;; converted to a symbol) in the db:queries alist and execute it with params
;; as the bound values. Returns #t after the execute call.
;;
;; NOTE(review): captured from a side-by-side diff; the two consecutive
;; "dbstruct run-id ..." lines appear to be the removed (#f) and added (#t)
;; version of the same argument line - confirm.
(define (db:general-call dbstruct run-id stmtname params)
  ;; Why is db:lookup-query above not used here to get the query?
  (let ((query (let ((q (alist-ref (if (string? stmtname)
				       (string->symbol stmtname)
				       stmtname)
				   db:queries)))
		 ;; the SQL text is the first element of the alist entry's value;
		 ;; NOTE(review): an unknown stmtname leaves query as #f, which is
		 ;; then passed to sqlite3:execute unchanged
 		 (if q (car q) #f))))
    (db:with-db
     dbstruct run-id #f
     dbstruct run-id #t
     (lambda (dbdat db)
       (apply sqlite3:execute db query params)
       #t))))

;; get a summary of state and status counts to calculate a rollup
;;
(define (db:get-state-status-summary dbstruct run-id testname)
3882
3883
3884
3885
3886
3887
3888
3889

3890
3891
3892
3893
3894
3895
3896
3897

3898
3899
3900
3901
3902
3903
3904
3814
3815
3816
3817
3818
3819
3820

3821
3822
3823
3824
3825
3826
3827
3828

3829
3830
3831
3832
3833
3834
3835
3836







-
+







-
+







	db
	"SELECT id,testname,author,owner,description,reviewed,iterated,avg_runtime,avg_disk,tags,jobgroup FROM test_meta WHERE testname=?;"
	testname)
       res))))

;; create a new record for a given testname
;; INSERT OR IGNORE a test_meta row for testname with empty strings for the
;; other listed columns.
;;
;; NOTE(review): captured from a side-by-side diff; the two consecutive
;; "(db:with-db dbstruct #f ..." lines appear to be the removed (#f) and
;; added (#t) version of the same line, not a nested call - confirm.
(define (db:testmeta-add-record dbstruct testname)
  (db:with-db dbstruct #f #f 
  (db:with-db dbstruct #f #t
	      (lambda (dbdat db)
		(sqlite3:execute 
		 db
		 "INSERT OR IGNORE INTO test_meta (testname,author,owner,description,reviewed,iterated,avg_runtime,avg_disk,tags) VALUES (?,'','','','','','','','');" testname))))

;; update one of the testmeta fields
;; Set column `field` of the test_meta row for testname to value.
;; The column name is spliced into the SQL text with conc; only value and
;; testname are bound parameters.
;;
;; NOTE(review): captured from a side-by-side diff; the two consecutive
;; "(db:with-db dbstruct #f ..." lines appear to be the removed (#f) and
;; added (#t) version of the same line, not a nested call - confirm.
(define (db:testmeta-update-field dbstruct testname field value)
  (db:with-db dbstruct #f #f 
  (db:with-db dbstruct #f #t
	      (lambda (dbdat db)
		(sqlite3:execute 
		 db
		 (conc "UPDATE test_meta SET " field "=? WHERE testname=?;") value testname))))

(define (db:testmeta-get-all dbstruct)
  (db:with-db dbstruct #f #f 

Modified dbfile.scm from [e57985b7a3] to [1443b07658].

51
52
53
54
55
56
57
58

59
60
61
62
63
64
65
51
52
53
54
55
56
57

58
59
60
61
62
63
64
65







-
+







;;
;; Module-level settings, each held in a parameter object created with
;; make-parameter and given the default shown.
(define dbfile:testsuite-name (make-parameter #f))

(define keep-age-param        (make-parameter 10))      ;; qif file age, if over move to attic
(define num-run-dbs           (make-parameter 10))      ;; number of db's in .mtdb
(define dbfile:sync-method    (make-parameter 'attach)) ;; 'attach or 'original
(define dbfile:cache-method   (make-parameter 'inmem))  ;; 'direct
;; NOTE(review): captured from a side-by-side diff; the two dbcache-mode
;; definitions below appear to be the removed ('inmem) and added ('tmp)
;; version of the same line - confirm which default is intended.
(define dbcache-mode (make-parameter 'inmem)) ;; 'inmem, 'tmp (changes what open inmem routine does)
(define dbcache-mode (make-parameter 'tmp)) ;; 'inmem, 'tmp (changes what open inmem routine does)


;; 'original     - use old condition code
;; 'suicide-mode - create mtrah/stop-the-train with info on what went wrong
;; else use no condition code (should be production mode)
;;
(define no-condition-db-with-db (make-parameter 'suicide-mode))

Modified dbmod.scm from [96c421107b] to [297c5db2ed].

81
82
83
84
85
86
87

88
89




90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111


112
113
114
115
116
117
118
81
82
83
84
85
86
87
88


89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112


113
114
115
116
117
118
119
120
121







+
-
-
+
+
+
+




















-
-
+
+







	  (hash-table-set! *dbmod:nfs-db-handles* dbfname newdbstruct)
	  newdbstruct))))

;;======================================================================
;; The inmem one-db file per server method goes in here
;;======================================================================

;; NOTE: the r/w is now w/r, #t=db modified by query, #f=db NOT modified by query
;; Open the db for run-id via dbmod:open-db, call the dbstruct's sync-proc
;; if more than 3 seconds have passed since its recorded last update, then
;; apply proc to dbdat, the db handle and params and return proc's result.
;; When use-mutex is true the proc call is bracketed by locking and
;; unlocking *db-with-db-mutex*.
;;
;; NOTE(review): captured from a side-by-side diff. The first define header
;; with its simple "> 50" use-mutex binding appears to be the removed side;
;; the second header with the (or (and w/r ...) ...) binding appears to be
;; the added side - confirm.
(define (dbmod:with-db dbstruct run-id r/w proc params)
  (let* ((use-mutex (> *api-process-request-count* 50))
(define (dbmod:with-db dbstruct run-id w/r proc params)
  (let* ((use-mutex (or (and w/r ;; use the mutex on queries that modify the db and for sync to disk
			     (> *api-process-request-count* 5)) ;; when writes are happening throttle more
			(> *api-process-request-count* 50)))
	 (dbdat     (dbmod:open-db dbstruct run-id (dbfile:db-init-proc)))
	 (dbh       (dbr:dbdat-dbh dbdat)) ;; this will be the inmem handle
	 (dbfile    (dbr:dbdat-dbfile dbdat)))
    ;; if nfs mode do a sync if delta > 2
    ;; NOTE(review): the test below is "> 3" seconds, not 2, and no nfs-mode
    ;; check is visible in this function
    (let* ((last-update (dbr:dbstruct-last-update dbstruct))
	   (sync-proc   (dbr:dbstruct-sync-proc dbstruct))
	   (curr-secs   (current-seconds)))
      (if (> (- curr-secs last-update) 3)
	  (begin
	    (sync-proc last-update)

	    ;; MOVE THIS CALL TO INSIDE THE sync-proc CALL
	    (dbr:dbstruct-last-update-set! dbstruct curr-secs)

	    )))
    ;; NOTE(review): the sync above runs before the mutex is taken, and there
    ;; is no handler here to unlock the mutex if proc raises
    (if use-mutex (mutex-lock! *db-with-db-mutex*))
    (let* ((res (apply proc dbdat dbh params)))
      (if use-mutex (mutex-unlock! *db-with-db-mutex*))
      res)))

;; Variadic front end to dbmod:with-db: any arguments after proc are
;; collected into a list and passed on as its params argument.
;; w/r is #t when the query modifies the db, #f when it does not.
;; (A second, identical definition using the old r/w parameter name was
;; shadowed by this one and has been removed.)
(define (db:with-db dbstruct run-id w/r proc . params)
  (dbmod:with-db dbstruct run-id w/r proc params))

(define (dbmod:open-inmem-db init-proc #!optional (dbfullname #f))
  (let* ((db      (if dbfullname
		      (dbmod:safely-open-db dbfullname init-proc #t)
		      (sqlite3:open-database ":memory:")))
	 (handler (sqlite3:make-busy-timeout 3600)))
    (sqlite3:set-busy-handler! db handler)
170
171
172
173
174
175
176
177

178
179
180
181
182
183
184
173
174
175
176
177
178
179

180
181
182
183
184
185
186
187







-
+







			    (syncdir 'todisk))
  (let* ((dbstruct     (or dbstruct-in (make-dbr:dbstruct areapath: areapath)))
	 (dbfname      (or dbfname-in (dbmod:run-id->dbfname run-id)))
	 (dbpath       (dbmod:get-dbdir dbstruct))             ;; directory where all the .db files are kept
	 (dbfullname   (conc dbpath"/"dbfname)) ;; (dbmod:run-id->full-dbfname dbstruct run-id))
	 (dbexists     (file-exists? dbfullname))
	 (tmpdir       (conc "/tmp/"(current-user-name)))
	 (tmpdb        (let* ((fname (conc tmpdir"/"(current-process-id)"-"dbfname)))
	 (tmpdb        (let* ((fname (conc tmpdir"/" (string-translate areapath "/" ".")"-"(current-process-id)"-"dbfname)))
			 (if (not (file-exists? tmpdir))(create-directory tmpdir))
			 ;; check if tmpdb already exists, either delete it or
			 ;; add something to the name
			 fname))
	 (inmem        (dbmod:open-inmem-db init-proc
					    (if (eq? (dbcache-mode) 'inmem)
						#f
197
198
199
200
201
202
203
204

205

206
207
208

209
210
211
212
213
214
215
200
201
202
203
204
205
206

207
208
209
210
211
212
213
214
215
216
217
218
219
220







-
+

+



+







    (dbr:dbstruct-inmem-set!     dbstruct inmem)
    (dbr:dbstruct-ondiskdb-set!  dbstruct db)
    (dbr:dbstruct-dbfile-set!    dbstruct dbfullname)
    (dbr:dbstruct-dbfname-set!   dbstruct dbfname)
    (dbr:dbstruct-sync-proc-set! dbstruct
				 (lambda (last-update)
				   (if *sync-in-progress*
				       (debug:print 0 *default-log-port* "WARNING: overlapping calls to sync to disk")
				       (debug:print 3 *default-log-port* "WARNING: overlapping calls to sync to disk")
				       (begin
					 (mutex-lock! *db-with-db-mutex*) ;; this mutex is used when overloaded or during a query that modifies the db
					 (set! *sync-in-progress* #t)
					 (dbmod:sync-gasket tables last-update inmem db
							    dbfullname syncdir)
					 (mutex-unlock! *db-with-db-mutex*)
					 (thread-sleep! 0.5) ;; ensure at least 1/2 second down time between sync calls
					 (set! *sync-in-progress* #f)))))
    ;; (dbmod:sync-tables tables #f db inmem)
    ;; (if db
    (dbmod:sync-gasket tables #f inmem db dbfullname 'fromdest) ;; ) ;; load into inmem
    (dbr:dbstruct-last-update-set! dbstruct (current-seconds)) ;; should this be offset back in time by one second?
    dbstruct))
667
668
669
670
671
672
673











































674
675
676

































672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721



722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
		(sqlite3:execute  (db:get-cache-stmth dbdat db "UPDATE metadat SET val=val+? WHERE var=?;") val var))))

;; Delete the metadat row(s) whose var column equals var.
;; Goes through db:with-db with no run-id (#f) and the third argument #t.
(define (db:del-var dbstruct var)
  (define (delete-row dbdat db)
    (let ((stmth (db:get-cache-stmth dbdat db "DELETE FROM metadat WHERE var=?;")))
      (sqlite3:execute stmth var)))
  (db:with-db dbstruct #f #t delete-row))

;; Scan the tests table for run-id and collect tests that look stale.
;;
;;   running-deadtime         - seconds allowed beyond event_time + run_duration
;;                              before a RUNNING test is reported
;;   remotehoststart-deadtime - the same allowance for REMOTEHOSTSTART tests
;;
;; LAUNCHED tests are reported once more than 86400 seconds have passed
;; since event_time.
;;
;; A row with uname "n/a" and an empty item_path is treated as a toplevel
;; test and collected separately.
;;
;; Returns (list incompleted oldlaunched toplevels); every entry is
;; (test-id run-dir uname testname item-path run-id).
;;
(define (db:get-toplevels-and-incompletes dbstruct run-id running-deadtime remotehoststart-deadtime)
  (let* ((toplevels   '())
	 (oldlaunched '())
	 (incompleted '()))
    (db:with-db 
     dbstruct run-id #f
     (lambda (dbdat db)
       (let* ((stmth1 (db:get-cache-stmth
		       dbdat db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('RUNNING');"))
	      (stmth2 (db:get-cache-stmth
		       dbdat db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('REMOTEHOSTSTART');"))
	      (stmth3 (db:get-cache-stmth
		       dbdat db
		       "SELECT id,rundir,uname,testname,item_path FROM tests
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400
                                          AND state IN ('LAUNCHED');")))
	 ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
	 ;;
	 ;; HOWEVER: this code in run:test seems to work fine
	 ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
	 ;;                     (db:test-get-run_duration testdat)))
	 ;;                    600) 
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		(begin
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
		  (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
				    test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
				    " event-time="event-time" run-duration="run-duration))))
	  stmth1
	  run-id running-deadtime) ;; default time 720 seconds

	 ;; Same classification for REMOTEHOSTSTART. The messages name the
	 ;; state and the deadtime actually used by this query (they used to
	 ;; say RUNNING / running-deadtime).
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		  (debug:print-info 0 *default-log-port* "Found old toplevel test in REMOTEHOSTSTART state, test-id=" test-id))
		(begin
		  (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
				    " exceeded remotehoststart-deadtime "remotehoststart-deadtime" now="(current-seconds)" event-time="event-time
				    " run-duration="run-duration)
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
	  stmth2
	  run-id remotehoststart-deadtime) ;; default time 230 seconds
	 
	 ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
	 (sqlite3:for-each-row
	  (lambda (test-id run-dir uname testname item-path)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		(begin
		  (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id
				    " 1 day since event_time marked")
                  (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))))
	  stmth3
	  run-id))))
    (list incompleted oldlaunched toplevels)))

)