Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -144,11 +144,11 @@ ;; ;; ADD run-id SUPPORT ;; (define (db:create-all-triggers dbstruct) (db:with-db - dbstruct #f #f + dbstruct #f #t (lambda (dbdat db) (db:create-triggers db)))) (define (db:create-triggers db) (for-each (lambda (key) @@ -155,11 +155,11 @@ (sqlite3:execute db (cadr key))) db:trigger-list)) (define (db:drop-all-triggers dbstruct) (db:with-db - dbstruct #f #f + dbstruct #f #t (lambda (dbdat db) (db:drop-triggers db)))) (define (db:have-incompletes? dbstruct run-id ovr-deadtime) (let* ((incompleted '()) @@ -1018,11 +1018,11 @@ ;; (define (db:test-set-archive-block-id dbstruct run-id test-id archive-block-id) (db:with-db dbstruct run-id - #f + #t (lambda (dbdat db) (sqlite3:execute db "UPDATE tests SET archived=? WHERE id=?;" archive-block-id test-id)))) ;; Look up the archive block info given a block-id @@ -1145,133 +1145,64 @@ (launch-monitor-on-time-budget 30) (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget)) (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30)) (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default)) (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period))) - (running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period) - ) + (running-deadtime (or deadtime-trim running-deadtime-default))) ;; two minutes (30 seconds between updates, this leaves 3x grace period) + (debug:print-info 4 *default-log-port* "running-deadtime = " running-deadtime) (debug:print-info 4 *default-log-port* "deadtime-trim = " deadtime-trim) - (db:with-db - dbstruct run-id #f - (lambda (dbdat db) - (let* ((stmth1 (db:get-cache-stmth - dbdat db - "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests - WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) - AND state IN ('RUNNING');")) - (stmth2 (db:get-cache-stmth - dbdat db - "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests - WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) - AND state IN ('REMOTEHOSTSTART');")) - (stmth3 (db:get-cache-stmth - dbdat db - "SELECT id,rundir,uname,testname,item_path FROM tests - WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 - AND state IN ('LAUNCHED');"))) - ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes - ;; - ;; HOWEVER: this code in run:test seems to work fine - ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) - ;; (db:test-get-run_duration testdat))) - ;; 600) - ;; (db:delay-if-busy dbdat) - (sqlite3:for-each-row - (lambda (test-id run-dir uname testname item-path event-time run-duration) - (if (and (equal? uname "n/a") - (equal? item-path "")) ;; this is a toplevel test - ;; what to do with toplevel? call rollup? - (begin - (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) - (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) - (begin - (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) - (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" - test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds) - " event-time="event-time" run-duration="run-duration)))) - stmth1 - run-id running-deadtime) ;; default time 720 seconds - - (sqlite3:for-each-row - (lambda (test-id run-dir uname testname item-path event-time run-duration) - (if (and (equal? uname "n/a") - (equal? item-path "")) ;; this is a toplevel test - ;; what to do with toplevel? call rollup? - (begin - (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) - (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) - (begin - (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id - " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time - " run-duration="run-duration) - (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))) - stmth2 - run-id remotehoststart-deadtime) ;; default time 230 seconds - - ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config - ;; - ;; (db:delay-if-busy dbdat) - (sqlite3:for-each-row - (lambda (test-id run-dir uname testname item-path) - (if (and (equal? uname "n/a") - (equal? item-path "")) ;; this is a toplevel test - ;; what to do with toplevel? call rollup? - (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) - (begin - (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id - " 1 day since event_time marked") - (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))) - stmth3 - run-id) - - (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " - (length toplevels) " old LAUNCHED toplevel tests and " - (length incompleted) " tests marked RUNNING but apparently dead.")) - - ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE. - ;; - ;; (db:delay-if-busy dbdat) - (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all - (all-ids (append min-incompleted-ids (map car oldlaunched)))) - (if (> (length all-ids) 0) - (begin - ;; (launch:is-test-alive "localhost" 435) - (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") - " as DEAD") - (for-each - (lambda (test-id) - (let* (;; (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id)) - (tinfo (db:get-test-info-by-id dbstruct run-id test-id)) - (run-dir (db:test-get-rundir tinfo)) - (host (db:test-get-host tinfo)) - (pid (db:test-get-process_id tinfo)) - (result (db:get-status-from-final-status-file run-dir))) - (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) - (begin - (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD") - (db:set-state-status-and-roll-up-items - dbstruct run-id test-id 'foo "COMPLETED" "PASS" - "Test stopped responding but it has PASSED; marking it PASS in the DB.")) - (let ((is-alive (and (not (eq? pid 0)) ;; 0 is default in re-used field "attemptnum" where pid stored. - (launch:is-test-alive host pid)))) - (if is-alive - (debug:print 0 *default-log-port* "INFO: test " test-id " on host " host - " has a process on pid " pid ", NOT setting to DEAD.") - (begin - (debug:print 0 *default-log-port* "INFO: test " test-id - " final state/status is not COMPLETED/PASS. It is " result) - (db:set-state-status-and-roll-up-items - dbstruct run-id test-id 'foo "COMPLETED" "DEAD" - "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))))))) - ;; call end of eud of run detection for posthook - from merge, is it needed? - ;; (launch:end-of-run-check run-id) - all-ids) - ;;call end of eud of run detection for posthook - (launch:end-of-run-check run-id) - ))))))) + (let* ((dat (db:get-toplevels-and-incompletes dbstruct run-id running-deadtime remotehoststart-deadtime))) + (set! oldlaunched (list-ref dat 1)) + (set! toplevels (list-ref dat 2)) + (set! incompleted (list-ref dat 0))) + + (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " + (length toplevels) " old LAUNCHED toplevel tests and " + (length incompleted) " tests marked RUNNING but apparently dead.") + + ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE. + ;; + ;; (db:delay-if-busy dbdat) + (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all + (all-ids (append min-incompleted-ids (map car oldlaunched)))) + (if (> (length all-ids) 0) + (begin + ;; (launch:is-test-alive "localhost" 435) + (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") + " as DEAD") + (for-each + (lambda (test-id) + (let* ((tinfo (db:get-test-info-by-id dbstruct run-id test-id)) + (run-dir (db:test-get-rundir tinfo)) + (host (db:test-get-host tinfo)) + (pid (db:test-get-process_id tinfo)) + (result (db:get-status-from-final-status-file run-dir))) + (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) + (begin + (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD") + (db:set-state-status-and-roll-up-items + dbstruct run-id test-id 'foo "COMPLETED" "PASS" + "Test stopped responding but it has PASSED; marking it PASS in the DB.")) + (let ((is-alive (and (not (eq? pid 0)) ;; 0 is default in re-used field "attemptnum" where pid stored. + (launch:is-test-alive host pid)))) + (if is-alive + (debug:print 0 *default-log-port* "INFO: test " test-id " on host " host + " has a process on pid " pid ", NOT setting to DEAD.") + (begin + (debug:print 0 *default-log-port* "INFO: test " test-id + " final state/status is not COMPLETED/PASS. It is " result) + (db:set-state-status-and-roll-up-items + dbstruct run-id test-id 'foo "COMPLETED" "DEAD" + "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))))))) + ;; call end of eud of run detection for posthook - from merge, is it needed? + ;; (launch:end-of-run-check run-id) + all-ids) + ;;call end of eud of run detection for posthook + (launch:end-of-run-check run-id) + ))))) ;; BUG: Probably broken - does not explicitly use run-id in the query ;; (define (db:top-test-set-per-pf-counts dbstruct run-id test-name) (db:general-call dbstruct run-id 'top-test-set-per-pf-counts (list test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name))) @@ -1527,11 +1458,11 @@ ;; (debug:print 0 *default-log-port* "Got here 0.") (debug:print 3 *default-log-port* "keys: " keys " allvals: " allvals " keyvals: " keyvals " key=?str is " key=?str) (debug:print 2 *default-log-port* "NOTE: using target " (string-intersperse (map cadr keyvals) "/") " for this run") (if (and runname (null? (filter (lambda (x)(not x)) keyvals))) ;; there must be a better way to "apply and" (db:with-db - dbstruct #f #f + dbstruct #f #t (lambda (dbdat db) ;; (debug:print 0 *default-log-port* "Got here 1.") (let ((res #f)) (apply sqlite3:execute db (conc "INSERT OR IGNORE INTO runs (runname,state,status,owner,event_time,contour" @@ -1576,11 +1507,11 @@ (if (null? runs) (db:create-initial-run-record dbstruct runname target)) (let* ((run-id (db:get-run-id dbstruct runname target))) (db:with-db dbstruct - #f #f + #f #t (lambda (dbdat db) (for-each (lambda (keyval) (let* ((fieldname (car keyval)) (getqry (conc "SELECT "fieldname" FROM runs WHERE id=?;")) @@ -1605,11 +1536,11 @@ (targvals (string-split target "/")) (keystr (string-intersperse keys ",")) (key?str (string-intersperse (make-list (length targvals) "?") ",")) (qrystr (conc "INSERT INTO runs (runname,"keystr") VALUES (?,"key?str")"))) (db:with-db - dbstruct #f #f + dbstruct #f #t (lambda (dbdat db) (apply sqlite3:execute db qrystr runname targvals))))) (define (db:insert-test dbstruct run-id test-rec) (let* ((testname (alist-ref "testname" test-rec equal?)) @@ -1624,11 +1555,11 @@ (insqry (conc "INSERT INTO tests ("(string-intersperse (map (lambda (x) (car x)) fieldvals) ",") ") VALUES ("(string-intersperse (make-list (length fieldvals) "?") ",")");"))) (debug:print 0 *default-log-port* "id: "id"\nset: "setqry"\ninsqry: "insqry) (db:with-db dbstruct - run-id #f + run-id #t (lambda (dbdat db) (if id (apply sqlite3:execute db setqry (append (map cdr fieldvals) (list id))) (apply sqlite3:execute db insqry (map cdr fieldvals))))))) @@ -1836,12 +1767,11 @@ (define (db:update-run-stats dbstruct run-id stats) ;; (mutex-lock! *db-transaction-mutex*) (db:with-db dbstruct #f - #f - + #t (lambda (dbdat db) ;; remove previous data (let* ((stmt1 (sqlite3:prepare db "DELETE FROM run_stats WHERE run_id=? AND state=? AND status=?;")) (stmt2 (sqlite3:prepare db "INSERT INTO run_stats (run_id,state,status,count) VALUES (?,?,?,?);")) @@ -2046,11 +1976,11 @@ run-id)))) ;; does not (obviously!) removed dependent data. But why not!!? (define (db:delete-run dbstruct run-id) (db:with-db - dbstruct #f #f + dbstruct #f #t (lambda (dbdat db) (sqlite3:with-transaction db (lambda () (sqlite3:execute db "DELETE FROM test_steps WHERE test_id IN (SELECT id FROM tests WHERE run_id=?);" run-id) @@ -2078,11 +2008,11 @@ user (conc newlockval " " run-id)) (debug:print-info 1 *default-log-port* "" newlockval " run number " run-id))))) (define (db:set-run-status dbstruct run-id status msg) (db:with-db - dbstruct #f #f + dbstruct #f #t (lambda (dbdat db) (if msg (sqlite3:execute db "UPDATE runs SET status=?,comment=? WHERE id=?;" status msg run-id) (sqlite3:execute db "UPDATE runs SET status=? WHERE id=?;" status run-id))))) @@ -2091,11 +2021,11 @@ (db:get-cache-stmth dbdat db "UPDATE runs SET status=?,state=? WHERE id=?;") status state run-id)) (define (db:set-run-state-status dbstruct run-id state status ) (db:with-db - dbstruct #f #f + dbstruct #f #t (lambda (dbdat db) (db:set-run-state-status-db dbdat db run-id state status)))) (define (db:get-run-status dbstruct run-id) (let ((res "n/a")) @@ -2412,11 +2342,11 @@ (define (db:delete-test-records dbstruct run-id test-id) (db:general-call dbstruct run-id 'delete-test-step-records (list test-id)) (db:general-call dbstruct run-id 'delete-test-data-records (list test-id)) (db:with-db - dbstruct run-id #f + dbstruct run-id #t (lambda (dbdat db) (sqlite3:execute db "UPDATE tests SET state='DELETED',status='n/a',comment='' WHERE id=?;" test-id)))) ;; (define (db:delete-old-deleted-test-records dbstruct) @@ -2476,11 +2406,11 @@ ;; NOTE: run-id is not used ;; ;; (define (db:test-set-state-status dbstruct run-id test-id newstate newstatus newcomment) (db:with-db dbstruct - run-id #f + run-id #t (lambda (dbdat db) (db:test-set-state-status-db db run-id test-id newstate newstatus newcomment)))) (define (db:test-set-state-status-db db run-id test-id newstate newstatus newcomment) (cond @@ -2629,11 +2559,11 @@ ;; (define (db:test-set-top-process-pid dbstruct run-id test-id pid) (db:with-db dbstruct run-id - #f + #t (lambda (dbdat db) (sqlite3:execute db "UPDATE tests SET attemptnum=? WHERE id=?;" pid test-id)))) (define (db:test-get-top-process-pid dbstruct run-id test-id) @@ -2668,11 +2598,11 @@ (define db:test-record-qry-selector (string-intersperse db:test-record-fields ",")) (define (db:update-tesdata-on-repilcate-db dbstruct old-lt new-lt) (db:with-db - dbstruct #f #f + dbstruct #f #t (lambda (dbdat db) (sqlite3:execute db "UPDATE tests SET rundir= replace(rundir,?,?), shortdir=replace(shortdir,?,?);" old-lt new-lt old-lt new-lt)))) ;; NOTE: Use db:test-get* to access records @@ -2976,11 +2906,11 @@ ;; if one or more are fail (any case) then set test status to PASS, non "pass" or "fail" are ignored (define (db:test-data-rollup dbstruct run-id test-id status) (let* ((fail-count 0) (pass-count 0)) (db:with-db - dbstruct run-id #f + dbstruct run-id #t (lambda (dbdat db) (sqlite3:for-each-row (lambda (fcount pcount) (set! fail-count fcount) (set! pass-count pcount)) @@ -3075,11 +3005,11 @@ ;; EOF (define (db:csv->test-data dbstruct run-id test-id csvdata) (debug:print 4 *default-log-port* "test-id " test-id ", csvdata: " csvdata) (db:with-db - dbstruct #f #f + dbstruct #f #t (lambda (dbdat db) (let* ((csvlist (csv->list (make-csv-reader (open-input-string csvdata) '((strip-leading-whitespace? #t) (strip-trailing-whitespace? #t)))))) ;; (csv->list csvdata))) @@ -3302,11 +3232,11 @@ (new-status-eh #f)) (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) (db:general-call dbstruct run-id 'set-test-start-time (list test-id))) (mutex-lock! *db-transaction-mutex*) (db:with-db - dbstruct run-id #f + dbstruct run-id #t (lambda (dbdat db) (let ((tr-res (sqlite3:with-transaction db (lambda () @@ -3394,11 +3324,11 @@ (list newstate newstatus)))) (define (db:set-state-status-and-roll-up-run dbstruct run-id curr-state curr-status) (mutex-lock! *db-transaction-mutex*) (db:with-db - dbstruct run-id #f + dbstruct run-id #t (lambda (dbdat db) (let ((tr-res (sqlite3:with-transaction db (lambda () @@ -3664,19 +3594,21 @@ (else (hash-table-set! *logged-in-clients* client-signature (current-seconds)) '(#t "successful login")))) +;; NO WAY TO KNOW IF IT MODIFIES THE DB BUT NEARLY ALL ARE UPDATES/INSERTS +;; (define (db:general-call dbstruct run-id stmtname params) ;; Why is db:lookup-query above not used here to get the query? (let ((query (let ((q (alist-ref (if (string? stmtname) (string->symbol stmtname) stmtname) db:queries))) (if q (car q) #f)))) (db:with-db - dbstruct run-id #f + dbstruct run-id #t (lambda (dbdat db) (apply sqlite3:execute db query params) #t)))) ;; get a summary of state and status counts to calculate a rollup @@ -3884,19 +3816,19 @@ testname) res)))) ;; create a new record for a given testname (define (db:testmeta-add-record dbstruct testname) - (db:with-db dbstruct #f #f + (db:with-db dbstruct #f #t (lambda (dbdat db) (sqlite3:execute db "INSERT OR IGNORE INTO test_meta (testname,author,owner,description,reviewed,iterated,avg_runtime,avg_disk,tags) VALUES (?,'','','','','','','','');" testname)))) ;; update one of the testmeta fields (define (db:testmeta-update-field dbstruct testname field value) - (db:with-db dbstruct #f #f + (db:with-db dbstruct #f #t (lambda (dbdat db) (sqlite3:execute db (conc "UPDATE test_meta SET " field "=? WHERE testname=?;") value testname)))) Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -53,11 +53,11 @@ (define keep-age-param (make-parameter 10)) ;; qif file age, if over move to attic (define num-run-dbs (make-parameter 10)) ;; number of db's in .mtdb (define dbfile:sync-method (make-parameter 'attach)) ;; 'attach or 'original (define dbfile:cache-method (make-parameter 'inmem)) ;; 'direct -(define dbcache-mode (make-parameter 'inmem)) ;; 'inmem, 'tmp (changes what open inmem routine does) +(define dbcache-mode (make-parameter 'tmp)) ;; 'inmem, 'tmp (changes what open inmem routine does) ;; 'original - use old condition code ;; 'suicide-mode - create mtrah/stop-the-train with info on what went wrong ;; else use no condition code (should be production mode) Index: dbmod.scm ================================================================== --- dbmod.scm +++ dbmod.scm @@ -83,12 +83,15 @@ ;;====================================================================== ;; The inmem one-db file per server method goes in here ;;====================================================================== -(define (dbmod:with-db dbstruct run-id r/w proc params) - (let* ((use-mutex (> *api-process-request-count* 50)) +;; NOTE: the r/w is now w/r, #t=db modified by query, #f=db NOT modified by query +(define (dbmod:with-db dbstruct run-id w/r proc params) + (let* ((use-mutex (or (and w/r ;; use the mutex on queries that modify the db and for sync to disk + (> *api-process-request-count* 5)) ;; when writes are happening throttle more + (> *api-process-request-count* 50))) (dbdat (dbmod:open-db dbstruct run-id (dbfile:db-init-proc))) (dbh (dbr:dbdat-dbh dbdat)) ;; this will be the inmem handle (dbfile (dbr:dbdat-dbfile dbdat))) ;; if nfs mode do a sync if delta > 2 (let* ((last-update (dbr:dbstruct-last-update dbstruct)) @@ -105,12 +108,12 @@ (if use-mutex (mutex-lock! *db-with-db-mutex*)) (let* ((res (apply proc dbdat dbh params))) (if use-mutex (mutex-unlock! *db-with-db-mutex*)) res))) -(define (db:with-db dbstruct run-id r/w proc . params) - (dbmod:with-db dbstruct run-id r/w proc params)) +(define (db:with-db dbstruct run-id w/r proc . params) + (dbmod:with-db dbstruct run-id w/r proc params)) (define (dbmod:open-inmem-db init-proc #!optional (dbfullname #f)) (let* ((db (if dbfullname (dbmod:safely-open-db dbfullname init-proc #t) (sqlite3:open-database ":memory:"))) @@ -172,11 +175,11 @@ (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))) (dbpath (dbmod:get-dbdir dbstruct)) ;; directory where all the .db files are kept (dbfullname (conc dbpath"/"dbfname)) ;; (dbmod:run-id->full-dbfname dbstruct run-id)) (dbexists (file-exists? dbfullname)) (tmpdir (conc "/tmp/"(current-user-name))) - (tmpdb (let* ((fname (conc tmpdir"/"(current-process-id)"-"dbfname))) + (tmpdb (let* ((fname (conc tmpdir"/" (string-translate areapath "/" ".")"-"(current-process-id)"-"dbfname))) (if (not (file-exists? tmpdir))(create-directory tmpdir)) ;; check if tmpdb already exists, either delete it or ;; add something to the name fname)) (inmem (dbmod:open-inmem-db init-proc @@ -199,15 +202,17 @@ (dbr:dbstruct-dbfile-set! dbstruct dbfullname) (dbr:dbstruct-dbfname-set! dbstruct dbfname) (dbr:dbstruct-sync-proc-set! dbstruct (lambda (last-update) (if *sync-in-progress* - (debug:print 0 *default-log-port* "WARNING: overlapping calls to sync to disk") + (debug:print 3 *default-log-port* "WARNING: overlapping calls to sync to disk") (begin + (mutex-lock! *db-with-db-mutex*) ;; this mutex is used when overloaded or during a query that modifies the db (set! *sync-in-progress* #t) (dbmod:sync-gasket tables last-update inmem db dbfullname syncdir) + (mutex-unlock! *db-with-db-mutex*) (thread-sleep! 0.5) ;; ensure at least 1/2 second down time between sync calls (set! *sync-in-progress* #f))))) ;; (dbmod:sync-tables tables #f db inmem) ;; (if db (dbmod:sync-gasket tables #f inmem db dbfullname 'fromdest) ;; ) ;; load into inmem @@ -669,8 +674,81 @@ (define (db:del-var dbstruct var) (db:with-db dbstruct #f #t (lambda (dbdat db) (sqlite3:execute (db:get-cache-stmth dbdat db "DELETE FROM metadat WHERE var=?;") var)))) - +(define (db:get-toplevels-and-incompletes dbstruct run-id running-deadtime remotehoststart-deadtime) + (let* ((toplevels '()) + (oldlaunched '()) + (incompleted '())) + (db:with-db + dbstruct run-id #f + (lambda (dbdat db) + (let* ((stmth1 (db:get-cache-stmth + dbdat db + "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests + WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) + AND state IN ('RUNNING');")) + (stmth2 (db:get-cache-stmth + dbdat db + "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests + WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) + AND state IN ('REMOTEHOSTSTART');")) + (stmth3 (db:get-cache-stmth + dbdat db + "SELECT id,rundir,uname,testname,item_path FROM tests + WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 + AND state IN ('LAUNCHED');"))) + ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes + ;; + ;; HOWEVER: this code in run:test seems to work fine + ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) + ;; (db:test-get-run_duration testdat))) + ;; 600) + (sqlite3:for-each-row + (lambda (test-id run-dir uname testname item-path event-time run-duration) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (begin + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) + (begin + (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) + (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" + test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds) + " event-time="event-time" run-duration="run-duration)))) + stmth1 + run-id running-deadtime) ;; default time 720 seconds + + (sqlite3:for-each-row + (lambda (test-id run-dir uname testname item-path event-time run-duration) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (begin + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) + (begin + (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id + " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time + " run-duration="run-duration) + (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))) + stmth2 + run-id remotehoststart-deadtime) ;; default time 230 seconds + + ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config + (sqlite3:for-each-row + (lambda (test-id run-dir uname testname item-path) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (begin + (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id + " 1 day since event_time marked") + (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))) + stmth3 + run-id)))) + (list incompleted oldlaunched toplevels))) )