Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -189,14 +189,17 @@ (readonly-mode (dbr:dbstruct-read-only dbstruct)) (readonly-command (member cmd api:read-only-queries)) (writecmd-in-readonly-mode (and readonly-mode (not readonly-command)))) (if (not readonly-command) (mutex-lock! write-mutex)) - (let* ((res + (let* ((tmppath (dbr:dbstruct-tmppath dbstruct)) + (crumbfile (dbfile:wait-for-qif tmppath run-id (cons cmd params))) + (res (if writecmd-in-readonly-mode (conc "attempt to run write command "cmd" on a read-only database") (api:dispatch-request dbstruct cmd run-id params)))) + (delete-file* crumbfile) (if (not readonly-command) (mutex-unlock! write-mutex)) ;; save all stats (let ((delta-t (- (current-milliseconds) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1008,64 +1008,64 @@ ;;====================================================================== ;; M A I N T E N A N C E ;;====================================================================== -(define (db:have-incompletes? dbstruct run-id ovr-deadtime) - (let* ((incompleted '()) - (oldlaunched '()) - (toplevels '()) - (deadtime-str (configf:lookup *configdat* "setup" "deadtime")) - (deadtime (if (and deadtime-str - (string->number deadtime-str)) - (string->number deadtime-str) - 72000))) ;; twenty hours - (db:with-db - dbstruct run-id #f - (lambda (dbdat db) - (if (number? ovr-deadtime)(set! deadtime ovr-deadtime)) - - ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes - ;; - ;; HOWEVER: this code in run:test seems to work fine - ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) - ;; (db:test-get-run_duration testdat))) - ;; 600) - ;; (db:delay-if-busy dbdat) - (sqlite3:for-each-row - (lambda (test-id run-dir uname testname item-path) - (if (and (equal? uname "n/a") - (equal? item-path "")) ;; this is a toplevel test - ;; what to do with toplevel? call rollup? - (begin - (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) - (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) - (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))) - (db:get-cache-stmth dbdat db - "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');") - run-id deadtime) - - ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config - ;; - ;; (db:delay-if-busy dbdat) - (sqlite3:for-each-row - (lambda (test-id run-dir uname testname item-path) - (if (and (equal? uname "n/a") - (equal? item-path "")) ;; this is a toplevel test - ;; what to do with toplevel? call rollup? - (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) - (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))) - db - "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');" - run-id) - - (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.") - (if (and (null? incompleted) - (null? oldlaunched) - (null? toplevels)) - #f - #t))))) +;; (define (db:have-incompletes? dbstruct run-id ovr-deadtime) +;; (let* ((incompleted '()) +;; (oldlaunched '()) +;; (toplevels '()) +;; (deadtime-str (configf:lookup *configdat* "setup" "deadtime")) +;; (deadtime (if (and deadtime-str +;; (string->number deadtime-str)) +;; (string->number deadtime-str) +;; 72000))) ;; twenty hours +;; (db:with-db +;; dbstruct run-id #f +;; (lambda (dbdat db) +;; (if (number? ovr-deadtime)(set! deadtime ovr-deadtime)) +;; +;; ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes +;; ;; +;; ;; HOWEVER: this code in run:test seems to work fine +;; ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) +;; ;; (db:test-get-run_duration testdat))) +;; ;; 600) +;; ;; (db:delay-if-busy dbdat) +;; (sqlite3:for-each-row +;; (lambda (test-id run-dir uname testname item-path) +;; (if (and (equal? uname "n/a") +;; (equal? item-path "")) ;; this is a toplevel test +;; ;; what to do with toplevel? call rollup? +;; (begin +;; (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) +;; (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) +;; (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))) +;; (db:get-cache-stmth dbdat db +;; "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');") +;; run-id deadtime) +;; +;; ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config +;; ;; +;; ;; (db:delay-if-busy dbdat) +;; (sqlite3:for-each-row +;; (lambda (test-id run-dir uname testname item-path) +;; (if (and (equal? uname "n/a") +;; (equal? item-path "")) ;; this is a toplevel test +;; ;; what to do with toplevel? call rollup? +;; (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) +;; (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))) +;; (db:get-cache-stmth dbdat db +;; "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');") +;; run-id) +;; +;; (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.") +;; (if (and (null? incompleted) +;; (null? oldlaunched) +;; (null? toplevels)) +;; #f +;; #t))))) (define (db:get-status-from-final-status-file run-dir) (let ((infile (conc run-dir "/.final-status"))) ;; first verify we are able to write the output file (if (not (file-read-access? infile)) @@ -1104,21 +1104,21 @@ (db:with-db dbstruct run-id #f (lambda (dbdat db) (let* ((stmth1 (db:get-cache-stmth - dbdat run-id db + dbdat db "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');")) (stmth2 (db:get-cache-stmth - dbdat run-id db + dbdat db "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');")) (stmth3 (db:get-cache-stmth - dbdat run-id db + dbdat db "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');"))) ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes ;; @@ -1383,21 +1383,21 @@ ;; (set! *last-global-delta-printed* *global-delta*))) (define (db:set-var dbstruct var val) (db:with-db dbstruct #f #t (lambda (dbdat db) - (sqlite3:execute db "INSERT OR REPLACE INTO metadat (var,val) VALUES (?,?);" var val)))) + (sqlite3:execute (db:get-cache-stmth dbdat db "INSERT OR REPLACE INTO metadat (var,val) VALUES (?,?);") var val)))) (define (db:add-var dbstruct var val) (db:with-db dbstruct #f #t (lambda (dbdat db) - (sqlite3:execute db "UPDATE metadat SET val=val+? WHERE var=?;" val var)))) + (sqlite3:execute (db:get-cache-stmth dbdat db "UPDATE metadat SET val=val+? WHERE var=?;") val var)))) (define (db:del-var dbstruct var) (db:with-db dbstruct #f #t (lambda (dbdat db) - (sqlite3:execute db "DELETE FROM metadat WHERE var=?;" var)))) + (sqlite3:execute (db:get-cache-stmth dbdat db "DELETE FROM metadat WHERE var=?;") var)))) ;;====================================================================== ;; no-sync.db - small bits of data to be shared between servers ;;====================================================================== @@ -2451,11 +2451,11 @@ (db:with-db dbstruct run-id #f (lambda (dbdat db) - (let* ((stmth (db:get-cache-stmth dbdat run-id db qry))) + (let* ((stmth (db:get-cache-stmth dbdat db qry))) (sqlite3:first-result stmth)))))) ;; NEW BEHAVIOR: Count tests running in only one run! ;; (define (db:get-count-tests-actually-running dbstruct run-id) @@ -2481,11 +2481,11 @@ (db:with-db dbstruct run-id #f (lambda (dbdat db) - (let* ((stmth (db:get-cache-stmth dbdat run-id db qry))) + (let* ((stmth (db:get-cache-stmth dbdat db qry))) (sqlite3:first-result stmth run-id)))))) ;; For a given testname how many items are running? Used to determine ;; probability for regenerating html ;; @@ -2494,11 +2494,11 @@ dbstruct run-id #f (lambda (dbdat db) (let* ((stmt "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? AND NOT (uname = 'n/a' AND item_path = '') AND testname=?;") - (stmth (db:get-cache-stmth dbdat run-id db stmt))) + (stmth (db:get-cache-stmth dbdat db stmt))) (sqlite3:first-result stmth run-id testname))))) (define (db:get-not-completed-cnt dbstruct run-id) (db:with-db @@ -2868,11 +2868,11 @@ (db:with-db dbstruct run-id #f (lambda (dbdat db) - (let* ((stmth (db:get-cache-stmth dbdat #f db stmt)) + (let* ((stmth (db:get-cache-stmth dbdat db stmt)) (res (sqlite3:fold-row (lambda (res id test-id category variable value expected tol units comment status type last-update) (vector id test-id category variable value expected tol units comment status type last-update)) (vector #f #f #f #f #f #f #f #f #f #f #f #f) stmth Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -1004,10 +1004,48 @@ ;; (mutex-unlock! *db-open-mutex*) dbdat)) (define dbfile:db-init-proc (make-parameter #f)) +;; create a dropping near the db file in a qif dir +;; use count of such files to gate queries (queries in flight) +;; +(define (dbfile:wait-for-qif fname run-id params) + (let* ((thedir (pathname-directory fname)) + (destdir (conc thedir"/qif")) + (uniqn (get-area-path-signature (conc run-id params))) + (crumbn (conc destdir"/"(current-seconds)"-"uniqn"."(current-process-id)))) + (if (not (file-exists? destdir))(create-directory destdir #t)) + (let loop ((count 0)) + (let* ((currlks (glob (conc destdir"/*"))) + (numqrys (length currlks)) + (delayval (cond + ((> numqrys 50) + (if (> numqrys 50) + (for-each + (lambda (f) + (if (> (- (current-seconds) + (file-modification-time f)) + 10) + (begin + (dbfile:print-err "Removing qif file "f" older than 10 seconds") + (delete-file* f)))) + currlks)) + 1) + ((> numqrys 25) 0.25) + ((> numqrys 10) 0.1) + (else #f)))) + (if (and delayval + (< count 5)) + (begin + (thread-sleep! delayval) + (loop (+ count 1)))))) + (with-output-to-file crumbn + (lambda () + (print fname" "run-id" "params))) + crumbn)) + ;; (db:with-db dbstruct run-id sqlite3:exec "select blah fgrom blaz;") ;; r/w is a flag to indicate if the db is modified by this query #t = yes, #f = no ;; (define (db:with-db dbstruct run-id r/w proc . params) (assert dbstruct "FATAL: db:with-db called with dbstruct "#f) @@ -1021,15 +1059,12 @@ (dbr:dbdat-dbh dbdat) dbstruct)) (fname (if dbdat (dbr:dbdat-dbfile dbdat) "nofilenameavailable")) - (jfile (conc fname"-journal")) - #;(subdb (if have-struct - (dbfile:get-subdb dbstruct run-id) - #f)) - ) ;; was 25 + (jfile (conc fname"-journal"))) + ;; (crumbfile (dbfile:wait-for-qif fname run-id params))) (assert (sqlite3:database? db) "FATAL: db:with-db, db is not a database, db="db", fname="fname) (if (file-exists? jfile) (begin (dbfile:print-err "INFO: "jfile" exists, delaying to reduce database load") (thread-sleep! 0.2))) @@ -1043,10 +1078,11 @@ (let ((res (apply proc dbdat db params))) ;; the actual call is here. (if use-mutex (mutex-unlock! *db-with-db-mutex*)) ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w)) (if dbdat (dbfile:add-dbdat dbstruct run-id dbdat)) + ;; (delete-file* crumbfile) res)) (exn (io-error) (db:generic-error-printout exn "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again.")) (exn (corrupt) (db:generic-error-printout exn "ERROR: database " fname " is corrupt. Repair it to proceed.")) @@ -1185,17 +1221,69 @@ (let ((res (proc))) (dbfile:simple-file-release-lock fname) res) (assert #t "FATAL: simple file lock never got a lock.")))) -(define (db:get-cache-stmth dbdat run-id db stmt) +(define (db:get-cache-stmth dbdat db stmt) (let* (;; (dbdat (dbfile:get-dbdat dbstruct run-id)) (stmt-cache (dbr:dbdat-stmt-cache dbdat)) ;; (stmth (db:hoh-get stmt-cache db stmt)) (stmth (hash-table-ref/default stmt-cache stmt #f))) (or stmth (let* ((newstmth (sqlite3:prepare db stmt))) ;; (db:hoh-set! stmt-cache db stmt newstmth) (hash-table-set! stmt-cache stmt newstmth) newstmth)))) + +(define (db:have-incompletes? dbstruct run-id ovr-deadtime) + (let* ((incompleted '()) + (oldlaunched '()) + (toplevels '()) + ;; (deadtime-str (configf:lookup *configdat* "setup" "deadtime")) + (deadtime (or ovr-deadtime 72000))) ;; twenty hours + (db:with-db + dbstruct run-id #f + (lambda (dbdat db) + + ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes + ;; + ;; HOWEVER: this code in run:test seems to work fine + ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) + ;; (db:test-get-run_duration testdat))) + ;; 600) + ;; (db:delay-if-busy dbdat) + (sqlite3:for-each-row + (lambda (test-id run-dir uname testname item-path) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (begin + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))) + ;; (print-info "Found old toplevel test in RUNNING state, test-id=" test-id)) + (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))) + (db:get-cache-stmth dbdat db + "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');") + run-id deadtime) + + ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config + ;; + ;; (db:delay-if-busy dbdat) + (sqlite3:for-each-row + (lambda (test-id run-dir uname testname item-path) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))) + (db:get-cache-stmth dbdat db + "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');") + run-id) + + ;; (print-info "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.") + (if (and (null? incompleted) + (null? oldlaunched) + (null? toplevels)) + #f + #t))))) + )