Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -1075,10 +1075,15 @@ (begin (sqlite3:interrupt! db) (sqlite3:finalize! db #t) ;; (vector-set! *task-db* 0 #f) (set! *task-db* #f))))) + (if (and *no-sync-db* + (sqlite3:database? *no-sync-db*)) + (begin + (sqlite3:interrupt! *no-sync-db*) + (sqlite3:finalize! *no-sync-db* #t))) (http-client#close-all-connections!) ;; (if (and *runremote* ;; (remote-conndat *runremote*)) ;; (begin ;; (http-client#close-all-connections!))) ;; for http-client Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -2162,14 +2162,20 @@ db)))) (mutex-unlock! *db-access-mutex*) res)) (define (db:no-sync-set db var val) - (sqlite3:execute (db:no-sync-db db) "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES (?,?);" var val)) + ;; (mutex-lock! *db-access-mutex*) + (let ((res (sqlite3:execute (db:no-sync-db db) "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES (?,?);" var val))) + ;; (mutex-unlock! *db-access-mutex*) + res)) (define (db:no-sync-del! db var) - (sqlite3:execute (db:no-sync-db db) "DELETE FROM no_sync_metadat WHERE var=?;" var)) + ;; (mutex-lock! *db-access-mutex*) + (sqlite3:execute (db:no-sync-db db) "DELETE FROM no_sync_metadat WHERE var=?;" var) + ;; (mutex-unlock! *db-access-mutex*) + ) (define (db:no-sync-get/default db var default) (let ((res default)) (sqlite3:for-each-row (lambda (val) @@ -3215,14 +3221,14 @@ test-id)))))) (mt:process-triggers dbstruct run-id test-id newstate newstatus)) ;; NEW BEHAVIOR: Count tests running in all runs! ;; -(define (db:get-count-tests-running dbstruct run-id fastmode) - (let* ((qry (if fastmode - "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '') LIMIT 1;" - "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '');"))) +(define (db:get-count-tests-running dbstruct run-id) ;; fastmode) + (let* ((qry ;; (if fastmode + ;; "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '') LIMIT 1;" + "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '');")) ;; ) (db:with-db dbstruct run-id #f (lambda (db) @@ -3245,14 +3251,14 @@ run-id)))) ;; NOT IN (SELECT id FROM runs WHERE state='deleted');") ;; NEW BEHAVIOR: Look only at single run with run-id ;; ;; (define (db:get-running-stats dbstruct run-id) -(define (db:get-count-tests-running-for-run-id dbstruct run-id fastmode) - (let* ((qry (if fastmode - "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? LIMIT 1;" - "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;"))) +(define (db:get-count-tests-running-for-run-id dbstruct run-id) ;; fastmode) + (let* ((qry ;; (if fastmode + ;; "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? LIMIT 1;" + "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;")) ;; ) (db:with-db dbstruct run-id #f (lambda (db) @@ -3968,17 +3974,135 @@ ;; (db:set-state-status-and-roll-up-items dbstruct run-id test-id #f state status msg) ;; ;; process the test_data table ;; (if (and test-id state status (equal? status "AUTO")) ;; (db:test-data-rollup dbstruct run-id test-id status)) ;; (mt:process-triggers dbstruct run-id test-id state status))) + +;; NOT FINISHED +(define (db:calc-state-status-toplevel state status tl-state tl-status) + `(,state ,status)) + +;; (match state +;; (("COMPLETED") +;; (match `(,tl-state ,tl-status) +;; (("COMPLETED" "PASS") `(,state ,status)) +;; (("COMPLETED" thestatus) +;; (case (string->symbol thestatus) +;; ((ABORT CHECK DEAD) +;; (if `("COMPLETED" ,thestatus)) +;; (match `(,thestatus ,status) +;; (("FAIL" "ABORT") '("COMPLETED" "ABORT")) +;; (("FAIL" "CHECK") '("COMPLETED" "CHECK")) +;; (("FAIL" "DEAD") '("COMPLETED" "DEAD")) +;; (("WARN" "FAIL") '("COMPLETED" "FAIL")) +;; (("WARN" "CHECK") '("COMPLETED" "CHECK")) +;; (("WARN" "DEAD") + +(define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment) + ;; establish info on incoming test followed by info on top level test + ;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met + ;; (mutex-lock! *db-transaction-mutex*) ;; why do we need a mutex? + (let* ((testdat (if (number? test-name) + (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id + (db:get-test-info dbstruct run-id test-name item-path))) + (test-id (db:test-get-id testdat)) + (test-name (if (number? test-name) + (db:test-get-testname testdat) + test-name)) + (no-sync-db (db:no-sync-db #f)) + (rollup-flag #f) + (wait-flag #f) + (rollup-lock-key (conc run-id "-rollup-" test-name)) + (waiting-lock-key (conc run-id "-waiting-" test-name))) + (db:test-set-state-status dbstruct run-id test-id state status #f) + (if (and test-id state status (equal? status "AUTO")) + (db:test-data-rollup dbstruct run-id test-id status)) + (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) + (db:general-call dbstruct 'set-test-start-time (list test-id))) + (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item + (begin + ;; is there a rollup lock? If not, take it + (sqlite3:with-transaction + no-sync-db + (lambda () + ;; (debug:print 0 *default-log-port* "EXCEPTION: exn="exn) + (let* ((rollup-lock-time (db:no-sync-get/default no-sync-db rollup-lock-key #f)) + (waiting-lock-time (db:no-sync-get/default no-sync-db waiting-lock-key #f))) + (if rollup-lock-time ;; someone is doing a rollup + (if (not waiting-lock-time) ;; no one is waiting + (begin + (set! wait-flag #t) + (set! rollup-flag #t) + (db:no-sync-set no-sync-db waiting-lock-key (current-seconds)))) ;; we are going to wait + (begin + (set! rollup-flag #t) + (db:no-sync-set no-sync-db rollup-lock-key (current-seconds))))))) + (if wait-flag + (let loop ((count 10)) ;; about 20 seconds + (thread-sleep! 2) + (if (and (not (db:no-sync-get/default no-sync-db waiting-lock-key #f)) + (> count 0)) + (loop (+ count 1)) + (sqlite3:with-transaction + no-sync-db + (lambda () + (db:no-sync-set no-sync-db rollup-lock-key (current-seconds)) + (db:no-sync-del! no-sync-db waiting-lock-key)))))) + ;; now the rollup + ;; (mutex-unlock! *db-transaction-mutex*) ;; why do we need a mutex? + (if rollup-flag ;; put this into a thread + (begin + ;; (thread-start! (make-thread + ;; (lambda () + (db:roll-up-test-state-status dbstruct run-id test-name state status) + (db:no-sync-del! no-sync-db rollup-lock-key)) + ;; (conc "thread for run-id: " run-id " test-name: " test-name)))))))) + )) + ;; (mutex-unlock! *db-transaction-mutex*) ;; why do we need a mutex? + ))) + +;; I'd like to remove the need for item-path - it is logically not needed here +;; for now we pass in state and status - NOTE: There is a possible race if a test +;; is rapidly re-run while an earlier run is waiting to rollup. +;; +(define (db:roll-up-test-state-status dbstruct run-id test-name state status) + (let* ((testdat (if (number? test-name) + (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id + (db:get-test-info dbstruct run-id test-name ""))) + (test-id (db:test-get-id testdat)) + (test-name (if (number? test-name) + (db:test-get-testname testdat) + test-name)) + (tl-testdat (db:get-test-info dbstruct run-id test-name "")) + (tl-test-id (db:test-get-id tl-testdat))) + (db:with-db + dbstruct #f #f + (lambda (db) + ;; NB// Pass the db so it is part fo the transaction + ;; item-path is used in get-all-state-status counts to exclude current state/status of THIS test + ;; but with the state/status being set earlier this is not needed any longer + (let* ((state-status-counts (db:get-all-state-status-counts-for-testname dbstruct run-id test-name)) + (state-statuses (if (null? state-status-counts) + '() + (db:roll-up-rules state-status-counts state status))) + (newstate (if (null? state-statuses) + state + (car state-statuses))) + (newstatus (if (null? state-statuses) + status + (cadr state-statuses)))) + (if tl-test-id + (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) + ))) + #t)) ;; state is the priority rollup of all states ;; status is the priority rollup of all completed statesfu ;; ;; if test-name is an integer work off that instead of test-name test-path ;; -(define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment) +(define (db:set-state-status-and-roll-up-items-orig dbstruct run-id test-name item-path state status comment) ;; establish info on incoming test followed by info on top level test ;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met (let* ((testdat (if (number? test-name) (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id (db:get-test-info dbstruct run-id test-name item-path))) @@ -3999,17 +4123,17 @@ (lambda (db) (let ((tr-res (sqlite3:with-transaction db (lambda () - ;; NB// Pass the db so it is part fo the transaction + ;; NB// Pass the db so it is part of the transaction (db:test-set-state-status db run-id test-id state status comment) ;; this call sets the item state/status (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item (let* ((state-status-counts (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test - (state-stauses (db:roll-up-rules state-status-counts state status)) - (newstate (car state-stauses)) - (newstatus (cadr state-stauses))) + (state-statuses (db:roll-up-rules state-status-counts state status)) + (newstate (car state-statuses)) + (newstatus (cadr state-statuses))) (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts) " state-status-counts: " (apply conc (map (lambda (x) (conc (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | ")) @@ -4022,81 +4146,84 @@ (if (and test-id state status (equal? status "AUTO")) (db:test-data-rollup dbstruct run-id test-id status)) tr-res))))) (define (db:roll-up-rules state-status-counts state status) - (let* ((running (length (filter (lambda (x) - (member (dbr:counts-state x) *common:running-states*)) - state-status-counts))) - (bad-not-started (length (filter (lambda (x) - (and (equal? (dbr:counts-state x) "NOT_STARTED") - (not (member (dbr:counts-status x) *common:not-started-ok-statuses*)))) - state-status-counts))) - (all-curr-states (common:special-sort ;; worst -> best (sort of) - (delete-duplicates - (if (and state (not (member state *common:dont-roll-up-states*))) - (cons state (map dbr:counts-state state-status-counts)) - (map dbr:counts-state state-status-counts))) - *common:std-states* >)) - (all-curr-statuses (common:special-sort ;; worst -> best - (delete-duplicates - (if (and state status (not (member state *common:dont-roll-up-states*))) - (cons status (map dbr:counts-status state-status-counts)) - (map dbr:counts-status state-status-counts))) - *common:std-statuses* >)) - (non-completes (filter (lambda (x) - (not (member x (cons "COMPLETED" *common:dont-roll-up-states*)))) - all-curr-states)) - (preq-fails (filter (lambda (x) - (equal? x "PREQ_FAIL")) - all-curr-statuses)) - (num-non-completes (length non-completes)) - (newstate (cond - ((> running 0) "RUNNING") ;; anything running, call the situation running - ((> (length preq-fails) 0) "NOT_STARTED") - ((> bad-not-started 0) "COMPLETED") ;; we have an ugly situation, it is completed in the sense we cannot do more. - ((> num-non-completes 0) (car non-completes)) ;; (remove (lambda (x)(equal? "COMPLETED" x)) all-curr-states))) ;; only rollup DELETED if all DELETED - (else (car all-curr-states)))) - (newstatus (cond - ((> (length preq-fails) 0) "PREQ_FAIL") - ((or (> bad-not-started 0) - (and (equal? newstate "NOT_STARTED") - (> num-non-completes 0))) - "STARTED") - (else (car all-curr-statuses))))) - (debug:print-info 2 *default-log-port* - "\n--> probe db:set-state-status-and-roll-up-items: " - "\n--> state-status-counts: "(map dbr:counts->alist state-status-counts) - "\n--> running: "running - "\n--> bad-not-started: "bad-not-started - "\n--> non-non-completes: "num-non-completes - "\n--> non-completes: "non-completes - "\n--> all-curr-states: "all-curr-states + (let* ((running (length (filter (lambda (x) + (member (dbr:counts-state x) *common:running-states*)) + state-status-counts))) + (bad-not-started (length (filter (lambda (x) + (and (equal? (dbr:counts-state x) "NOT_STARTED") + (not (member (dbr:counts-status x) *common:not-started-ok-statuses*)))) + state-status-counts))) + (all-curr-states (common:special-sort ;; worst -> best (sort of) + (delete-duplicates + (if (and state (not (member state *common:dont-roll-up-states*))) + (cons state (map dbr:counts-state state-status-counts)) + (map dbr:counts-state state-status-counts))) + *common:std-states* >)) + (all-curr-statuses (common:special-sort ;; worst -> best + (delete-duplicates + (if (and state status (not (member state *common:dont-roll-up-states*))) + (cons status (map dbr:counts-status state-status-counts)) + (map dbr:counts-status state-status-counts))) + *common:std-statuses* >)) + (non-completes (filter (lambda (x) + (not (member x (cons "COMPLETED" *common:dont-roll-up-states*)))) + all-curr-states)) + (preq-fails (filter (lambda (x) + (equal? x "PREQ_FAIL")) + all-curr-statuses)) + (num-non-completes (length non-completes)) + (newstate (cond + ((> running 0) "RUNNING") ;; anything running, call the situation running + ((> (length preq-fails) 0) "NOT_STARTED") + ((> bad-not-started 0) "COMPLETED") ;; we have an ugly situation, it is completed in the sense we cannot do more. + ((> num-non-completes 0) (car non-completes)) ;; (remove (lambda (x)(equal? "COMPLETED" x)) all-curr-states))) ;; only rollup DELETED if all DELETED + (else (car all-curr-states)))) + (newstatus (cond + ((> (length preq-fails) 0) "PREQ_FAIL") + ((or (> bad-not-started 0) + (and (equal? newstate "NOT_STARTED") + (> num-non-completes 0))) + "STARTED") + (else (car all-curr-statuses))))) + (debug:print-info 2 *default-log-port* + "\n--> probe db:set-state-status-and-roll-up-items: " + "\n--> state-status-counts: "(map dbr:counts->alist state-status-counts) + "\n--> running: "running + "\n--> bad-not-started: "bad-not-started + "\n--> non-non-completes: "num-non-completes + "\n--> non-completes: "non-completes + "\n--> all-curr-states: "all-curr-states "\n--> all-curr-statuses: "all-curr-statuses "\n--> newstate "newstate "\n--> newstatus "newstatus "\n\n") - - ;; NB// Pass the db so it is part of the transaction - (list newstate newstatus))) + ;; NB// Pass the db so it is part of the transaction + (list newstate newstatus))) (define (db:set-state-status-and-roll-up-run dbstruct run-id curr-state curr-status) - (mutex-lock! *db-transaction-mutex*) + ;; (mutex-lock! *db-transaction-mutex*) (db:with-db dbstruct #f #f (lambda (db) (let ((tr-res (sqlite3:with-transaction db (lambda () (let* ((state-status-counts (db:get-all-state-status-counts-for-run dbstruct run-id)) - (state-stauses (db:roll-up-rules state-status-counts #f #f )) - (newstate (car state-stauses)) - (newstatus (cadr state-stauses))) + (state-statuses (db:roll-up-rules state-status-counts #f #f )) + (newstate (if (null? state-statuses) + curr-state + (car state-statuses))) + (newstatus (if (null? state-statuses) + curr-status + (cadr state-statuses)))) (if (or (not (eq? newstate curr-state)) (not (eq? newstatus curr-status))) (db:set-run-state-status dbstruct run-id newstate newstatus ))))))) - (mutex-unlock! *db-transaction-mutex*) + ;; (mutex-unlock! *db-transaction-mutex*) tr-res)))) (define (db:get-all-state-status-counts-for-run dbstruct run-id) (let* ((test-count-recs (db:with-db @@ -4148,10 +4275,25 @@ (unrelated-rec-list (filter nonmatch-countrec-lambda other-items-count-recs))) (cons updated-count-rec unrelated-rec-list))) + +;; full count not including toplevel +;; +(define (db:get-all-state-status-counts-for-testname dbstruct run-id test-name) + (let* ((test-count-recs (db:with-db + dbstruct #f #f + (lambda (db) + (sqlite3:map-row + (lambda (state status count) + (make-dbr:counts state: state status: status count: count)) + db + "SELECT state,status,count(id) FROM tests WHERE run_id=? AND testname=? AND item_path != '' GROUP BY state,status;" + run-id test-name))))) + test-count-recs)) + ;; (define (db:get-all-item-states db run-id test-name) ;; (sqlite3:map-row ;; (lambda (a) a) ;; db Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -661,21 +661,21 @@ run-ids)))) (define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path #!key (mode '(normal))(itemmaps #f)) (rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps))) -(define (rmt:get-count-tests-running-for-run-id run-id fastmode) - (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id fastmode))) +(define (rmt:get-count-tests-running-for-run-id run-id) + (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id))) (define (rmt:get-not-completed-cnt run-id) (rmt:send-receive 'get-not-completed-cnt run-id (list run-id))) ;; Statistical queries -(define (rmt:get-count-tests-running run-id fastmode) - (rmt:send-receive 'get-count-tests-running run-id (list run-id fastmode))) +(define (rmt:get-count-tests-running run-id) + (rmt:send-receive 'get-count-tests-running run-id (list run-id))) (define (rmt:get-count-tests-running-for-testname run-id testname) (rmt:send-receive 'get-count-tests-running-for-testname run-id (list run-id testname))) (define (rmt:get-count-tests-running-in-jobgroup run-id jobgroup) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -321,11 +321,11 @@ (args:get-arg "-one-pass")) (exit 0)) (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) - (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no + (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) @@ -1565,11 +1565,11 @@ extras) '()))) (waitons (delete-duplicates (append (tests:testqueue-get-waitons test-record) extra-waits) equal?)) (newtal (append tal (list hed))) (regfull (>= (length reg) reglen)) - (num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes + (num-running (rmt:get-count-tests-running-for-run-id run-id)) (testdat (make-runs:testdat hed: hed tal: tal reg: reg reruns: reruns @@ -1831,31 +1831,31 @@ ;; now *if* -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle - (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes + (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id)) (prev-num-running 0)) ;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup *configdat* "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) - (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no + (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id))) (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) - (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes + (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. ;; (debug:print-info 0 *default-log-port* "Calling Post Hook")