Index: NOTES ================================================================== --- NOTES +++ NOTES @@ -158,5 +158,9 @@ INFO: (0) Server shutdown complete. Exiting Start: 0 at Sun Apr 28 22:18:25 MST 2013 Max: 52 at Sun Apr 28 23:06:59 MST 2013 End: 6 at Sun Apr 28 23:47:51 MST 2013 + +======================================================================== + + Index: archive.scm ================================================================== --- archive.scm +++ archive.scm @@ -185,13 +185,11 @@ partial-path-index) #f)) ;; we need our archive dir checked for every test to enable folks who want to store other ways. (archive-info (archive:allocate-new-archive-block blockid-cache *toppath* tsname min-space target run-name test-name)) (archive-dir (if archive-info (cdr archive-info) #f)) - (archive-id (if archive-info (car archive-info) -1)) - - ) + (archive-id (if archive-info (car archive-info) -1))) (if (not archive-dir) ;; no archive disk found, this is fatal (begin (debug:print 0 *default-log-port* "FATAL: No archive disks found. Please add disks with at least " min-space " MB space to the [archive-disks] section of megatest.config") @@ -312,12 +310,14 @@ (for-each (lambda (test-dat) (let ((test-id (db:test-get-id test-dat)) (run-id (db:test-get-run_id test-dat))) (rmt:test-set-archive-block-id run-id test-id archive-id) - (if (member archive-command '("save-remove")) - (runs:remove-test-directory test-dat 'archive-remove)))) + (if (member (symbol->string archive-command) '("save-remove")) + (begin + (debug:print-info 0 *default-log-port* "remove testdat") + (runs:remove-test-directory test-dat 'archive-remove))))) (hash-table-ref test-groups test-base))))) (hash-table-keys disk-groups)) #t)) (define (archive:bup-restore archive-command run-id run-name tests rp-mutex bup-mutex) ;; move the getting of archive space down into the below block so that a single run can Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -716,14 +716,18 @@ ;; dot-locking egg seems not to work, using this for now ;; if lock is older than expire-time then remove it and try again ;; to get the lock ;; (define (common:simple-file-lock fname #!key (expire-time 300)) + (let ((fmod-time (handle-exceptions + ext + (current-seconds) + (file-modification-time fname)))) (if (common:file-exists? fname) - (if (> (- (current-seconds)(file-modification-time fname)) expire-time) + (if (> (- (current-seconds) fmod-time) expire-time) (begin - (handle-exceptions exn #f (delete-file* fname)) + (handle-exceptions exn #f (delete-file* fname)) (common:simple-file-lock fname expire-time: expire-time)) #f) (let ((key-string (conc (get-host-name) "-" (current-process-id)))) (with-output-to-file fname (lambda () @@ -733,11 +737,11 @@ (handle-exceptions exn #f (with-input-from-file fname (lambda () (equal? key-string (read-line))))) - #f)))) + #f))))) (define (common:simple-file-lock-and-wait fname #!key (expire-time 300)) (let ((end-time (+ expire-time (current-seconds)))) (let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time))) (if got-lock @@ -1733,11 +1737,17 @@ exn #f (debug:print 2 *default-log-port* "reading file " fullpath) (let ((real-age (- (current-seconds)(file-change-time fullpath)))) (if (< real-age age) - (with-input-from-file fullpath read) + (handle-exceptions + exn + (begin + (debug:print-info 1 *default-log-port* " removing bad file " fullpath) + (delete-file* fullpath) + #f) + (with-input-from-file fullpath read)) (begin (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it") #f)))) (begin (debug:print 2 *default-log-port* "not reading file " fullpath) @@ -2044,11 +2054,13 @@ (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again (common:get-num-cpus remote-host) numcpus-in)) (maxload (if force-maxload maxload-in - (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? + (if (number? maxload-in) + (max maxload-in 0.5) + 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? (first (car loadavg)) (next (cadr loadavg)) (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where ;; numcpus (or could be ;; maxload) is zero, Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1698,11 +1698,11 @@ (begin (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration) (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))) db "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');" - run-id remotehoststart-deadtime) ;; default time 230 seconds. + run-id remotehoststart-deadtime) ;; default time 230 seconds ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config ;; ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row @@ -1732,36 +1732,37 @@ ;; incompleted)) (min-incompleted-ids (map car incompleted)) ;; do 'em all (all-ids (append min-incompleted-ids (map car oldlaunched)))) (if (> (length all-ids) 0) (begin + ;; (launch:is-test-alive "localhost" 435) (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD") (for-each (lambda (test-id) - (let* ( - (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id)) - (result (db:get-status-from-final-status-file run-dir))) - (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) - (begin - (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD") - (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "PASS" "Test stopped responding but it has PASSED; marking it PASS in the DB.") - ) - (begin - (debug:print 0 *default-log-port* "INFO: test " test-id " final state/status is not COMPLETED/PASS. It is " result) - (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.") - ) - ) - ) - ) - all-ids) - ) - ) - ) - ) - ) - ) -) + (let* (;; (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id)) + (tinfo (db:get-test-info-by-id dbstruct run-id test-id)) + (run-dir (db:test-get-rundir tinfo)) + (host (db:test-get-host tinfo)) + (pid (db:test-get-process_id tinfo)) + (result (db:get-status-from-final-status-file run-dir))) + (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) + (begin + (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD") + (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "PASS" + "Test stopped responding but it has PASSED; marking it PASS in the DB.")) + (let ((is-alive (launch:is-test-alive host pid))) + (if is-alive + (debug:print 0 *default-log-port* "INFO: test " test-id " on host " host " has a process on pid " pid ", NOT setting to DEAD.") + (begin + (debug:print 0 *default-log-port* "INFO: test " test-id " final state/status is not COMPLETED/PASS. It is " result) + (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" + "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))))))) + all-ids) + ;;call end of eud of run detection for posthook + (launch:end-of-run-check run-id) + ))))))) + ;; ALL REPLACED BY THE BLOCK ABOVE ;; ;; (sqlite3:execute ;; db Index: docs/manual/megatest_manual.html ================================================================== --- docs/manual/megatest_manual.html +++ docs/manual/megatest_manual.html @@ -769,13 +769,13 @@
[itemopts] +slash path/to/file/with/items +# or +space path/to/file/with/items+
key1/key2/key3 +val1/val2/val2 +...+
key1 key2 key3 +val1 val2 val2 +...+