Overview
Comment: | Speculative fix for failure to stop RUNNING |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.55 |
Files: | files | file ages | folders |
SHA1: | 6563497956b4e501fd15a62b86f7abda |
User & Date: | matt on 2013-08-08 23:03:58 |
Other Links: | branch diff | manifest | tags |
Context
2013-08-11
| ||
19:38 | Added howto section to the manual check-in: a10660b42e user: mrwellan tags: v1.55 | |
2013-08-08
| ||
23:03 | Speculative fix for failure to stop RUNNING check-in: 6563497956 user: matt tags: v1.55 | |
2013-08-05
| ||
20:10 | Typo in version file. Fixed check-in: 8e90258572 user: mrwellan tags: v1.55, v1.5511ww32 | |
Changes
Modified launch.scm from [7692281ad3] to [093abc1258].
︙ | ︙ | |||
148 149 150 151 152 153 154 155 156 157 158 159 160 161 | ;; any previous runs ;; (db:test-remove-steps db run-id testname itemdat) (let* ((m (make-mutex)) (kill-job? #f) (exit-info (vector #t #t #t)) (job-thread #f) (runit (lambda () ;; (let-values ;; (((pid exit-status exit-code) ;; (run-n-wait fullrunscript))) (tests:test-set-status! test-id "RUNNING" "n/a" #f #f) ;; if there is a runscript do it first (if fullrunscript | > | 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 | ;; any previous runs ;; (db:test-remove-steps db run-id testname itemdat) (let* ((m (make-mutex)) (kill-job? #f) (exit-info (vector #t #t #t)) (job-thread #f) (keep-going #t) (runit (lambda () ;; (let-values ;; (((pid exit-status exit-code) ;; (run-n-wait fullrunscript))) (tests:test-set-status! test-id "RUNNING" "n/a" #f #f) ;; if there is a runscript do it first (if fullrunscript |
︙ | ︙ | |||
287 288 289 290 291 292 293 294 295 296 297 298 299 300 | #t) #f))))) ;; open-run-close not needed for test-set-meta-info (tests:set-meta-info #f test-id run-id test-name itemdat minutes work-area) (if kill-job? (begin (mutex-lock! m) (let* ((pid (vector-ref exit-info 0))) (if (number? pid) (process-signal pid signal/kill) ;; (begin ;; (debug:print 0 "WARNING: Request received to kill job (attempt # " kill-tries ")") ;; (let ((processes (cmd-run->list (conc "pgrep -l -P " pid)))) ;; (for-each | > > > | 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 | #t) #f))))) ;; open-run-close not needed for test-set-meta-info (tests:set-meta-info #f test-id run-id test-name itemdat minutes work-area) (if kill-job? (begin (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries? (let* ((pid (vector-ref exit-info 0))) (if (number? pid) (process-signal pid signal/kill) ;; (begin ;; (debug:print 0 "WARNING: Request received to kill job (attempt # " kill-tries ")") ;; (let ((processes (cmd-run->list (conc "pgrep -l -P " pid)))) ;; (for-each |
︙ | ︙ | |||
314 315 316 317 318 319 320 | (tests:test-set-status! test-id "KILLED" "FAIL" (args:get-arg "-m") #f) (sqlite3:finalize! tdb) (exit 1)))) (set! kill-tries (+ 1 kill-tries)) (mutex-unlock! m))) ;; (sqlite3:finalize! db) | > > | > | | | > > > | | | | 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 | (tests:test-set-status! test-id "KILLED" "FAIL" (args:get-arg "-m") #f) (sqlite3:finalize! tdb) (exit 1)))) (set! kill-tries (+ 1 kill-tries)) (mutex-unlock! m))) ;; (sqlite3:finalize! db) (if keep-going (begin (thread-sleep! (+ 10 (random 10))) ;; add some jitter to the call home time to spread out the db accesses (if keep-going (loop (calc-minutes)))))))))) ;; NOTE: Checking twice for keep-going is intentional (th1 (make-thread monitorjob "monitor job")) (th2 (make-thread runit "run job"))) (set! job-thread th2) (thread-start! th1) (thread-start! th2) (thread-join! th2) (set! keep-going #f) (thread-sleep! 1) (thread-terminate! th1) ;; Not sure if this is a good idea (thread-sleep! 0.1) ;; give thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec. (mutex-lock! m) (let* ((item-path (item-list->path itemdat)) (testinfo (cdb:get-test-info-by-id *runremote* test-id))) ;; )) ;; run-id test-name item-path))) ;; Am I completed? (if (equal? (db:test-get-state testinfo) "RUNNING") ;; (not (equal? (db:test-get-state testinfo) "COMPLETED")) (let ((new-state (if kill-job? "KILLED" "COMPLETED") ;; (if (eq? (vector-ref exit-info 2) 0) ;; exited with "good" status ;; "COMPLETED" ;; (db:test-get-state testinfo))) ;; else preseve the state as set within the test ) (new-status (cond ((not (vector-ref exit-info 1)) "FAIL") ;; job failed to run ((eq? rollup-status 0) ;; if the current status is AUTO then defer to the calculated value (i.e. leave this AUTO) (if (equal? 
(db:test-get-status testinfo) "AUTO") "AUTO" "PASS")) ((eq? rollup-status 1) "FAIL") ((eq? rollup-status 2) ;; if the current status is AUTO the defer to the calculated value but qualify (i.e. make this AUTO-WARN) (if (equal? (db:test-get-status testinfo) "AUTO") "AUTO-WARN" "WARN")) (else "FAIL")))) ;; (db:test-get-status testinfo))) (debug:print-info 1 "Test exited in state=" (db:test-get-state testinfo) ", setting state/status based on exit code of " (vector-ref exit-info 1) " and rollup-status of " rollup-status) (tests:test-set-status! test-id new-state new-status (args:get-arg "-m") #f) ;; need to update the top test record if PASS or FAIL and this is a subtest (if (not (equal? item-path "")) (cdb:roll-up-pass-fail-counts *runremote* run-id test-name item-path new-status)) |
︙ | ︙ |
tests/installall/config/megatest.config.dat became a regular file with contents [736a5da885].
tests/installall/config/runconfigs.config.dat became a regular file with contents [3b8f260acb].