Overview
Comment: | Fixed issue with get pid. Reworked task killing code (again). |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.60 |
Files: | files | file ages | folders |
SHA1: |
a89fd0fb73fac3721eba49ea718696ad |
User & Date: | matt on 2014-10-21 23:14:05 |
Other Links: | branch diff | manifest | tags |
Context
2014-10-22
| ||
11:32 | Add exception handler to check for journal files - seems to crap out sometimes check-in: 8d711947e0 user: mrwellan tags: v1.60 | |
2014-10-21
| ||
23:14 | Fixed issue with get pid. Reworked task killing code (again). check-in: a89fd0fb73 user: matt tags: v1.60 | |
14:18 | fixed wrong use of double quotes check-in: 17cd0fbe3e user: mrwellan tags: v1.60, v1.6004 | |
Changes
Modified db.scm from [665aabc3b4] to [5aaa9292b2].
︙ | ︙ | |||
1670 1671 1672 1673 1674 1675 1676 | ;; (define (db:test-set-top-process-pid dbstruct run-id test-id pid) (sqlite3:execute (db:get-db dbstruct run-id) "UPDATE tests SET attemptnum=? WHERE id=?;" pid test-id)) (define (db:test-get-top-process-pid dbstruct run-id test-id) (sqlite3:first-result (db:get-db dbstruct run-id) "SELECT attemptnum FROM tests WHERE id=?;" | | | 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 | ;; (define (db:test-set-top-process-pid dbstruct run-id test-id pid) (sqlite3:execute (db:get-db dbstruct run-id) "UPDATE tests SET attemptnum=? WHERE id=?;" pid test-id)) (define (db:test-get-top-process-pid dbstruct run-id test-id) (sqlite3:first-result (db:get-db dbstruct run-id) "SELECT attemptnum FROM tests WHERE id=?;" test-id)) (define db:test-record-fields '("id" "run_id" "testname" "state" "status" "event_time" "host" "cpuload" "diskfree" "uname" "rundir" "item_path" "run_duration" "final_logf" "comment" "shortdir" "attemptnum")) ;; fields *must* be a non-empty list ;; |
︙ | ︙ |
Modified launch.scm from [32a59e3cd5] to [0c1efd6507].
︙ | ︙ | |||
341 342 343 344 345 346 347 | (and runtlim (let* ((run-seconds (- (current-seconds) start-seconds)) (time-exceeded (> run-seconds runtlim))) (if time-exceeded (begin (debug:print-info 0 "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim) #t) #f))))) | < < < < | > > | > > > | | | < < | | | < < < < < < < < | < < | | | < | < < | > > > | 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 | (and runtlim (let* ((run-seconds (- (current-seconds) start-seconds)) (time-exceeded (> run-seconds runtlim))) (if time-exceeded (begin (debug:print-info 0 "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim) #t) #f))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f) (if kill-job? (begin (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries? (let* ((pid1 (vector-ref exit-info 0)) (pid2 (rmt:test-get-top-process-pid run-id test-id)) (pids (delete-duplicates (filter number? (list pid1 pid2))))) (if (not (null? pids)) (begin (for-each (lambda (pid) (handle-exceptions exn (debug:print-info 0 "Unable to kill process with pid " pid ", possibly already killed.") (debug:print 0 "WARNING: Request received to kill job " pid) ;; " (attempt # " kill-tries ")") (process-signal pid signal/int) (thread-sleep! 5) (process-signal pid signal/kill))) pids) (tests:test-set-status! run-id test-id "KILLED" "KILLED" (args:get-arg "-m") #f)) (begin (debug:print 0 "ERROR: Nothing to kill, pid1=" pid1 ", pid2=" pid2) (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (args:get-arg "-m") #f) ))) (mutex-unlock! m) ;; no point in sticking around. Exit now. (exit))) (if keep-going (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if keep-going (loop (calc-minutes))))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f)))) ;; NOTE: Checking twice for keep-going is intentional (th1 (make-thread monitorjob "monitor job")) (th2 (make-thread runit "run job"))) (set! job-thread th2) (thread-start! th1) (thread-start! th2) (thread-join! th2) (debug:print-info 0 "Megatest exectute of test " test-name ", item path " item-path " complete. Notifying the db ...") (set! keep-going #f) (thread-join! th1) (thread-sleep! 1) ;; give thread th1 a chance to be done TODO: Verify this is needed. At 0.1 I was getting fail to stop, increased to total of 1.1 sec. (mutex-lock! m) (let* ((item-path (item-list->path itemdat)) ;; only state and status needed - use lazy routine (testinfo (rmt:get-testinfo-state-status run-id test-id))) |
︙ | ︙ |