Overview
Comment: | added zombie test sweep to launch:execute |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65-telemetry |
Files: | files | file ages | folders |
SHA1: |
0db6a22974d200dba4cc949284a96ed5 |
User & Date: | bjbarcla on 2019-02-20 18:22:56 |
Other Links: | branch diff | manifest | tags |
Context
2019-02-20
| ||
18:24 | added zombie test sweep to launch:execute check-in: 071b3404c3 user: bjbarcla tags: v1.65-telemetry | |
18:22 | added zombie test sweep to launch:execute check-in: 0db6a22974 user: bjbarcla tags: v1.65-telemetry | |
16:21 | added loop to wait for testconfig to show up (combat nfs cache latency) ; added testconfig path derived from launch:execute payload to search path in case tests:get-all has gaps due to nfs cache latency check-in: 71819d884d user: bjbarcla tags: v1.65-telemetry | |
Changes
Modified db.scm from [7ee427f00f] to [f81e996991].
︙ | ︙ | |||
1700 1701 1702 1703 1704 1705 1706 | ;; (or (not dbexists) ;; if no file then something wrong - mark as incomplete ;; (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim ;; incompleted)) (min-incompleted-ids (map car incompleted)) ;; do 'em all (all-ids (append min-incompleted-ids (map car oldlaunched)))) (if (> (length all-ids) 0) (begin | | | | | 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 | ;; (or (not dbexists) ;; if no file then something wrong - mark as incomplete ;; (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim ;; incompleted)) (min-incompleted-ids (map car incompleted)) ;; do 'em all (all-ids (append min-incompleted-ids (map car oldlaunched)))) (if (> (length all-ids) 0) (begin (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD") (for-each (lambda (test-id) (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")) ;;(db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS. ref ticket 220546828 all-ids)))))))) ;; ALL REPLACED BY THE BLOCK ABOVE ;; ;; (sqlite3:execute ;; db ;; (conc "UPDATE tests SET state='INCOMPLETE' WHERE run_id=? AND id IN (" |
︙ | ︙ |
Modified launch.scm from [3d75421951] to [14b5655bbb].
︙ | ︙ | |||
393 394 395 396 397 398 399 | (get-df (current-directory)) disk-free)) (delta (abs (- df disk-free)))) (if (and (> df 0) (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg df #f))) | | | > > > > > > > | > > | < < < | | > > > | > > > > | 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 | (get-df (current-directory)) disk-free)) (delta (abs (- df disk-free)))) (if (and (> df 0) (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg df #f))) (do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") (set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests") (set! kill-job? #t))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) (if do-sync (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries? (let* ((pid1 (launch:einf-pid exit-info)) ;; (vector-ref exit-info 0)) (pid2 (rmt:test-get-top-process-pid run-id test-id)) (pids (delete-duplicates (filter number? (list pid1 pid2))))) |
︙ | ︙ | |||
440 441 442 443 444 445 446 | (handle-exceptions exn #f (process-signal pid-num signal/kill))) (process:get-sub-pids pid)))) ;; (debug:print-info 0 *default-log-port* "not killing process " pid " as it is not alive")))) pids) | | | | 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 | (handle-exceptions exn #f (process-signal pid-num signal/kill))) (process:get-sub-pids pid)))) ;; (debug:print-info 0 *default-log-port* "not killing process " pid " as it is not alive")))) pids) (tests:test-set-status! run-id test-id "KILLED" "KILLED" (conc (args:get-arg "-m")" "kill-reason) #f)) (begin (debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2) (tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ))) (mutex-unlock! m) ;; no point in sticking around. Exit now. (exit))) (if (hash-table-ref/default misc-flags 'keep-going #f) (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses |
︙ | ︙ | |||
618 619 620 621 622 623 624 | ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")) (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed") (exit)))) | | | 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 | ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) (else ;; (member (db:test-get-state test-info) '("REMOVING" "REMOTEHOSTSTART" "RUNNING" "KILLREQ")) (debug:print-error 0 *default-log-port* "test state is " (db:test-get-state test-info) ", cannot proceed") (exit)))) (debug:print 2 *default-log-port* "Executing " test-name " (id: " test-id ") on " (get-host-name)) (set! keys (rmt:get-keys)) ;; (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;; these may be needed by the launching process ;; one of these is defunct/redundant ... (if (not (launch:setup force-reread: #t)) (begin (debug:print 0 *default-log-port* "Failed to setup, exiting") ;; (sqlite3:finalize! db) |
︙ | ︙ | |||
1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 | (list lnkpathf lnkpath )) (if (and test-src-path (> remtries 0)) (begin (debug:print-error 0 *default-log-port* "Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries) ;; (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat remtries: (- remtries 1))) (list #f #f))))) ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch | > > > > > > > > > > > > > > > > > > > > > > | 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 | (list lnkpathf lnkpath )) (if (and test-src-path (> remtries 0)) (begin (debug:print-error 0 *default-log-port* "Failed to create work area at " test-path " with link at " lnktarget ", remaining attempts " remtries) ;; (create-work-area run-id run-info keyvals test-id test-src-path disk-path testname itemdat remtries: (- remtries 1))) (list #f #f))))) (define (launch:handle-zombie-tests run-id) (let* ((key (conc "zombiescan-runid-"run-id)) (now (current-seconds)) (threshold (- (current-seconds) (* 2 (or (configf:lookup-number *configdat* "setup" "deadtime") 120)))) (val (rmt:get-var key)) (do-scan? (cond ((not val) #t) ((< val threshold) #t) (else #f)))) (when do-scan? (debug:print 1 *default-log-port* "INFO: search and mark zombie tests") (rmt:set-var key (current-seconds)) (rmt:find-and-mark-incomplete run-id #f)))) ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch |
︙ | ︙ |