Overview
Comment: | small bugfix to get-cpu-load and policy change so tests marked dead which are running are moved back to running instead of killed |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65 | v1.6525 |
Files: | files | file ages | folders |
SHA1: |
fa5f74982be4138bda1124183612e353 |
User & Date: | bjbarcla on 2019-02-28 14:30:22 |
Other Links: | branch diff | manifest | tags |
Context
2019-03-12
| ||
18:26 | removing Chesterton's fence check-in: 53ed616f9f user: bjbarcla tags: v1.65-nosleep | |
18:22 | Add kill-runs to actions applicable to remove-keep check-in: 183f89d345 user: mrwellan tags: v1.65 | |
2019-02-28
| ||
14:30 | small bugfix to get-cpu-load and policy change so tests marked dead which are running are moved back to running instead of killed check-in: fa5f74982b user: bjbarcla tags: v1.65, v1.6525 | |
2019-02-27
| ||
19:47 | bumped version to v1.6525 check-in: 3a17917329 user: bjbarcla tags: v1.65 | |
Changes
Modified common.scm from [be82152a65] to [c41ac723cd].
︙ | ︙ | |||
1591 1592 1593 1594 1595 1596 1597 | exn #f (with-output-to-file fullpath (lambda ()(pp dat)))))) ;; get cpu load by reading from /proc/loadavg, return all three values ;; (define (common:get-cpu-load remote-host) | > > > | | | | | | | | | | | | 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 | exn #f (with-output-to-file fullpath (lambda ()(pp dat)))))) ;; get cpu load by reading from /proc/loadavg, return all three values ;; (define (common:get-cpu-load remote-host) (handle-exceptions exn '(99 99 99) (let* ((actual-hostname (or remote-host (get-host-name) "localhost"))) (or (common:get-cached-info actual-hostname "cpu-load") (let ((result (if remote-host (map (lambda (res) (if (eof-object? res) 9e99 res)) (with-input-from-pipe (conc "ssh " remote-host " cat /proc/loadavg") (lambda ()(list (read)(read)(read))))) (with-input-from-file "/proc/loadavg" (lambda ()(list (read)(read)(read))))))) (common:write-cached-info actual-hostname "cpu-load" result) result))))) ;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads ;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc. ;; keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load ;; (define (common:get-normalized-cpu-load remote-host) (let ((res (common:get-normalized-cpu-load-raw remote-host)) |
︙ | ︙ |
Modified launch.scm from [8c6f051622] to [9f824ebdab].
︙ | ︙ | |||
411 412 413 414 415 416 417 | ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") | > > | | | 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 | ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) (when do-sync ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) |
︙ | ︙ |