Overview
Context
Changes
Modified commonmod.scm
from [875119b082]
to [2a227221ee].
︙ | | |
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
|
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
|
+
+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+
-
+
|
(loop (car tal)(cdr tal) loads proc-num phys-num core-num)))))))))))
(define (common:unix-ping hostname)
(let ((res (system (conc "ping -c 1 " hostname " > /dev/null"))))
(eq? res 0)))
(define (launch:is-test-alive host pid)
(let* ((same-host (equal? host (get-host-name)))
(cmd (conc
(if same-host "" (conc "ssh "host" "))
"pstree -A "pid)))
(if (and host pid (not (equal? host "n/a")))
(let* ((cmd (conc "ssh " host " pstree -A " pid))
(output (with-input-from-pipe cmd read-lines)))
(debug:print 2 *default-log-port* "Running " cmd " received " output)
(if (eq? (length output) 0)
#f
#t))
(if (and host pid
(not (equal? host "n/a")))
(let* ((output (with-input-from-pipe cmd read-lines)))
(debug:print 2 *default-log-port* "Running " cmd " received " output)
(if (eq? (length output) 0)
#f
#t))
#t))
#t))) ;; assuming bad query is about a live test is likely not the right thing to do?
(define (common:get-num-cpus remote-host)
(let* ((actual-host (or remote-host (get-host-name))))
;; hosts had better not be changing the number of cpus too often!
(or (hash-table-ref/default *numcpus-cache* actual-host #f)
(let* ((numcpus (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (pseudo-random-integer 3600)))
(let* ((proc (lambda ()
|
︙ | | |
Modified launchmod.scm
from [9e6a47cc52]
to [03e715c08b].
︙ | | |
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
|
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
|
-
+
|
(debug:print 4 *default-log-port* "look for post hook. currseconds: " (current-seconds) " EOR " (rmt:get-var run-id (conc "end-of-run-" run-id)))
(debug:print 0 *default-log-port* "End of Run Detected.")
(rmt:set-var run-id (conc "end-of-run-" run-id) "yes")
;(thread-sleep! 10)
(runs:run-post-hook run-id)
(debug:print 4 *default-log-port* "currseconds: " (current-seconds)" eor: " (rmt:get-var run-id (conc "end-of-run-" run-id)))
(common:simple-unlock (conc "endOfRun" run-id)))
(debug:print 0 *default-log-port* "End of Run Detected but not running post hook. This should happen when eor is set to yes. This will happen only when 2 tests exit at smae time. eor= " (rmt:get-var run-id (conc "end-of-run-" run-id)))))
(debug:print 0 *default-log-port* "End of Run Detected but not running post hook. This should happen when eor is set to yes. This will happen only when 2 tests exit at same time. eor= " (rmt:get-var run-id (conc "end-of-run-" run-id)))))
((> running-cnt 3)
(debug:print 0 *default-log-port* "There are " running-cnt " tests running." ))
((> running-cnt 0)
(debug:print 0 *default-log-port* "running cnt > 0 but <= 3 kill-running-tests-if-dead" )
(let ((kill-cnt (launch:kill-tests-if-dead run-id)))
(if (and all-test-launched (equal? all-test-launched "yes") (eq? kill-cnt running-cnt))
(runs:end-of-run-check run-id)))) ;;todo
|
︙ | | |
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
|
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
|
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
+
|
(if (not (null? tal))
(loop (car tal) (cdr tal)))))))))))
(define (runs:find-and-mark-incomplete-and-check-end-of-run run-id ovr-deadtime)
(rmt:find-and-mark-incomplete run-id ovr-deadtime)
(runs:end-of-run-check run-id))
;; only called if there are more than zero running tests
(define (launch:kill-tests-if-dead run-id)
(let* ((running-tests (rmt:get-tests-for-run run-id "%" `("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f)))
(let loop ((running-test (car running-tests))
(tal (cdr running-tests))
(kill-cnt 0))
(let* ((test-name (vector-ref running-test 2))
(item-path (vector-ref running-test 11))
(test-id (vector-ref running-test 0))
(host (vector-ref running-test 6))
(pid (rmt:test-get-top-process-pid run-id test-id))
(event-time (vector-ref running-test 5))
(duration (vector-ref running-test 12))
(flag 0)
(curr-time (current-seconds)))
(if (and (< (+ event-time duration 600) curr-time) (not (launch:is-test-alive host pid))) ;;test has not updated duration in last 10 min then likely its not running but confirm before marking it as killed
(begin
(debug:print 0 *default-log-port* "test " test-name "/" item-path " needs to be killed")
(let loop ((running-test (car running-tests))
(tal (cdr running-tests))
(kill-cnt 0))
(let* ((test-name (vector-ref running-test 2))
(item-path (vector-ref running-test 11))
(test-id (vector-ref running-test 0))
(host (vector-ref running-test 6))
(pid (rmt:test-get-top-process-pid run-id test-id))
(event-time (vector-ref running-test 5))
(duration (vector-ref running-test 12))
(flag 0)
(curr-time (current-seconds)))
(if (and (< (+ event-time duration 600) curr-time)
(not (launch:is-test-alive host pid))) ;;test has not updated duration in last 10 min then likely its not running but confirm before marking it as killed
(begin
(debug:print 0 *default-log-port* "test " test-name "/" item-path " needs to be killed")
(set! flag 1)
(rmt:set-state-status-and-roll-up-items run-id test-name item-path "KILLREQ" "n/a" #f)))
(if (not (null? tal))
(loop (car tal) (cdr tal) (+ kill-cnt flag))
(+ kill-cnt flag))))))
(if (not (null? tal))
(loop (car tal) (cdr tal) (+ kill-cnt flag))
(+ kill-cnt flag))))))
(define (runs:run-post-hook run-id)
(let* ((run-post-hook (configf:lookup *configdat* "runs" "post-hook"))
(existing-tests (if run-post-hook
(rmt:get-tests-for-run run-id "%" '() '() ;; run-id testpatt states statuses
#f #f ;; offset limit
|
︙ | | |