Overview
Comment: | When checking for running tests if on same host do not use ssh |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v2.0001 |
Files: | files | file ages | folders |
SHA1: | de21785cce250a405cedf39c13b8fbeb |
User & Date: | mrwellan on 2022-02-15 11:30:24 |
Other Links: | branch diff | manifest | tags |
Context
2022-02-17
| ||
12:27 | Merged fork check-in: 162628b5d6 user: mrwellan tags: v2.0001 | |
2022-02-16
| ||
10:31 | Rebase forward. Closed-Leaf check-in: 09cc793198 user: mrwellan tags: v2.0001-disallow-eof | |
09:59 | rebase of v2.0001-dashboard Closed-Leaf check-in: 235f4e077d user: mrwellan tags: v2.0001-dashboard | |
2022-02-15
| ||
11:30 | When checking for running tests if on same host do not use ssh check-in: de21785cce user: mrwellan tags: v2.0001 | |
2022-02-14
| ||
21:18 | Speculative fix for db:get-status-from-final-status-file (untested) check-in: 0bdb58420b user: mrwellan tags: v2.0001 | |
Changes
Modified commonmod.scm from [875119b082] to [2a227221ee].
︙ | ︙ | |||
2700 2701 2702 2703 2704 2705 2706 | (loop (car tal)(cdr tal) loads proc-num phys-num core-num))))))))))) (define (common:unix-ping hostname) (let ((res (system (conc "ping -c 1 " hostname " > /dev/null")))) (eq? res 0))) (define (launch:is-test-alive host pid) | > > > > | | | | | | | < > | 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 | (loop (car tal)(cdr tal) loads proc-num phys-num core-num))))))))))) (define (common:unix-ping hostname) (let ((res (system (conc "ping -c 1 " hostname " > /dev/null")))) (eq? res 0))) (define (launch:is-test-alive host pid) (let* ((same-host (equal? host (get-host-name))) (cmd (conc (if same-host "" (conc "ssh "host" ")) "pstree -A "pid))) (if (and host pid (not (equal? host "n/a"))) (let* ((output (with-input-from-pipe cmd read-lines))) (debug:print 2 *default-log-port* "Running " cmd " received " output) (if (eq? (length output) 0) #f #t)) #t))) ;; assuming bad query is about a live test is likely not the right thing to do? (define (common:get-num-cpus remote-host) (let* ((actual-host (or remote-host (get-host-name)))) ;; hosts had better not be changing the number of cpus too often! (or (hash-table-ref/default *numcpus-cache* actual-host #f) (let* ((numcpus (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (pseudo-random-integer 3600))) (let* ((proc (lambda () |
︙ | ︙ |
Modified launchmod.scm from [9e6a47cc52] to [03e715c08b].
︙ | ︙ | |||
1907 1908 1909 1910 1911 1912 1913 | (debug:print 4 *default-log-port* "look for post hook. currseconds: " (current-seconds) " EOR " (rmt:get-var run-id (conc "end-of-run-" run-id))) (debug:print 0 *default-log-port* "End of Run Detected.") (rmt:set-var run-id (conc "end-of-run-" run-id) "yes") ;(thread-sleep! 10) (runs:run-post-hook run-id) (debug:print 4 *default-log-port* "currseconds: " (current-seconds)" eor: " (rmt:get-var run-id (conc "end-of-run-" run-id))) (common:simple-unlock (conc "endOfRun" run-id))) | | | 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 | (debug:print 4 *default-log-port* "look for post hook. currseconds: " (current-seconds) " EOR " (rmt:get-var run-id (conc "end-of-run-" run-id))) (debug:print 0 *default-log-port* "End of Run Detected.") (rmt:set-var run-id (conc "end-of-run-" run-id) "yes") ;(thread-sleep! 10) (runs:run-post-hook run-id) (debug:print 4 *default-log-port* "currseconds: " (current-seconds)" eor: " (rmt:get-var run-id (conc "end-of-run-" run-id))) (common:simple-unlock (conc "endOfRun" run-id))) (debug:print 0 *default-log-port* "End of Run Detected but not running post hook. This should happen when eor is set to yes. This will happen only when 2 tests exit at same time. eor= " (rmt:get-var run-id (conc "end-of-run-" run-id))))) ((> running-cnt 3) (debug:print 0 *default-log-port* "There are " running-cnt " tests running." )) ((> running-cnt 0) (debug:print 0 *default-log-port* "running cnt > 0 but <= 3 kill-running-tests-if-dead" ) (let ((kill-cnt (launch:kill-tests-if-dead run-id))) (if (and all-test-launched (equal? all-test-launched "yes") (eq? kill-cnt running-cnt)) (runs:end-of-run-check run-id)))) ;;todo |
︙ | ︙ | |||
1930 1931 1932 1933 1934 1935 1936 | (if (not (null? tal)) (loop (car tal) (cdr tal))))))))))) (define (runs:find-and-mark-incomplete-and-check-end-of-run run-id ovr-deadtime) (rmt:find-and-mark-incomplete run-id ovr-deadtime) (runs:end-of-run-check run-id)) | | < | | | | | | | | | | | | > | | | | | | | 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 | (if (not (null? tal)) (loop (car tal) (cdr tal))))))))))) (define (runs:find-and-mark-incomplete-and-check-end-of-run run-id ovr-deadtime) (rmt:find-and-mark-incomplete run-id ovr-deadtime) (runs:end-of-run-check run-id)) ;; only called if there are more than zero running tests (define (launch:kill-tests-if-dead run-id) (let* ((running-tests (rmt:get-tests-for-run run-id "%" `("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f))) (let loop ((running-test (car running-tests)) (tal (cdr running-tests)) (kill-cnt 0)) (let* ((test-name (vector-ref running-test 2)) (item-path (vector-ref running-test 11)) (test-id (vector-ref running-test 0)) (host (vector-ref running-test 6)) (pid (rmt:test-get-top-process-pid run-id test-id)) (event-time (vector-ref running-test 5)) (duration (vector-ref running-test 12)) (flag 0) (curr-time (current-seconds))) (if (and (< (+ event-time duration 600) curr-time) (not (launch:is-test-alive host pid))) ;;test has not updated duration in last 10 min then likely its not running but confirm before marking it as killed (begin (debug:print 0 *default-log-port* "test " test-name "/" item-path " needs to be killed") (set! flag 1) (rmt:set-state-status-and-roll-up-items run-id test-name item-path "KILLREQ" "n/a" #f))) (if (not (null? 
tal)) (loop (car tal) (cdr tal) (+ kill-cnt flag)) (+ kill-cnt flag)))))) (define (runs:run-post-hook run-id) (let* ((run-post-hook (configf:lookup *configdat* "runs" "post-hook")) (existing-tests (if run-post-hook (rmt:get-tests-for-run run-id "%" '() '() ;; run-id testpatt states statuses #f #f ;; offset limit |
︙ | ︙ |