Overview
Comment: | Merged in last of testdat changes from testdat2 via meld |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65 |
Files: | files | file ages | folders |
SHA1: | 21849054cacf4bd4d07ebc04019fba05 |
User & Date: | matt on 2020-10-27 23:44:55 |
Other Links: | branch diff | manifest | tags |
Context
2020-10-28
| ||
09:58 | Added to manual ezsteps section check-in: 60c23b1e99 user: mrwellan tags: v1.65 | |
2020-10-27
| ||
23:44 | Merged in last of testdat changes from testdat2 via meld check-in: 21849054ca user: matt tags: v1.65 | |
22:43 | Updated ezsteps propagation and documentation check-in: b8cc18503a user: matt tags: v1.65 | |
Changes
Modified db.scm from [7fd742bffc] to [00e75ddfd8].
︙ | ︙ | |||
1849 1850 1851 1852 1853 1854 1855 | ;; HOWEVER: this code in run:test seems to work fine ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) ;; (db:test-get-run_duration testdat))) ;; 600) ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) | > | | | | | | | | | | | > | | | | | | | | | | | | 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 | ;; HOWEVER: this code in run:test seems to work fine ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) ;; (db:test-get-run_duration testdat))) ;; 600) ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)) (if (and (equal? uname "n/a") (equal? item-path "")) ;; this is a toplevel test ;; what to do with toplevel? call rollup? (begin (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) (begin (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds) " event-time="event-time" run-duration="run-duration))))) stmth1 run-id running-deadtime) ;; default time 720 seconds (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)) (if (and (equal? uname "n/a") (equal? item-path "")) ;; this is a toplevel test ;; what to do with toplevel? call rollup? (begin (set! 
toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) (begin (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time " run-duration="run-duration) (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))) stmth2 run-id remotehoststart-deadtime) ;; default time 230 seconds ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config ;; ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row |
︙ | ︙ |
Modified launch.scm from [0de0e6aa2e] to [b35a784abe].
︙ | ︙ | |||
207 208 209 210 211 212 213 | (round (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) | | | 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 | (round (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - ;; top of loop encountered at "(current-seconds)" with |
︙ | ︙ | |||
246 247 248 249 250 251 252 | ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") | | | 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 | ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) (if do-sync ;; save meta data about the running of this test |
︙ | ︙ | |||
311 312 313 314 315 316 317 | (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) | | | 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 | (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) (setenv "MT_CMDINFO" encoded-cmd) ;;(bb-check-path msg: "launch:execute incoming") |
︙ | ︙ |
Modified rmt.scm from [2716a38de0] to [05738b4bdc].
︙ | ︙ | |||
548 549 550 551 552 553 554 | (define (rmt:get-test-id run-id testname item-path) (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) ;; run-id is NOT used - but it will be! ;; (define (rmt:get-test-info-by-id run-id test-id) (if (number? test-id) | | > > > > > > > > > > > > > > > > > | 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 | (define (rmt:get-test-id run-id testname item-path) (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) ;; run-id is NOT used - but it will be! ;; (define (rmt:get-test-info-by-id run-id test-id) (if (number? test-id) (let* ((testdat (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))) (trundir (vector-ref testdat 10)) (trundatf (conc trundir"/.mt_data/test-run.dat"))) ;; now we can update a couple fields from the filesystem (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "Could not update testdat record from "trundatf", exn=" exn) #f) (if (and trundir (file-exists? trundatf)) (let* ((duration (vector-ref testdat 12)) ;; (db:test-get-run_duration testdat)) (event-time (vector-ref testdat 5)) ;; (db:test-get-event_time testdat)) (last-touch (file-modification-time trundatf)) (new-duration (max duration (- last-touch event-time)))) (vector-set! testdat 12 new-duration)))) #;(db:test-set-run_duration! testdat (max duration (- last-touch event-time))) testdat) (begin (debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) (print-call-chain (current-error-port)) #f))) (define (rmt:test-get-rundir-from-test-id run-id test-id) (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id))) |
︙ | ︙ |