Overview
Comment: | trim some load control sleeps in runs.scm that are not needed as we have new load control ==/9.8/2.2/WARN/1203/mars== |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65-cleanup |
Files: | files | file ages | folders |
SHA1: |
8280550d675d1c3e45309c3ba2f49e46 |
User & Date: | matt on 2020-08-27 07:10:36 |
Original Comment: | trim some load control sleeps in runs.scm that are not needed as we have new load control |
Other Links: | branch diff | manifest | tags |
Context
2020-08-27
| ||
07:56 | Comment out a telemetry call check-in: def8386816 user: matt tags: v1.65-cleanup | |
07:10 | trim some load control sleeps in runs.scm that are not needed as we have new load control ==/9.8/2.2/WARN/1203/mars== check-in: 8280550d67 user: matt tags: v1.65-cleanup | |
2020-08-26
| ||
23:04 | Tweaked server gate - still not right :( ==/FAIL/orion/== check-in: 7e26fb2f0c user: mrwellan tags: v1.65-cleanup | |
Changes
Modified runs.scm from [96a5410a91] to [b6fddf76c2].
︙ | ︙ | |||
87 88 89 90 91 92 93 | ((< age 10) #t) (else #f)))) lock-files))) (if fresh-locks (begin (if (runs:lownoise "runners-softlock-wait" 360) (debug:print-info 0 *default-log-port* "Other runners in flight, giving up some time...")) | | | 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 | ((< age 10) #t) (else #f)))) lock-files))) (if fresh-locks (begin (if (runs:lownoise "runners-softlock-wait" 360) (debug:print-info 0 *default-log-port* "Other runners in flight, giving up some time...")) (thread-sleep! 2)) (begin (if (runs:lownoise "runners-softlock-nowait" 360) (debug:print-info 0 *default-log-port* "No runners in flight, updating softlock")) (let* ((ouf (open-output-file my-lock-file))) (with-output-to-port ouf (lambda ()(print (current-seconds)))) (close-output-port ouf)))) |
︙ | ︙ | |||
311 312 313 314 315 316 317 | (define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) | | | | | | | | | | | | | 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 | (define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) ;; (thread-sleep! (cond ;; BB: check with Matt. Should this sleep move ;; ;; to cond clauses below where we determine we ;; ;; have too many jobs running rather than each ;; ;; time the and condition above is true (which ;; ;; seems like always)? ;; ((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way ;; 0) ;; ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time ;; (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ...")) ;; 10) ;; obviously haven't had any work to do for a while ;; (else 0))) ;; ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero? ;; (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01) ;; ))) (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup))) |
︙ | ︙ | |||
991 992 993 994 995 996 997 | (null? non-completed)) (debug:print-info 4 *default-log-port* "cond branch - " "ei-4") (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; ;; getting here likely means the system is way overloaded, kill a full minute before continuing | | | 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 | (null? non-completed)) (debug:print-info 4 *default-log-port* "cond branch - " "ei-4") (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; ;; getting here likely means the system is way overloaded, kill a full minute before continuing (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while."))) |
︙ | ︙ | |||
1805 1806 1807 1808 1809 1810 1811 | (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) | | | 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 | (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) (thread-sleep! 1) ;; (if (>= num-running max-concurrent-jobs) 5 1)) (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. ;; (debug:print-info 0 *default-log-port* "Calling Post Hook") ;; (runs:run-post-hook run-id) |
︙ | ︙ | |||
2533 2534 2535 2536 2537 2538 2539 | (debug:print-info 2 *default-log-port* "new state " new-state ", new status " new-status ) (mt:test-set-state-status-by-id run-id test-id new-state new-status #f)) (if (not (null? tal)) (loop (car tal)(cdr tal)))) ((run-wait) ;; BB TODO - manage has-subrun case (debug:print-info 2 *default-log-port* "still waiting, " (length tests) " tests still running") | | | 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 | (debug:print-info 2 *default-log-port* "new state " new-state ", new status " new-status ) (mt:test-set-state-status-by-id run-id test-id new-state new-status #f)) (if (not (null? tal)) (loop (car tal)(cdr tal)))) ((run-wait) ;; BB TODO - manage has-subrun case (debug:print-info 2 *default-log-port* "still waiting, " (length tests) " tests still running") (thread-sleep! 5) (let ((new-tests (proc-get-tests run-id))) (if (null? new-tests) (debug:print-info 1 *default-log-port* "Run completed according to zero tests matching provided criteria.") (loop (car new-tests)(cdr new-tests))))) ((archive) ;; BB TODO - manage has-subrun case (if (and run-dir (not toplevel-with-children)) |
︙ | ︙ |