Megatest: Diff

Differences From Artifact [ace8916649]:

File runs.scm — part of check-in [3213d340ac] at 2020-08-20 20:29:55 on branch v1.65 — Added requirements section support to extract skeleton. Includes runner throttling change (get vars from configs). ==/7.2/0.9/WARN/1201/mars/== (user: mrwellan, size: 157310) [annotate] [blame] [check-ins using] [more...]

To Artifact [f86e80cfc1]:

File runs.scm — part of check-in [9657acc77f] at 2020-08-28 14:24:24 on branch v1.65-cleanup — lock with host/pid (user: matt, size: 158816) [annotate] [blame] [check-ins using]

︙			︙
47 48 49 50 51 52 53 54 55 56 57 58 59 60	runname max-concurrent-jobs run-id test-patts required-tests test-registry registry-mutex flags keyvals run-info all-tests-registry can-run-more-tests ((can-run-more-tests-count 0) : fixnum) (last-fuel-check 0) ;; time when we last checked fuel (beginning-of-time (current-seconds)) ) (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup waitons testmode newtal itemmaps prereqs-not-met)	>	47 48 49 50 51 52 53 54 55 56 57 58 59 60 61	runname max-concurrent-jobs run-id test-patts required-tests test-registry registry-mutex flags keyvals run-info all-tests-registry can-run-more-tests ((can-run-more-tests-count 0) : fixnum) (last-fuel-check 0) ;; time when we last checked fuel (beginning-of-time (current-seconds)) (load-mgmt-function #f) ) (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup waitons testmode newtal itemmaps prereqs-not-met)
︙			︙
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101	((< age 10) #t) (else #f)))) lock-files))) (if fresh-locks (begin (if (runs:lownoise "runners-softlock-wait" 360) (debug:print-info 0 default-log-port "Other runners in flight, giving up some time...")) ~~(thread-sleep! 10))~~ (begin (if (runs:lownoise "runners-softlock-nowait" 360) (debug:print-info 0 default-log-port "No runners in flight, updating softlock")) (let* ((ouf (open-output-file my-lock-file))) (with-output-to-port ouf (lambda ()(print (current-seconds)))) (close-output-port ouf))))	\|	88 89 90 91 92 93 94 95 96 97 98 99 100 101 102	((< age 10) #t) (else #f)))) lock-files))) (if fresh-locks (begin (if (runs:lownoise "runners-softlock-wait" 360) (debug:print-info 0 default-log-port "Other runners in flight, giving up some time...")) (thread-sleep! 2)) (begin (if (runs:lownoise "runners-softlock-nowait" 360) (debug:print-info 0 default-log-port "No runners in flight, updating softlock")) (let* ((ouf (open-output-file my-lock-file))) (with-output-to-port ouf (lambda ()(print (current-seconds)))) (close-output-port ouf))))
︙			︙
211 212 213 214 215 216 217 ~~218~~ 219 220 221 ~~222~~ 223 224 225 226 227 228 229	(let fatal-loop ((count 0)) (handle-exceptions exn (let ((call-chain (get-call-chain)) (msg ((condition-property-accessor 'exn 'message) exn))) (if (< count 5) (begin ;; this call is colliding, do some crude stuff to fix it. ~~(debug:print 0 default-log-port "ERROR: configdat was inaccessible! This should never happen. Retry #" count)~~ (launch:setup force-reread: #t) (fatal-loop (+ count 1))) (begin ~~(debug:print 0 default-log-port "FATAL: configdat was inaccessible! This should never happen. Retried " count ~~" times. Message: " msg)~~~~ (debug:print 0 default-log-port "Call chain:") (with-output-to-port default-log-port (lambda () (print "configdat is >>"configdat"<<") (pp configdat) (pp call-chain)))	\| > \| >	212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232	(let fatal-loop ((count 0)) (handle-exceptions exn (let ((call-chain (get-call-chain)) (msg ((condition-property-accessor 'exn 'message) exn))) (if (< count 5) (begin ;; this call is colliding, do some crude stuff to fix it. (debug:print 0 default-log-port "ERROR: configdat was inaccessible! This should never happen. Retry #" count ", exn=" exn) (launch:setup force-reread: #t) (fatal-loop (+ count 1))) (begin (debug:print 0 default-log-port "FATAL: configdat was inaccessible! This should never happen. Retried " count " times. Message: " msg) (debug:print 0 default-log-port "Call chain:") (with-output-to-port default-log-port (lambda () (print "configdat is >>"configdat"<<") (pp configdat) (pp call-chain)))
︙			︙
309 310 311 312 313 314 315 ~~316 317 318 319 320~~ ~~321 322 323 324 325 326 327~~ 328 329 330 331 332 333 334	(define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) (thread-sleep! (cond ;; BB: check with Matt. Should this sleep move ;; to cond clauses below where we determine we ;; have too many jobs running rather than each ;; time the and condition above is true (which ;; seems like always)? ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 default-log-port "waiting for tasks to complete, sleeping briefly ...")) 10) ;; obviously haven't had any work to do for a while (else 0))) ;; ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero? ;; (configf:lookup-number configdat "setup" "inter-test-delay" default: 0.01) ;; ))) (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup configdat "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count))))	> > \| \| \| \| \| > > \| \| \| \| \| \| \|	312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341	(define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) ;; (thread-sleep! (cond ;; BB: check with Matt. Should this sleep move ;; ;; to cond clauses below where we determine we ;; ;; have too many jobs running rather than each ;; ;; time the and condition above is true (which ;; ;; seems like always)? ;; ((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way ;; 0) ;; ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time ;; (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 default-log-port "waiting for tasks to complete, sleeping briefly ...")) ;; 10) ;; obviously haven't had any work to do for a while ;; (else 0))) ;; ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero? ;; (configf:lookup-number configdat "setup" "inter-test-delay" default: 0.01) ;; ))) (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup configdat "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count))))
︙			︙
376 377 378 379 380 381 382 ~~383~~ 384 385 386 387 388 389 390 391 392 393 ~~394~~ 395 396 397 398 399 400 401	(full-log-fname (conc log-dir "/" log-file))) (if run-pre-hook (if (null? existing-tests) (let* ((use-log-dir (if (not (directory-exists? log-dir)) (handle-exceptions exn (begin ~~(debug:print 0 default-log-port "WARNING: Failed to create " log-dir)~~ #f) (create-directory log-dir #t) #t) #t)) (start-time (current-seconds)) (actual-logf (if use-log-dir full-log-fname log-file))) (handle-exceptions exn (begin (print-call-chain default-log-port) ~~(debug:print 0 default-log-port "Message: " ((condition-property-accessor 'exn 'message) exn))~~ (debug:print 0 default-log-port "ERROR: failed to run pre-hook " run-pre-hook ", check the log " log-file)) (debug:print-info 0 default-log-port "running run-pre-hook: \"" run-pre-hook "\", log is " actual-logf) (system (conc run-pre-hook " >> " actual-logf " 2>&1")) (debug:print-info 0 default-log-port "pre-hook \"" run-pre-hook "\" took " (- (current-seconds) start-time) " seconds to run."))) (debug:print 0 default-log-port "Skipping pre-hook call \"" run-pre-hook "\" as there are existing tests for this run."))))) (define (runs:run-post-hook run-id)	\| \|	383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408	(full-log-fname (conc log-dir "/" log-file))) (if run-pre-hook (if (null? existing-tests) (let* ((use-log-dir (if (not (directory-exists? log-dir)) (handle-exceptions exn (begin (debug:print 0 default-log-port "WARNING: Failed to create " log-dir ", exn=" exn) #f) (create-directory log-dir #t) #t) #t)) (start-time (current-seconds)) (actual-logf (if use-log-dir full-log-fname log-file))) (handle-exceptions exn (begin (print-call-chain default-log-port) (debug:print 0 default-log-port "Message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (debug:print 0 default-log-port "ERROR: failed to run pre-hook " run-pre-hook ", check the log " log-file)) (debug:print-info 0 default-log-port "running run-pre-hook: \"" run-pre-hook "\", log is " actual-logf) (system (conc run-pre-hook " >> " actual-logf " 2>&1")) (debug:print-info 0 default-log-port "pre-hook \"" run-pre-hook "\" took " (- (current-seconds) start-time) " seconds to run."))) (debug:print 0 default-log-port "Skipping pre-hook call \"" run-pre-hook "\" as there are existing tests for this run."))))) (define (runs:run-post-hook run-id)
︙			︙
416 417 418 419 420 421 422 ~~423~~ 424 425 426 427 428 429 430 431 432 433 ~~434~~ 435 436 437 438 439 440 441	(if run-post-hook ;; (if (null? existing-tests) ;; (debug:print 0 default-log-port "Skipping post-hook call \"" run-post-hook "\" as there are existing tests for this run."))))) (let* ((use-log-dir (if (not (directory-exists? log-dir)) (handle-exceptions exn (begin ~~(debug:print 0 default-log-port "WARNING: Failed to create " log-dir)~~ #f) (create-directory log-dir #t) #t) #t)) (start-time (current-seconds)) (actual-logf (if use-log-dir full-log-fname log-file))) (handle-exceptions exn (begin (print-call-chain default-log-port) ~~(debug:print 0 default-log-port "Message: " ((condition-property-accessor 'exn 'message) exn))~~ (debug:print 0 default-log-port "ERROR: failed to run post-hook " run-post-hook ", check the log " log-file)) (debug:print-info 0 default-log-port "running run-post-hook: \"" run-post-hook "\", log is " actual-logf) (system (conc run-post-hook " >> " actual-logf " 2>&1")) (debug:print-info 0 default-log-port "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run.")))))) ;; return #t when all items in waitors-upon list are represented in test-patt, #f otherwise. (define (runs:testpatts-mention-waitors-upon? test-patt waitors-upon)	\| \|	423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448	(if run-post-hook ;; (if (null? existing-tests) ;; (debug:print 0 default-log-port "Skipping post-hook call \"" run-post-hook "\" as there are existing tests for this run."))))) (let* ((use-log-dir (if (not (directory-exists? log-dir)) (handle-exceptions exn (begin (debug:print 0 default-log-port "WARNING: Failed to create " log-dir ", exn=" exn) #f) (create-directory log-dir #t) #t) #t)) (start-time (current-seconds)) (actual-logf (if use-log-dir full-log-fname log-file))) (handle-exceptions exn (begin (print-call-chain default-log-port) (debug:print 0 default-log-port "Message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (debug:print 0 default-log-port "ERROR: failed to run post-hook " run-post-hook ", check the log " log-file)) (debug:print-info 0 default-log-port "running run-post-hook: \"" run-post-hook "\", log is " actual-logf) (system (conc run-post-hook " >> " actual-logf " 2>&1")) (debug:print-info 0 default-log-port "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run.")))))) ;; return #t when all items in waitors-upon list are represented in test-patt, #f otherwise. (define (runs:testpatts-mention-waitors-upon? test-patt waitors-upon)
︙			︙
533 534 535 536 537 538 539 ~~540~~ 541 542 543 544 545 546 547	(debug:print-info 0 default-log-port "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests) ));; tests will be ANDed with this list ;; register this run in monitor.db (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params) (rmt:tasks-set-state-given-param-key task-key "running") ~~(common:telemetry-log "run-tests"~~ payload: `( (target . ,target) (run-name . ,runname) (test-patts . ,test-patts) ) ) ;; Now generate all the tests lists	\|	540 541 542 543 544 545 546 547 548 549 550 551 552 553 554	(debug:print-info 0 default-log-port "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests) ));; tests will be ANDed with this list ;; register this run in monitor.db (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params) (rmt:tasks-set-state-given-param-key task-key "running") #;(common:telemetry-log "run-tests" payload: `( (target . ,target) (run-name . ,runname) (test-patts . ,test-patts) ) ) ;; Now generate all the tests lists
︙			︙
747 748 749 750 751 752 753 ~~754~~ 755 756 757 758 759 760 761	(th2 (make-thread (lambda () ;; BBQ: why are we visiting ALL runs here? ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going (handle-exceptions exn ~~(debug:print 0 default-log-port "error in calling find-and-mark-incomplete for run-id " run-id)~~ (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) run-ids))) "runs: mark-incompletes"))) ;; (thread-start! th1) (thread-start! th2) ;; (thread-join! th1) ;; just do the main stuff in the main thread	\|	754 755 756 757 758 759 760 761 762 763 764 765 766 767 768	(th2 (make-thread (lambda () ;; BBQ: why are we visiting ALL runs here? ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going (handle-exceptions exn (debug:print 0 default-log-port "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn) (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) run-ids))) "runs: mark-incompletes"))) ;; (thread-start! th1) (thread-start! th2) ;; (thread-join! th1) ;; just do the main stuff in the main thread
︙			︙
985 986 987 988 989 990 991 ~~992~~ 993 ~~994~~ 995 996 997 998 999 1000 1001	((and (null? fails) ;; have not-started tests, but unable to run them. everything looks completed with no prospect of unsticking something that is stuck. we should mark hed as moribund and exit or continue if there are more tests to consider (null? prereq-fails) (null? non-completed)) (debug:print-info 4 default-log-port "cond branch - " "ei-4") (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) ~~(debug:print-info 0 default-log-port "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default seen-cant-run-tests hed 0)) ;;~~ ;; getting here likely means the system is way overloaded, kill a full minute before continuing ~~(thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)~~ ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 default-log-port "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))	\| \| > > > >	992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012	((and (null? fails) ;; have not-started tests, but unable to run them. everything looks completed with no prospect of unsticking something that is stuck. we should mark hed as moribund and exit or continue if there are more tests to consider (null? prereq-fails) (null? non-completed)) (debug:print-info 4 default-log-port "cond branch - " "ei-4") (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) (debug:print-info 0 default-log-port "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default seen-cant-run-tests hed 0) ", going to wait 60 sec.") ;; ;; getting here likely means the system is way overloaded, kill a full minute before continuing ;; (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) CHECKTHIS!!! ;; No runsdat, can't do this yet ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) ;; (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 default-log-port "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))
︙			︙
1117 1118 1119 1120 1121 1122 1123 ~~1124~~ 1125 1126 1127 1128 1129 1130 1131	(if (vector? t) (conc (db:test-get-state t) "/" (db:test-get-status t)) (conc " WARNING: t is not a vector=" t ))) prereqs-not-met) ", ") ") fails: " fails "\nregistered? " (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f)) (if (and (not (null? prereqs-not-met)) (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) (debug:print-info 2 default-log-port "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) ;; Don't know at this time if the test have been launched at some time in the past ;; i.e. is this a re-launch?	> > > > > > > > > > > > \| > > > > > >	1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160	(if (vector? t) (conc (db:test-get-state t) "/" (db:test-get-status t)) (conc " WARNING: t is not a vector=" t ))) prereqs-not-met) ", ") ") fails: " fails "\nregistered? " (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f)) ;; well, first lets see if cpu load throttling is enabled. If so wait around until the ;; average cpu load is under the threshold before continuing ;; (if (runs:dat-load-mgmt-function runsdat) ((runs:dat-load-mgmt-function runsdat)) (runs:dat-load-mgmt-function-set! runsdat (lambda () ;; jobtools maxload is useful for where the full Megatest run is done on one machine (if (and (not (common:on-homehost?)) maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues (if maxhomehostload (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))))) (if (and (not (null? prereqs-not-met)) (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) (debug:print-info 2 default-log-port "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) ;; Don't know at this time if the test have been launched at some time in the past ;; i.e. is this a re-launch?
︙			︙
1190 1191 1192 1193 1194 1195 1196 ~~1197~~ 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 ~~1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224~~ 1225 1226 1227 1228 1229 1230 1231	;; If no resources are available just kill time and loop again ;; ((not have-resources) ;; simply try again after waiting a second (if (runs:lownoise "no resources" 60) (debug:print-info 1 default-log-port "no resources to run new tests, waiting ...")) ;; Have gone back and forth on this but db starvation is an issue. ;; wait one second before looking again to run jobs. ~~(thread-sleep! 1)~~ ;; could have done hed tal here but doing car/cdr of newtal to rotate tests (list (car newtal)(cdr newtal) reg reruns)) ;; This is the final stage, everything is in place so launch the test ;; ((and have-resources (or (null? prereqs-not-met) (and (member 'toplevel testmode) ;; 'toplevel) (null? non-completed) (not (member 'exclusive testmode))))) ;; (hash-table-delete! max-tries-hash (db:test-make-full-name test-name item-path)) ;; we are going to reset all the counters for test retries by setting a new hash table ;; this means they will increment only when nothing can be run (set! max-tries-hash (make-hash-table)) ~~;; well, first lets see if cpu load throttling is enabled. If so wait around until the~~ ~~;; average cpu load is under the threshold before continuing~~ ~~;; jobtools maxload is useful for where the full Megatest run is done on one machine~~ ~~(if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized~~ ~~(common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))~~ ~~;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues~~ ~~(if maxhomehostload~~ ~~(common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))~~ (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry) (runs:incremental-print-results run-id) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! global-delta) (if (or (not (null? tal))(not (null? reg))) (runs:loop-values tal reg reglen regfull reruns) #f))	\| < < < < < < < < < < < \|	1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249	;; If no resources are available just kill time and loop again ;; ((not have-resources) ;; simply try again after waiting a second (if (runs:lownoise "no resources" 60) (debug:print-info 1 default-log-port "no resources to run new tests, waiting ...")) ;; Have gone back and forth on this but db starvation is an issue. ;; wait one second before looking again to run jobs. (thread-sleep! 0.25) ;; could have done hed tal here but doing car/cdr of newtal to rotate tests (list (car newtal)(cdr newtal) reg reruns)) ;; This is the final stage, everything is in place so launch the test ;; ((and have-resources (or (null? prereqs-not-met) (and (member 'toplevel testmode) ;; 'toplevel) (null? non-completed) (not (member 'exclusive testmode))))) ;; (hash-table-delete! max-tries-hash (db:test-make-full-name test-name item-path)) ;; we are going to reset all the counters for test retries by setting a new hash table ;; this means they will increment only when nothing can be run (set! max-tries-hash (make-hash-table)) (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat) (runs:incremental-print-results run-id) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! global-delta) (if (or (not (null? tal))(not (null? reg))) (runs:loop-values tal reg reglen regfull reruns) #f))
︙			︙
1244 1245 1246 1247 1248 1249 1250 ~~1251~~ 1252 1253 1254 1255 ~~1256 1257 1258 1259~~ 1260 1261 1262 1263 1264 1265 1266	prereqs-not-met) ", "))) (if (or (null? fails) (member 'toplevel testmode)) (begin ;; couldn't run, take a breather (if (runs:lownoise "Waiting for more work to do..." 60) (debug:print-info 0 default-log-port "Waiting for more work to do...")) ~~(thread-sleep! 1)~~ (list (car newtal)(cdr newtal) reg reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (begin (let ((my-test-id (rmt:get-test-id run-id test-name item-path))) ~~(mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2"))~~ (if (or (not (null? reg))(not (null? tal))) (if (vector? hed) (begin (debug:print 1 default-log-port "WARNING: Dropping test " test-name "/" item-path " from the launch list as it has prerequistes that are FAIL") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))	> > \| \| < < <	1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283	prereqs-not-met) ", "))) (if (or (null? fails) (member 'toplevel testmode)) (begin ;; couldn't run, take a breather (if (runs:lownoise "Waiting for more work to do..." 60) (debug:print-info 0 default-log-port "Waiting for more work to do...")) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) (list (car newtal)(cdr newtal) reg reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (begin (let ((my-test-id (rmt:get-test-id run-id test-name item-path))) (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2")) (if (or (not (null? reg))(not (null? tal))) (if (vector? hed) (begin (debug:print 1 default-log-port "WARNING: Dropping test " test-name "/" item-path " from the launch list as it has prerequistes that are FAIL") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))
︙			︙
1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445	)) ;; every time though the loop increment the test/itempatt val. ;; when the min is > max-allowed and none running then force exit ;; (define max-tries-hash (make-hash-table)) ;;====================================================================== ;; runs:run-tests-queue is called by runs:run-tests ;;====================================================================== ;; ;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... > (define (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests reglen-in all-tests-registry) ;; At this point the list of parent tests is expanded	> > >	1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465	)) ;; every time though the loop increment the test/itempatt val. ;; when the min is > max-allowed and none running then force exit ;; (define max-tries-hash (make-hash-table)) (define (runs:pretty-long-list lst) (if (> (length lst) 8)(append (take lst 3)(list "...")) lst)) ;;====================================================================== ;; runs:run-tests-queue is called by runs:run-tests ;;====================================================================== ;; ;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... > (define (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests reglen-in all-tests-registry) ;; At this point the list of parent tests is expanded
︙			︙
1612 1613 1614 1615 1616 1617 1618 ~~1619~~ 1620 1621 1622 1623 1624 1625 1626	reruns)))) ;; (loop (car tal)(cdr tal) reg reruns)))) (runs:incremental-print-results run-id) (debug:print 4 default-log-port "TOP OF LOOP => " "test-name: " test-name "\n hed: " hed ~~"\n tal: " ~~tal~~~~ "\n reg: " reg "\n test-record " test-record "\n itemdat: " itemdat "\n items: " items "\n item-path: " item-path "\n waitons: " waitons "\n num-retries: " num-retries	\|	1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646	reruns)))) ;; (loop (car tal)(cdr tal) reg reruns)))) (runs:incremental-print-results run-id) (debug:print 4 default-log-port "TOP OF LOOP => " "test-name: " test-name "\n hed: " hed "\n tal: " (runs:pretty-long-list tal) "\n reg: " reg "\n test-record " test-record "\n itemdat: " itemdat "\n items: " items "\n item-path: " item-path "\n waitons: " waitons "\n num-retries: " num-retries
︙			︙
1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676	((not items) (debug:print-info 4 default-log-port "cond branch - " "rtq-2") (debug:print-info 4 default-log-port "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running (let ((loop-list (runs:process-expanded-tests runsdat testdat))) (if loop-list (apply loop loop-list)))) ;; items processed into a list but not came in as a list been processed ;; ((and (list? items) ;; thus we know our items are already calculated	> > > > > >	1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702	((not items) (debug:print-info 4 default-log-port "cond branch - " "rtq-2") (debug:print-info 4 default-log-port "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; This would be a good place to block on homehost load (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running (let ((loop-list (runs:process-expanded-tests runsdat testdat))) (if loop-list (apply loop loop-list)))) ;; items processed into a list but not came in as a list been processed ;; ((and (list? items) ;; thus we know our items are already calculated
︙			︙
1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 ~~1798~~ 1799 1800 1801 1802 1803 1804 1805	(debug:print-info 4 default-log-port "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns)) ))) ;; end loop on sorted test names ;; this is the point where everything is launched and now you can mark the run in metadata table as all launched (rmt:set-var (conc "lunch-complete-" run-id) "yes") ;; now if -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes (prev-num-running 0)) ;; (debug:print-info 13 default-log-port "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup configdat "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 default-log-port "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no (debug:print-info 0 default-log-port "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 default-log-port "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ~~(thread-sleep! 5)~~ (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. ;; (debug:print-info 0 default-log-port "Calling Post Hook") ;; (runs:run-post-hook run-id)	> > > \|	1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834	(debug:print-info 4 default-log-port "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns)) ))) ;; end loop on sorted test names ;; this is the point where everything is launched and now you can mark the run in metadata table as all launched (rmt:set-var (conc "lunch-complete-" run-id) "yes") ;; now if -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes (prev-num-running 0)) ;; (debug:print-info 13 default-log-port "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup configdat "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 default-log-port "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no (debug:print-info 0 default-log-port "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 default-log-port "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. ;; (debug:print-info 0 default-log-port "Calling Post Hook") ;; (runs:run-post-hook run-id)
︙			︙
1850 1851 1852 1853 1854 1855 1856 ~~1857~~ 1858 1859 1860 1861 1862 1863 1864	(if (not (vector? t)) (conc t) (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t)))) lst)) ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step ;; ~~(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry)~~ ;; All these vars might be referenced by the testconfig file reader ;; ;; NEED to reprocess testconfig here, ensuring that item variables are available. ;; This is for Tal's issue with item-specific env vars not being set for use in skip. ;; HSD https://hsdes.intel.com/appstore/icf/index.html#/article?articleId=1408763273 ;; (let* ((test-name (tests:testqueue-get-testname test-record))	\|	1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893	(if (not (vector? t)) (conc t) (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t)))) lst)) ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step ;; (define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry runsdat) ;; All these vars might be referenced by the testconfig file reader ;; ;; NEED to reprocess testconfig here, ensuring that item variables are available. ;; This is for Tal's issue with item-specific env vars not being set for use in skip. ;; HSD https://hsdes.intel.com/appstore/icf/index.html#/article?articleId=1408763273 ;; (let* ((test-name (tests:testqueue-get-testname test-record))
︙			︙
1928 1929 1930 1931 1932 1933 1934 ~~1935~~ ~~1936~~ 1937 1938 1939 1940 1941 1942 1943	(debug:print 2 default-log-port "WARN: Test not pre-created? test-name=" test-name ", item-path=" item-path ", run-id=" run-id) (rmt:register-test run-id test-name item-path) (set! test-id (rmt:get-test-id run-id test-name item-path)))) (debug:print-info 4 default-log-port "test-id=" test-id ", run-id=" run-id ", test-name=" test-name ", item-path=\"" item-path "\"") (set! testdat (rmt:get-test-info-by-id run-id test-id)) (if (not testdat) (begin ~~(debug:print-info 0 default-log-port "WARNING: server is overloaded, trying again in ~~one~~ second")~~ ~~(thread-sleep! 1)~~ (loop))))) (if (not testdat) ;; should NOT happen (debug:print-error 0 default-log-port "failed to get test record for test-id " test-id)) (set! test-id (db:test-get-id testdat)) (if (common:file-exists? test-path) (change-directory test-path) (begin	\| > \|	1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973	(debug:print 2 default-log-port "WARN: Test not pre-created? test-name=" test-name ", item-path=" item-path ", run-id=" run-id) (rmt:register-test run-id test-name item-path) (set! test-id (rmt:get-test-id run-id test-name item-path)))) (debug:print-info 4 default-log-port "test-id=" test-id ", run-id=" run-id ", test-name=" test-name ", item-path=\"" item-path "\"") (set! testdat (rmt:get-test-info-by-id run-id test-id)) (if (not testdat) (begin (debug:print-info 0 default-log-port "WARNING: server is overloaded, trying again in two seconds") ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 2) (loop))))) (if (not testdat) ;; should NOT happen (debug:print-error 0 default-log-port "failed to get test record for test-id " test-id)) (set! test-id (db:test-get-id testdat)) (if (common:file-exists? test-path) (change-directory test-path) (begin
︙			︙
2041 2042 2043 2044 2045 2046 2047 ~~2048~~ 2049 2050 2051 2052 2053 2054 2055	(begin (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test) (debug:print-info 1 default-log-port "SKIPPING Test " full-test-name " due to " skip-test)) ;; ;; Here the test is handed off to launch.scm for launch-test to complete the launch process ;; (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags)) ~~(begin~~ (print "ERROR: Failed to launch the test. Exiting as soon as possible") (set! globalexitstatus 1) ;; (process-signal (current-process-id) signal/kill)))))))) ((KILLED) (debug:print 1 default-log-port "NOTE: " full-test-name " is already running or was explictly killed, use -force to launch it.") (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)) ;; KILLED)) ((LAUNCHED REMOTEHOSTSTART RUNNING)	\|	2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085	(begin (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test) (debug:print-info 1 default-log-port "SKIPPING Test " full-test-name " due to " skip-test)) ;; ;; Here the test is handed off to launch.scm for launch-test to complete the launch process ;; (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags)) (begin (print "ERROR: Failed to launch the test. Exiting as soon as possible") (set! globalexitstatus 1) ;; (process-signal (current-process-id) signal/kill)))))))) ((KILLED) (debug:print 1 default-log-port "NOTE: " full-test-name " is already running or was explictly killed, use -force to launch it.") (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)) ;; KILLED)) ((LAUNCHED REMOTEHOSTSTART RUNNING)
︙			︙
2117 2118 2119 2120 2121 2122 2123 ~~2124~~ 2125 2126 2127 2128 2129 2130 2131	;; (define (runs:get-hash-by-target target-patts runpatt) (let* ((targets (string-split target-patts ",")) (keys (rmt:get-keys)) (res-ht (make-hash-table))) ;; target -> ( runrecord1 runrecord2 ... ) (for-each (lambda (target-patt) ~~(let ((runs (rmt:simple-get-runs runpatt #f #f target-patt)))~~ (for-each (lambda (run) (let ((target (simple-run-target run))) (hash-table-set! res-ht target (cons run (hash-table-ref/default res-ht target '()))))) runs))) targets) res-ht))	\|	2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161	;; (define (runs:get-hash-by-target target-patts runpatt) (let* ((targets (string-split target-patts ",")) (keys (rmt:get-keys)) (res-ht (make-hash-table))) ;; target -> ( runrecord1 runrecord2 ... ) (for-each (lambda (target-patt) (let ((runs (rmt:simple-get-runs runpatt #f #f target-patt #f))) (for-each (lambda (run) (let ((target (simple-run-target run))) (hash-table-set! res-ht target (cons run (hash-table-ref/default res-ht target '()))))) runs))) targets) res-ht))