Overview
Comment: | Queue ideosyncracies seem fixed. Needs more testing... |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.55 |
Files: | files | file ages | folders |
SHA1: |
79361f52227af77ab374fe3141add941 |
User & Date: | matt on 2013-09-10 00:27:10 |
Other Links: | branch diff | manifest | tags |
Context
2013-09-10
| ||
12:04 | Fixed couple remaining issues with queue handling check-in: a2c11c53bc user: mrwellan tags: v1.55 | |
00:27 | Queue ideosyncracies seem fixed. Needs more testing... check-in: 79361f5222 user: matt tags: v1.55 | |
2013-09-09
| ||
16:41 | partially borked change to better deal with run queue idiosyncracies check-in: fa1ff570f2 user: mrwellan tags: v1.55 | |
Changes
Modified runs.scm from [de10964a2e] to [c1ce52ca59].
︙ | ︙ | |||
142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 | ;; (define *seen-cant-run-tests* (make-hash-table)) ;; use to track tests that we suspect cannot be run (define (runs:inc-cant-run-tests testname) (hash-table-set! *seen-cant-run-tests* testname (+ (hash-table-ref/default *seen-cant-run-tests* testname 0) 1))) (define (runs:can-keep-running? testname n) (< (hash-table-ref/default *seen-cant-run-tests* testname 0) n)) (define (runs:can-run-more-tests jobgroup max-concurrent-jobs) (thread-sleep! (cond ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while (else 0))) (let* ((num-running (cdb:remote-run db:get-count-tests-running #f)) (num-running-in-jobgroup (cdb:remote-run db:get-count-tests-running-in-jobgroup #f jobgroup)) (job-group-limit (config-lookup *configdat* "jobgroups" jobgroup))) (if (> (+ num-running num-running-in-jobgroup) 0) (set! *runs:can-run-more-tests-count* (+ *runs:can-run-more-tests-count* 1))) (if (not (eq? *last-num-running-tests* num-running)) (begin (debug:print 2 "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running) (set! *last-num-running-tests* num-running))) (if (not (eq? 0 *globalexitstatus*)) (list #f num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit) (let ((can-not-run-more (cond ;; if max-concurrent-jobs is set and the number running is greater ;; than it than cannot run more jobs ((and max-concurrent-jobs (>= num-running max-concurrent-jobs)) | > > > > > > > > > > > > | | | 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | ;; (define *seen-cant-run-tests* (make-hash-table)) ;; use to track tests that we suspect cannot be run (define (runs:inc-cant-run-tests testname) (hash-table-set! *seen-cant-run-tests* testname (+ (hash-table-ref/default *seen-cant-run-tests* testname 0) 1))) (define (runs:can-keep-running? testname n) (< (hash-table-ref/default *seen-cant-run-tests* testname 0) n)) (define *runs:denoise* (make-hash-table)) ;; key => last-time-ran (define (runs:lownoise key waitval) (let ((lasttime (hash-table-ref/default *runs:denoise* key 0)) (currtime (current-seconds))) (if (> (- currtime lasttime) waitval) (begin (hash-table-set! *runs:denoise* key currtime) #t) #f))) (define (runs:can-run-more-tests jobgroup max-concurrent-jobs) (thread-sleep! (cond ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while (else 0))) (let* ((num-running (cdb:remote-run db:get-count-tests-running #f)) (num-running-in-jobgroup (cdb:remote-run db:get-count-tests-running-in-jobgroup #f jobgroup)) (job-group-limit (config-lookup *configdat* "jobgroups" jobgroup))) (if (> (+ num-running num-running-in-jobgroup) 0) (set! *runs:can-run-more-tests-count* (+ *runs:can-run-more-tests-count* 1))) (if (not (eq? *last-num-running-tests* num-running)) (begin (debug:print 2 "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running) (set! *last-num-running-tests* num-running))) (if (not (eq? 0 *globalexitstatus*)) (list #f num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit) (let ((can-not-run-more (cond ;; if max-concurrent-jobs is set and the number running is greater ;; than it than cannot run more jobs ((and max-concurrent-jobs (>= num-running max-concurrent-jobs)) (if (runs:lownoise "mcj msg" 60) (debug:print 0 "WARNING: Max running jobs exceeded, current number running: " num-running ", max_concurrent_jobs: " max-concurrent-jobs)) #t) ;; if job-group-limit is set and number of jobs in the group is greater ;; than the limit then cannot run more jobs of this kind ((and job-group-limit (>= num-running-in-jobgroup job-group-limit)) (debug:print 1 "WARNING: number of jobs " num-running-in-jobgroup " in " jobgroup " exceeded, will not run " (tests:testqueue-get-testname test-record)) |
︙ | ︙ | |||
351 352 353 354 355 356 357 | (define (runs:queue-next-reg tal reg n regfull) (if regfull (cdr reg) (if (null? tal) ;; if tal is null and reg not full then '() as reg contents moved to tal '() reg))) | | > > > > > > > > > > < < < | < < < < | < < | | | | | | | | | | | | | > | | | | | > > > > > > > > > > > | | 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 | (define (runs:queue-next-reg tal reg n regfull) (if regfull (cdr reg) (if (null? tal) ;; if tal is null and reg not full then '() as reg contents moved to tal '() reg))) (define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry) (let* ((loop-list (list hed tal reg reruns)) (prereqs-not-met (mt:get-prereqs-not-met run-id waitons item-path mode: testmode)) (fails (runs:calc-fails prereqs-not-met)) (non-completed (runs:calc-not-completed prereqs-not-met))) (debug:print-info 4 "START OF INNER COND #2 " "\n can-run-more: " can-run-more "\n testname: " hed "\n prereqs-not-met: " (runs:pretty-string prereqs-not-met) "\n non-completed: " (runs:pretty-string non-completed) "\n fails: " (runs:pretty-string fails) "\n testmode: " testmode "\n (eq? testmode 'toplevel): " (eq? testmode 'toplevel) "\n (null? non-completed): " (null? non-completed) "\n reruns: " reruns "\n items: " items "\n can-run-more: " can-run-more) (cond ;; all prereqs met, fire off the test ;; or, if it is a 'toplevel test and all prereqs not met are COMPLETED then launch ((member (hash-table-ref/default test-registry (runs:make-full-test-name hed item-path) 'n/a) '(COMPLETED INCOMPLETE)) ;; try to catch repeat processing of COMPLETED tests here (debug:print-info 1 "Test " hed " is " (hash-table-ref test-registry (runs:make-full-test-name hed item-path)) ". Removing it from the queue") (list (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) reruns)) ;; ((or (null? prereqs-not-met) (and (eq? testmode 'toplevel) (null? non-completed))) (debug:print-info 4 "runs:expand-items: (or (null? prereqs-not-met) (and (eq? testmode 'toplevel)(null? non-completed)))") (let ((test-name (tests:testqueue-get-testname test-record))) (setenv "MT_TEST_NAME" test-name) ;; (setenv "MT_RUNNAME" runname) (set-megatest-env-vars run-id inrunname: runname) ;; these may be needed by the launching process (let ((items-list (items:get-items-from-config tconfig))) (if (list? items-list) (begin (tests:testqueue-set-items! test-record items-list) (list hed tal reg reruns)) (begin (debug:print 0 "ERROR: The proc from reading the setup did not yield a list - please report this") (exit 1)))))) ((and (null? fails) (not (null? non-completed))) ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal ;; but also lets check that the prerequisites are all in the newtal or reruns lists (let* ((allinqueue (map (lambda (x)(if (string? x) x (db:test-get-testname x))) (append newtal reruns))) (prereqstrs (map (lambda (x)(if (string? x) x (db:test-get-testname x))) prereqs-not-met)) (notinqueue (filter (lambda (x) (not (member x allinqueue))) prereqstrs))) (if (null? notinqueue) (if (runs:can-keep-running? hed 5) ;; try five times (begin (runs:inc-cant-run-tests hed) (list (car newtal)(append (cdr newtal) reg) '() reruns)) (begin (if (runs:lownoise (conc "no fails prereq, null notinqueue " hed) 30) (debug:print 1 "WARNING: test " hed " has no failed prerequisites but does have prerequistes that are NOT in the queue: " (string-intersperse notinqueue ", "))) (list (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) reruns))) ;; have prereqs in queue, keep going. (begin (if (runs:lownoise (conc "no fails prereq " hed) 30) (debug:print-info 1 "no fails in prerequisites for " hed ", waiting on tests; " (string-intersperse (map (lambda (x) (if (string? x) x (runs:make-full-test-name (db:test-get-testname x) (db:test-get-item-path x)))) non-completed) ", ") ". Delaying launch of " hed ".")) (list (car newtal)(append (cdr newtal) reg) '() reruns))))) ;; an issue with prereqs not yet met? ((and (null? fails) (null? non-completed)) (if (runs:can-keep-running? hed 5) (begin (runs:inc-cant-run-tests hed) (debug:print-info 1 "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) |
︙ | ︙ | |||
451 452 453 454 455 456 457 | ", removing it from to-do list") (if (or (not (null? reg))(not (null? tal))) (list (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) (cons hed reruns)) #f)) ;; #f flags do not loop | | > > > | > > > | > > > > > > > > > > > | 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 | ", removing it from to-do list") (if (or (not (null? reg))(not (null? tal))) (list (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) (cons hed reruns)) #f)) ;; #f flags do not loop ((and (not (null? fails))(eq? testmode 'toplevel)) (if (or (not (null? reg))(not (null? tal))) (list (car newtal)(append (cdr newtal) reg) '() reruns) #f)) (else (debug:print 1 "WARNING: FAILS or incomplete tests are preventing completion of this run. Dropping test " hed " from the run queue") (list (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) reruns))))) ;; (list (car newtal)(cdr newtal) reg reruns))))) (define (runs:mixed-list-testname-and-testrec->list-of-strings inlst) (map (lambda (t) (cond ((vector? t) (conc (db:test-get-state t) "/" (db:test-get-status t))) ((string? t) t) (else (conc t)))) inlst)) (define (runs:process-expanded-tests hed tal reg reruns reglen regfull test-record runname test-name item-path jobgroup max-concurrent-jobs run-id waitons item-path testmode test-patts required-tests test-registry registry-mutex flags keyvals run-info newtal all-tests-registry) (let* ((run-limits-info (runs:can-run-more-tests jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running (have-resources (car run-limits-info)) (num-running (list-ref run-limits-info 1)) (num-running-in-jobgroup (list-ref run-limits-info 2)) (max-concurrent-jobs (list-ref run-limits-info 3)) |
︙ | ︙ | |||
476 477 478 479 480 481 482 | (map (lambda (t) (if (vector? t) (conc (db:test-get-state t) "/" (db:test-get-status t)) (conc " WARNING: t is not a vector=" t ))) prereqs-not-met) ", ") ") fails: " fails) (if (not (null? prereqs-not-met)) | | | 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 | (map (lambda (t) (if (vector? t) (conc (db:test-get-state t) "/" (db:test-get-status t)) (conc " WARNING: t is not a vector=" t ))) prereqs-not-met) ", ") ") fails: " fails) (if (not (null? prereqs-not-met)) (debug:print-info 1 "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) ;; Don't know at this time if the test have been launched at some time in the past ;; i.e. is this a re-launch? (debug:print-info 4 "run-limits-info = " run-limits-info) (cond |
︙ | ︙ | |||
546 547 548 549 550 551 552 | ", ")) (thread-sleep! 0.1) (list hed tal reg reruns)) ;; If no resources are available just kill time and loop again ;; ((not have-resources) ;; simply try again after waiting a second | > | | 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 | ", ")) (thread-sleep! 0.1) (list hed tal reg reruns)) ;; If no resources are available just kill time and loop again ;; ((not have-resources) ;; simply try again after waiting a second (if (runs:lownoise "no resources" 60) (debug:print-info 1 "no resources to run new tests, waiting ...")) ;; Have gone back and forth on this but db starvation is an issue. ;; wait one second before looking again to run jobs. (thread-sleep! 1) ;; could have done hed tal here but doing car/cdr of newtal to rotate tests (list (car newtal)(cdr newtal) reg reruns)) ;; This is the final stage, everything is in place so launch the test |
︙ | ︙ | |||
755 756 757 758 759 760 761 | ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ;; EXPAND ITEMS ((or (procedure? items)(eq? items 'have-procedure)) (let ((can-run-more (runs:can-run-more-tests jobgroup max-concurrent-jobs))) (if (and (list? can-run-more) (car can-run-more)) | | | 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 | ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ;; EXPAND ITEMS ((or (procedure? items)(eq? items 'have-procedure)) (let ((can-run-more (runs:can-run-more-tests jobgroup max-concurrent-jobs))) (if (and (list? can-run-more) (car can-run-more)) (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry))) (if loop-list (apply loop loop-list))) ;; if can't run more just loop with next possible test (loop (car newtal)(cdr newtal) reg reruns)))) ;; this case should not happen, added to help catch any bugs ((and (list? items) itemdat) |
︙ | ︙ | |||
968 969 970 971 972 973 974 | (if (> (- (current-seconds)(+ (db:test-get-event_time testdat) (db:test-get-run_duration testdat))) 600) ;; i.e. no update for more than 600 seconds (begin (debug:print 0 "WARNING: Test " test-name " appears to be dead. Forcing it to state INCOMPLETE and status STUCK/DEAD") (tests:test-set-status! test-id "INCOMPLETE" "STUCK/DEAD" "Test is stuck or dead" #f)) (debug:print 2 "NOTE: " test-name " is already running"))) | > | > | 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 | (if (> (- (current-seconds)(+ (db:test-get-event_time testdat) (db:test-get-run_duration testdat))) 600) ;; i.e. no update for more than 600 seconds (begin (debug:print 0 "WARNING: Test " test-name " appears to be dead. Forcing it to state INCOMPLETE and status STUCK/DEAD") (tests:test-set-status! test-id "INCOMPLETE" "STUCK/DEAD" "Test is stuck or dead" #f)) (debug:print 2 "NOTE: " test-name " is already running"))) (else (debug:print 0 "ERROR: Failed to launch test " full-test-name ". Unrecognised state " (test:get-state testdat)) (hash-table-set! test-registry (runs:make-full-test-name test-name test-path)(string->symbol (test:get-state testdat)))))))) ;;====================================================================== ;; END OF NEW STUFF ;;====================================================================== (define (get-dir-up-n dir . params) (let ((dparts (string-split dir "/")) |
︙ | ︙ |