Overview
Comment: | Added adaptive delay to prevent hammering the db when tests are running but none are ready for launch |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
fdbfca436760012a652611eac324264b |
User & Date: | matt on 2013-01-30 22:01:38 |
Other Links: | manifest | tags |
Context
2013-01-30
| ||
22:04 | Added mockup for cached writes check-in: 4309310007 user: mrwellan tags: trunk | |
22:01 | Added adaptive delay to prevent hammering the db when tests are running but none are ready for launch check-in: fdbfca4367 user: matt tags: trunk | |
14:53 | Added help for -list-targets, -list-disks and -list-db-targets check-in: 9fb0a7f622 user: mrwellan tags: trunk | |
Changes
Modified runs.scm from [fcf8110ccc] to [1837f1ac5c].
︙ | ︙ | |||
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | (define (set-item-env-vars itemdat) (for-each (lambda (item) (debug:print 2 "setenv " (car item) " " (cadr item)) (setenv (car item) (cadr item))) itemdat)) (define *last-num-running-tests* 0) (define (runs:can-run-more-tests test-record) (let* ((tconfig (tests:testqueue-get-testconfig test-record)) (jobgroup (config-lookup tconfig "requirements" "jobgroup")) (num-running (cdb:remote-run db:get-count-tests-running #f)) (num-running-in-jobgroup (cdb:remote-run db:get-count-tests-running-in-jobgroup #f jobgroup)) (max-concurrent-jobs (let ((mcj (config-lookup *configdat* "setup" "max_concurrent_jobs"))) (if (and mcj (string->number mcj)) (string->number mcj) 1))) (job-group-limit (config-lookup *configdat* "jobgroups" jobgroup))) (if (not (eq? *last-num-running-tests* num-running)) (begin (debug:print 2 "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running) (set! *last-num-running-tests* num-running))) (if (not (eq? 0 *globalexitstatus*)) (list #f num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit) (let ((can-not-run-more (cond | > > > > > > > > > > > > > > | 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 | (define (set-item-env-vars itemdat) (for-each (lambda (item) (debug:print 2 "setenv " (car item) " " (cadr item)) (setenv (car item) (cadr item))) itemdat)) (define *last-num-running-tests* 0) (define *runs:can-run-more-tests-delay* 0) (define (runs:shrink-can-run-more-tests-delay) (set! *runs:can-run-more-tests-delay* (/ *runs:can-run-more-tests-delay* 2))) (define (runs:can-run-more-tests test-record) (thread-sleep! *runs:can-run-more-tests-delay*) (let* ((tconfig (tests:testqueue-get-testconfig test-record)) (jobgroup (config-lookup tconfig "requirements" "jobgroup")) ;; Heuristic fix. These are getting called too rapidly when jobs are running or stuck ;; so we are going to increment a global delay by 0.1 seconds up to 10 seconds ;; every time runs:can-run-more-tests is called. ;; when a test is launched or other activity occurs divide the delay by 2 (num-running (cdb:remote-run db:get-count-tests-running #f)) (num-running-in-jobgroup (cdb:remote-run db:get-count-tests-running-in-jobgroup #f jobgroup)) (max-concurrent-jobs (let ((mcj (config-lookup *configdat* "setup" "max_concurrent_jobs"))) (if (and mcj (string->number mcj)) (string->number mcj) 1))) (job-group-limit (config-lookup *configdat* "jobgroups" jobgroup))) (if (and (> (+ num-running num-running-in-jobgroup) 0) (< *runs:can-run-more-tests-delay* 10)) (begin (set! *runs:can-run-more-tests-delay* (+ *runs:can-run-more-tests-delay* 0.1)) (debug:print-info 14 "can-run-more-tests-delay: " *runs:can-run-more-tests-delay*))) (if (not (eq? *last-num-running-tests* num-running)) (begin (debug:print 2 "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running) (set! *last-num-running-tests* num-running))) (if (not (eq? 0 *globalexitstatus*)) (list #f num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit) (let ((can-not-run-more (cond |
︙ | ︙ | |||
414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 | ( ;; (and (not (hash-table-ref/default test-registery (runs:make-full-test-name test-name item-path) #f)) ;; (and max-concurrent-jobs (> (- max-concurrent-jobs num-running) 5))) (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" ) (open-run-close db:tests-register-test #f run-id test-name item-path) (hash-table-set! test-registery (runs:make-full-test-name test-name item-path) #t) (thread-sleep! *global-delta*) (loop (car newtal)(cdr newtal) reruns)) ((not have-resources) ;; simply try again after waiting a second (debug:print-info 1 "no resources to run new tests, waiting ...") (thread-sleep! (+ 2 *global-delta*)) ;; could have done hed tal here but doing car/cdr of newtal to rotate tests (loop (car newtal)(cdr newtal) reruns)) ((and have-resources (or (null? prereqs-not-met) (and (eq? testmode 'toplevel) (null? non-completed)))) (run:test run-id runname keyvallst test-record flags #f) (thread-sleep! *global-delta*) (if (not (null? tal)) (loop (car tal)(cdr tal) reruns))) (else ;; must be we have unmet prerequisites (debug:print 4 "FAILS: " fails) ;; If one or more of the prereqs-not-met are FAIL then we can issue ;; a message and drop hed from the items to be processed. (if (null? fails) (begin ;; couldn't run, take a breather (debug:print-info 4 "Shouldn't really get here, race condition? Unable to launch more tests at this moment, killing time ...") (thread-sleep! (+ 0.01 *global-delta*)) ;; long sleep here - no resources, may as well be patient ;; we made new tal by sticking hed at the back of the list (loop (car newtal)(cdr newtal) reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (if (not (null? tal)) (if (vector? hed) (begin (debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed) " from the launch list as it has prerequistes that are FAIL") (thread-sleep! *global-delta*) (loop (car tal)(cdr tal) (cons hed reruns))) (begin (debug:print 1 "WARN: Test not processed correctly. Could be a race condition in your test implementation? " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") (thread-sleep! (+ 0.01 *global-delta*)) (loop hed tal reruns))))))))) ;; END OF INNER COND ;; case where an items came in as a list been processed ((and (list? items) ;; thus we know our items are already calculated (not itemdat)) ;; and not yet expanded into the list of things to be done (if (and (debug:debug-mode 1) ;; (>= *verbosity* 1) | > > > > | 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 | ( ;; (and (not (hash-table-ref/default test-registery (runs:make-full-test-name test-name item-path) #f)) ;; (and max-concurrent-jobs (> (- max-concurrent-jobs num-running) 5))) (debug:print-info 4 "Pre-registering test " test-name "/" item-path " to create placeholder" ) (open-run-close db:tests-register-test #f run-id test-name item-path) (hash-table-set! test-registery (runs:make-full-test-name test-name item-path) #t) (thread-sleep! *global-delta*) (runs:shrink-can-run-more-tests-delay) (loop (car newtal)(cdr newtal) reruns)) ((not have-resources) ;; simply try again after waiting a second (debug:print-info 1 "no resources to run new tests, waiting ...") (thread-sleep! (+ 2 *global-delta*)) ;; could have done hed tal here but doing car/cdr of newtal to rotate tests (loop (car newtal)(cdr newtal) reruns)) ((and have-resources (or (null? prereqs-not-met) (and (eq? testmode 'toplevel) (null? non-completed)))) (run:test run-id runname keyvallst test-record flags #f) (runs:shrink-can-run-more-tests-delay) (thread-sleep! *global-delta*) (if (not (null? tal)) (loop (car tal)(cdr tal) reruns))) (else ;; must be we have unmet prerequisites (debug:print 4 "FAILS: " fails) ;; If one or more of the prereqs-not-met are FAIL then we can issue ;; a message and drop hed from the items to be processed. (if (null? fails) (begin ;; couldn't run, take a breather (debug:print-info 4 "Shouldn't really get here, race condition? Unable to launch more tests at this moment, killing time ...") (thread-sleep! (+ 0.01 *global-delta*)) ;; long sleep here - no resources, may as well be patient ;; we made new tal by sticking hed at the back of the list (loop (car newtal)(cdr newtal) reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (if (not (null? tal)) (if (vector? hed) (begin (debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed) " from the launch list as it has prerequistes that are FAIL") (runs:shrink-can-run-more-tests-delay) (thread-sleep! *global-delta*) (loop (car tal)(cdr tal) (cons hed reruns))) (begin (debug:print 1 "WARN: Test not processed correctly. Could be a race condition in your test implementation? " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") (runs:shrink-can-run-more-tests-delay) (thread-sleep! (+ 0.01 *global-delta*)) (loop hed tal reruns))))))))) ;; END OF INNER COND ;; case where an items came in as a list been processed ((and (list? items) ;; thus we know our items are already calculated (not itemdat)) ;; and not yet expanded into the list of things to be done (if (and (debug:debug-mode 1) ;; (>= *verbosity* 1) |
︙ | ︙ |