Overview
Comment: | Added filter to skip COMPLETED tests on startup. Move code to query db for checking if ok to run more tests to server side |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | dev |
Files: | files | file ages | folders |
SHA1: |
290d5d298fff3ca3a29016a0df86f0bd |
User & Date: | mrwellan on 2013-06-07 22:03:08 |
Other Links: | branch diff | manifest | tags |
Context
2013-06-18
| ||
00:45 | Tweaks for debugging stuck launch queue issues check-in: 9243ce728a user: matt tags: dev | |
2013-06-07
| ||
22:07 | Merged development into v1.5501 as it diverges more than a patch release should diverge. check-in: fd2b134daa user: mrwellan tags: v1.55 | |
22:03 | Added filter to skip COMPLETED tests on startup. Move code to query db for checking if ok to run more tests to server side check-in: 290d5d298f user: mrwellan tags: dev | |
13:55 | This version successfully ran stdcell tests (200 parallel jobs, 6000 total jobs) but it has problems with queue management check-in: a5d4098b27 user: mrwellan tags: dev | |
Changes
Modified megatest-version.scm from [19b1c1747c] to [252b48ae3a].
1 2 3 4 5 | ;; Always use two digit decimal ;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00.. (declare (unit megatest-version)) | | | 1 2 3 4 5 6 7 | ;; Always use two digit decimal ;; 1.01, 1.02...1.10,1.11 ... 1.99,2.00.. (declare (unit megatest-version)) (define megatest-version 1.5431) |
Modified run-tests-queue-classic.scm from [4c9cbe32ec] to [78c0dba7f7].
︙ | ︙ | |||
20 21 22 23 24 25 26 27 28 29 30 31 32 33 | (tal (cdr sorted-test-names)) (reruns '())) (if (not (null? reruns))(debug:print-info 4 "reruns=" reruns)) ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns) (let* ((test-record (hash-table-ref test-records hed)) (test-name (tests:testqueue-get-testname test-record)) (tconfig (tests:testqueue-get-testconfig test-record)) (testmode (let ((m (config-lookup tconfig "requirements" "mode"))) (if m (string->symbol m) 'normal))) (waitons (tests:testqueue-get-waitons test-record)) (priority (tests:testqueue-get-priority test-record)) (itemdat (tests:testqueue-get-itemdat test-record)) ;; itemdat can be a string, list or #f (items (tests:testqueue-get-items test-record)) (item-path (item-list->path itemdat)) | > | 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | (tal (cdr sorted-test-names)) (reruns '())) (if (not (null? reruns))(debug:print-info 4 "reruns=" reruns)) ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns) (let* ((test-record (hash-table-ref test-records hed)) (test-name (tests:testqueue-get-testname test-record)) (tconfig (tests:testqueue-get-testconfig test-record)) (jobgroup (config-lookup tconfig "requirements" "jobgroup")) (testmode (let ((m (config-lookup tconfig "requirements" "mode"))) (if m (string->symbol m) 'normal))) (waitons (tests:testqueue-get-waitons test-record)) (priority (tests:testqueue-get-priority test-record)) (itemdat (tests:testqueue-get-itemdat test-record)) ;; itemdat can be a string, list or #f (items (tests:testqueue-get-items test-record)) (item-path (item-list->path itemdat)) |
︙ | ︙ | |||
52 53 54 55 56 57 58 | (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons)))) (cond ;; OUTER COND ((not items) ;; when false the test is ok to be handed off to launch (but not before) (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car newtal)(cdr newtal) reruns)) | > | | 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 | (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons)))) (cond ;; OUTER COND ((not items) ;; when false the test is ok to be handed off to launch (but not before) (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car newtal)(cdr newtal) reruns)) (let* ((run-limits-info ;; (cdb:remote-run runs:can-run-more-tests #f jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running (open-run-close runs:can-run-more-tests #f jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running (have-resources (car run-limits-info)) (num-running (list-ref run-limits-info 1)) (num-running-in-jobgroup (list-ref run-limits-info 2)) (max-concurrent-jobs (list-ref run-limits-info 3)) (job-group-limit (list-ref run-limits-info 4)) (prereqs-not-met (db:get-prereqs-not-met run-id waitons item-path mode: testmode)) (fails (runs:calc-fails prereqs-not-met)) |
︙ | ︙ | |||
189 190 191 192 193 194 195 | (debug:print-info 4 "End of items list, looping with next after short delay") ;; (thread-sleep! (+ 0.01 *global-delta*)) (loop (car tal)(cdr tal) reruns)))) ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ((or (procedure? items)(eq? items 'have-procedure)) | | > | 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 | (debug:print-info 4 "End of items list, looping with next after short delay") ;; (thread-sleep! (+ 0.01 *global-delta*)) (loop (car tal)(cdr tal) reruns)))) ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ((or (procedure? items)(eq? items 'have-procedure)) (let ((can-run-more ;; (cdb:remote-run runs:can-run-more-tests #f jobgroup max-concurrent-jobs))) (open-run-close runs:can-run-more-tests #f jobgroup max-concurrent-jobs))) (if (and (list? can-run-more) (car can-run-more)) (let* ((prereqs-not-met (db:get-prereqs-not-met run-id waitons item-path mode: testmode)) (fails (runs:calc-fails prereqs-not-met)) (non-completed (runs:calc-not-completed prereqs-not-met))) (debug:print-info 8 "can-run-more: " can-run-more "\n testname: " hed |
︙ | ︙ |
Modified run-tests-queue-new.scm from [bdb0e5926d] to [b23b3c860e].
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 | ;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... > (define (runs:run-tests-queue-new run-id runname test-records keyvals flags test-patts required-tests reglen) ;; At this point the list of parent tests is expanded ;; NB// Should expand items here and then insert into the run queue. (debug:print 5 "test-records: " test-records ", flags: " (hash-table->alist flags)) (let ((run-info (cdb:remote-run db:get-run-info #f run-id)) (sorted-test-names (tests:sort-by-priority-and-waiton test-records)) (test-registry (make-hash-table)) (registry-mutex (make-mutex)) (num-retries 0) (max-retries (config-lookup *configdat* "setup" "maxretries")) (max-concurrent-jobs (let ((mcj (config-lookup *configdat* "setup" "max_concurrent_jobs"))) (if (and mcj (string->number mcj)) (string->number mcj) 1)))) ;; length of the register queue ahead (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100)) (if (not (null? sorted-test-names)) (let loop ((hed (car sorted-test-names)) (tal (cdr sorted-test-names)) (reg '()) ;; registered, put these at the head of tal (reruns '())) (if (not (null? reruns))(debug:print-info 4 "reruns=" reruns)) ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns) (let* ((test-record (hash-table-ref test-records hed)) (test-name (tests:testqueue-get-testname test-record)) (tconfig (tests:testqueue-get-testconfig test-record)) (testmode (let ((m (config-lookup tconfig "requirements" "mode"))) (if m (string->symbol m) 'normal))) (waitons (tests:testqueue-get-waitons test-record)) (priority (tests:testqueue-get-priority test-record)) (itemdat (tests:testqueue-get-itemdat test-record)) ;; itemdat can be a string, list or #f (items (tests:testqueue-get-items test-record)) (item-path (item-list->path itemdat)) (newtal (append tal (list hed))) (regfull (> (length reg) reglen))) ;; (if (> (length reg) 10) ;; (begin ;; (set! tal (cons hed tal)) ;; (set! hed (car reg)) ;; (set! reg (cdr reg)) ;; (set! newtal tal))) (debug:print 6 | > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 | ;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... > (define (runs:run-tests-queue-new run-id runname test-records keyvals flags test-patts required-tests reglen) ;; At this point the list of parent tests is expanded ;; NB// Should expand items here and then insert into the run queue. (debug:print 5 "test-records: " test-records ", flags: " (hash-table->alist flags)) (let ((run-info (cdb:remote-run db:get-run-info #f run-id)) (tests-info (cdb:remote-run db:get-tests-for-run #f run-id #f '() '())) ;; qryvals: "id,testname,item_path")) (sorted-test-names (tests:sort-by-priority-and-waiton test-records)) (test-registry (make-hash-table)) (registry-mutex (make-mutex)) (num-retries 0) (max-retries (config-lookup *configdat* "setup" "maxretries")) (max-concurrent-jobs (let ((mcj (config-lookup *configdat* "setup" "max_concurrent_jobs"))) (if (and mcj (string->number mcj)) (string->number mcj) 1)))) ;; length of the register queue ahead ;; Initialize the test-registery hash with tests that already have a record (for-each (lambda (trec) (let ((id (db:test-get-id trec)) (tn (db:test-get-testname trec)) (ip (db:test-get-item-path trec)) (st (db:test-get-state trec))) (hash-table-set! test-registry (runs:make-full-test-name tn ip) (string->symbol st)))) tests-info) (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100)) (if (not (null? sorted-test-names)) (let loop ((hed (car sorted-test-names)) (tal (cdr sorted-test-names)) (reg '()) ;; registered, put these at the head of tal (reruns '())) (if (not (null? reruns))(debug:print-info 4 "reruns=" reruns)) ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns) (let* ((test-record (hash-table-ref test-records hed)) (test-name (tests:testqueue-get-testname test-record)) (tconfig (tests:testqueue-get-testconfig test-record)) (jobgroup (config-lookup tconfig "requirements" "jobgroup")) (testmode (let ((m (config-lookup tconfig "requirements" "mode"))) (if m (string->symbol m) 'normal))) (waitons (tests:testqueue-get-waitons test-record)) (priority (tests:testqueue-get-priority test-record)) (itemdat (tests:testqueue-get-itemdat test-record)) ;; itemdat can be a string, list or #f (items (tests:testqueue-get-items test-record)) (item-path (item-list->path itemdat)) (tfullname (runs:make-full-test-name test-name item-path)) (newtal (append tal (list hed))) (regfull (> (length reg) reglen))) ;; Fast skip of tests that are already "COMPLETED" (if (equal? (hash-table-ref/default test-registry tfullname #f) 'COMPLETED) (begin (debug:print-info 0 "Skipping COMPLETED test " tfullname) (if (not (null? tal)) (loop (car tal)(cdr tal) reg reruns)))) ;; (if (> (length reg) 10) ;; (begin ;; (set! tal (cons hed tal)) ;; (set! hed (car reg)) ;; (set! reg (cdr reg)) ;; (set! newtal tal))) (debug:print 6 |
︙ | ︙ | |||
59 60 61 62 63 64 65 | (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons)))) (cond ;; OUTER COND ((not items) ;; when false the test is ok to be handed off to launch (but not before) (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) | > | | 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 | (set! waiton (filter (lambda (x)(not (equal? x hed))) waitons)))) (cond ;; OUTER COND ((not items) ;; when false the test is ok to be handed off to launch (but not before) (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) (let* ((run-limits-info (cdb:remote-run runs:can-run-more-tests #f jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running ;; (open-run-close runs:can-run-more-tests #f jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running (have-resources (car run-limits-info)) (num-running (list-ref run-limits-info 1)) (num-running-in-jobgroup (list-ref run-limits-info 2)) (max-concurrent-jobs (list-ref run-limits-info 3)) (job-group-limit (list-ref run-limits-info 4)) (prereqs-not-met (db:get-prereqs-not-met run-id waitons item-path mode: testmode)) (fails (runs:calc-fails prereqs-not-met)) |
︙ | ︙ | |||
114 115 116 117 118 119 120 | (cdb:tests-register-test *runremote* run-id test-name "")) (cdb:tests-register-test *runremote* run-id test-name item-path) (mutex-lock! registry-mutex) (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done) (mutex-unlock! registry-mutex)) (conc test-name "/" item-path)))) (thread-start! th)) | | | 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 | (cdb:tests-register-test *runremote* run-id test-name "")) (cdb:tests-register-test *runremote* run-id test-name item-path) (mutex-lock! registry-mutex) (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'done) (mutex-unlock! registry-mutex)) (conc test-name "/" item-path)))) (thread-start! th)) (cdb:remote-run runs:shrink-can-run-more-tests-count #f) ;; DELAY TWEAKER (still needed?) (if (and (null? tal)(null? reg)) (loop hed tal reg reruns) (loop (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (let ((newl (append reg (list hed)))) (if regfull (cdr newl) |
︙ | ︙ | |||
147 148 149 150 151 152 153 | (loop (car newtal)(cdr newtal) reg reruns)) ((and have-resources (or (null? prereqs-not-met) (and (eq? testmode 'toplevel) (null? non-completed)))) (run:test run-id run-info keyvals runname test-record flags #f) (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'running) | | | 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | (loop (car newtal)(cdr newtal) reg reruns)) ((and have-resources (or (null? prereqs-not-met) (and (eq? testmode 'toplevel) (null? non-completed)))) (run:test run-id run-info keyvals runname test-record flags #f) (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'running) (cdb:remote-run runs:shrink-can-run-more-tests-count #f) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) (if (not (null? tal)) (loop (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) reruns))) (else ;; must be we have unmet prerequisites |
︙ | ︙ | |||
171 172 173 174 175 176 177 | (loop (car newtal)(cdr newtal) reg reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (if (not (null? tal)) (if (vector? hed) (begin (debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed) " from the launch list as it has prerequistes that are FAIL") | | | | 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 | (loop (car newtal)(cdr newtal) reg reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (if (not (null? tal)) (if (vector? hed) (begin (debug:print 1 "WARN: Dropping test " (db:test-get-testname hed) "/" (db:test-get-item-path hed) " from the launch list as it has prerequistes that are FAIL") (cdb:remote-run runs:shrink-can-run-more-tests-count #f) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) (hash-table-set! test-registry (runs:make-full-test-name test-name item-path) 'removed) (loop (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) (cons hed reruns))) (begin (debug:print 1 "WARN: Test not processed correctly. Could be a race condition in your test implementation? " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") (cdb:remote-run runs:shrink-can-run-more-tests-count #f) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! (+ 0.01 *global-delta*)) (loop hed tal reg reruns))))))))) ;; END OF INNER COND ;; case where an items came in as a list been processed ((and (list? items) ;; thus we know our items are already calculated (not itemdat)) ;; and not yet expanded into the list of things to be done (if (and (debug:debug-mode 1) ;; (>= *verbosity* 1) |
︙ | ︙ | |||
217 218 219 220 221 222 223 | (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) reruns)))) ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ((or (procedure? items)(eq? items 'have-procedure)) | | | 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) reruns)))) ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ((or (procedure? items)(eq? items 'have-procedure)) (let ((can-run-more (cdb:remote-run runs:can-run-more-tests #f jobgroup max-concurrent-jobs))) (if (and (list? can-run-more) (car can-run-more)) (let* ((prereqs-not-met (db:get-prereqs-not-met run-id waitons item-path mode: testmode)) (fails (runs:calc-fails prereqs-not-met)) (non-completed (runs:calc-not-completed prereqs-not-met))) (debug:print-info 8 "can-run-more: " can-run-more "\n testname: " hed |
︙ | ︙ |
Modified runs.scm from [513fa82301] to [d409eb1269].
︙ | ︙ | |||
161 162 163 164 165 166 167 | (define (set-item-env-vars itemdat) (for-each (lambda (item) (debug:print 2 "setenv " (car item) " " (cadr item)) (setenv (car item) (cadr item))) itemdat)) | < < | > > > | | < < | | | 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 | (define (set-item-env-vars itemdat) (for-each (lambda (item) (debug:print 2 "setenv " (car item) " " (cadr item)) (setenv (car item) (cadr item))) itemdat)) ;; Every time can-run-more-tests is called increment the delay ;; ;; NOTE: We run this server-side!! Do not use this global except in the runs:can-run-more-tests routine ;; (define *last-num-running-tests* 0) (define *runs:can-run-more-tests-count* 0) (define (runs:shrink-can-run-more-tests-count db) ;; the db is just so we can use cdb:remote-run (set! *runs:can-run-more-tests-count* 0)) ;; (/ *runs:can-run-more-tests-count* 2))) (define (runs:can-run-more-tests db jobgroup max-concurrent-jobs) (thread-sleep! (cond ((> *runs:can-run-more-tests-count* 20) 2);; obviously haven't had any work to do for a while (else 0))) (let* ((num-running (db:get-count-tests-running db)) (num-running-in-jobgroup (db:get-count-tests-running-in-jobgroup db jobgroup)) (job-group-limit (config-lookup *configdat* "jobgroups" jobgroup))) (if (> (+ num-running num-running-in-jobgroup) 0) (set! *runs:can-run-more-tests-count* (+ *runs:can-run-more-tests-count* 1))) (if (not (eq? *last-num-running-tests* num-running)) (begin (debug:print 2 "max-concurrent-jobs: " max-concurrent-jobs ", num-running: " num-running) (set! *last-num-running-tests* num-running))) |
︙ | ︙ | |||
612 613 614 615 616 617 618 | (real-dir (if (file-exists? run-dir) (resolve-pathname run-dir) #f)) (test-state (db:test-get-state new-test-dat)) (test-fulln (db:test-get-fullname new-test-dat))) (case action ((remove-runs) | | | 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 | (real-dir (if (file-exists? run-dir) (resolve-pathname run-dir) #f)) (test-state (db:test-get-state new-test-dat)) (test-fulln (db:test-get-fullname new-test-dat))) (case action ((remove-runs) (debug:print-info 0 "test: " test-name " itest-state: " test-state) (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ")) (begin (if (not (hash-table-ref/default test-retry-time test-fulln #f)) (hash-table-set! test-retry-time test-fulln (current-seconds))) (if (> (- (current-seconds)(hash-table-ref test-retry-time test-fulln)) allow-run-time) ;; This test is not in a correct state for cleaning up. Let's try some graceful shutdown steps first ;; Set the test to "KILLREQ" and wait five seconds then try again. Repeat up to five times then give |
︙ | ︙ |