Overview
Comment: | made progress - test2 deadlock situation is broken |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65-test2 |
Files: | files | file ages | folders |
SHA1: |
624ddf95e6b54a943a644c0b5e785c99 |
User & Date: | bjbarcla on 2017-11-02 18:14:08 |
Other Links: | branch diff | manifest | tags |
Context
2017-11-02
| ||
18:38 | catch up with v1.65 tip check-in: 9f5f817c0e user: bjbarcla tags: v1.65-test2 | |
18:14 | made progress - test2 deadlock situation is broken check-in: 624ddf95e6 user: bjbarcla tags: v1.65-test2 | |
18:06 | made progress - test2 deadlock situation is broken check-in: 9a750784a3 user: bjbarcla tags: v1.65-test2 | |
Changes
Modified runs.scm from [b72555b9ec] to [0dfe3f4c47].
︙ | ︙ | |||
188 189 190 191 192 193 194 | (define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) | | | 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 | (define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) (thread-sleep! (cond ;; BB: check with Matt. Should this sleep move to cond clauses below where we determine we have too many jobs running rather than each time the and condition above is true (which seems like always)? ((> (runs:dat-can-run-more-tests-count runsdat) 20) (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ...")) (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.1) ;; was 2 );; obviously haven't had any work to do for a while (else 0))) (let* ((num-running (rmt:get-count-tests-running run-id)) |
︙ | ︙ | |||
636 637 638 639 640 641 642 | (list (runs:queue-next-hed tal reg reglen regfull) ;; hed (runs:queue-next-tal tal reg reglen regfull) ;; tal (runs:queue-next-reg tal reg reglen regfull) ;; reg reruns)) ;; reruns (define runs:nothing-left-in-queue-count 0) | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | 636 637 638 639 640 641 642 643 644 645 646 647 648 649 | (list (runs:queue-next-hed tal reg reglen regfull) ;; hed (runs:queue-next-tal tal reg reglen regfull) ;; tal (runs:queue-next-reg tal reg reglen regfull) ;; reg reruns)) ;; reruns (define runs:nothing-left-in-queue-count 0) (define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps) (let* ((loop-list (list hed tal reg reruns)) (prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))) (if (list? res) res (begin (debug:print 0 *default-log-port* |
︙ | ︙ | |||
706 707 708 709 710 711 712 | (cond ;; all prereqs met, fire off the test ;; or, if it is a 'toplevel test and all prereqs not met are COMPLETED then launch ((and (not (member 'toplevel testmode)) (member (hash-table-ref/default test-registry (db:test-make-full-name hed item-path) 'n/a) '(DONOTRUN removed CANNOTRUN))) ;; *common:cant-run-states-sym*) ;; '(COMPLETED KILLED WAIVED UNKNOWN INCOMPLETE)) ;; try to catch repeat processing of COMPLETED tests here | | | | 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 | (cond ;; all prereqs met, fire off the test ;; or, if it is a 'toplevel test and all prereqs not met are COMPLETED then launch ((and (not (member 'toplevel testmode)) (member (hash-table-ref/default test-registry (db:test-make-full-name hed item-path) 'n/a) '(DONOTRUN removed CANNOTRUN))) ;; *common:cant-run-states-sym*) ;; '(COMPLETED KILLED WAIVED UNKNOWN INCOMPLETE)) ;; try to catch repeat processing of COMPLETED tests here (debug:print-info 4 *default-log-port* "cond branch - " "ei-1") (debug:print-info 1 *default-log-port* "Test " hed " set to \"" (hash-table-ref test-registry (db:test-make-full-name hed item-path)) "\". Removing it from the queue") (if (or (not (null? tal)) (not (null? reg))) (runs:loop-values tal reg reglen regfull reruns) (begin (debug:print-info 0 *default-log-port* "Nothing left in the queue!") ;; If get here twice then we know we've tried to expand all items ;; since there must be a logic issue with the handling of loops in the ;; items expand phase we will brute force an exit here. (if (> runs:nothing-left-in-queue-count 2) (begin (debug:print 0 *default-log-port* "WARNING: this condition is triggered when there were no items to expand and nothing to run. Please check your run for completeness") (exit 0)) (set! runs:nothing-left-in-queue-count (+ runs:nothing-left-in-queue-count 1))) #f))) ;; ((or (null? prereqs-not-met) (and (member 'toplevel testmode) (null? non-completed))) (debug:print-info 4 *default-log-port* "cond branch - " "ei-2") (debug:print-info 4 *default-log-port* "runs:expand-items: (or (null? prereqs-not-met) (and (member 'toplevel testmode)(null? non-completed)))") (let ((test-name (tests:testqueue-get-testname test-record))) (setenv "MT_TEST_NAME" test-name) ;; (setenv "MT_RUNNAME" runname) (runs:set-megatest-env-vars run-id inrunname: runname) ;; these may be needed by the launching process (let ((items-list (items:get-items-from-config tconfig))) (if (list? items-list) |
︙ | ︙ | |||
751 752 753 754 755 756 757 | (begin (debug:print-error 0 *default-log-port* "The proc from reading the items table did not yield a list - please report this") (exit 1)))))) ((and (null? fails) (null? prereq-fails) (not (null? non-completed))) | | | 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 | (begin (debug:print-error 0 *default-log-port* "The proc from reading the items table did not yield a list - please report this") (exit 1)))))) ((and (null? fails) (null? prereq-fails) (not (null? non-completed))) (debug:print-info 4 *default-log-port* "cond branch - " "ei-3") (let* ((allinqueue (map (lambda (x)(if (string? x) x (db:test-get-testname x))) (append newtal reruns))) ;; prereqstrs is a list of test names as strings that are prereqs for hed (prereqstrs (delete-duplicates (map (lambda (x)(if (string? x) x (db:test-get-testname x))) prereqs-not-met))) ;; a prereq that is not found in allinqueue will be put in the notinqueue list ;; |
︙ | ︙ | |||
792 793 794 795 796 797 798 | (runs:loop-values trimmed-tal trimmed-reg reglen regfull reruns) )) (list (car newtal)(append (cdr newtal) reg) '() reruns)))) ((and (null? fails) ;; have not-started tests, but unable to run them. everything looks completed with no prospect of unsticking something that is stuck. we should mark hed as moribund and exit or continue if there are more tests to consider (null? prereq-fails) (null? non-completed)) | | | | | | | 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 | (runs:loop-values trimmed-tal trimmed-reg reglen regfull reruns) )) (list (car newtal)(append (cdr newtal) reg) '() reruns)))) ((and (null? fails) ;; have not-started tests, but unable to run them. everything looks completed with no prospect of unsticking something that is stuck. we should mark hed as moribund and exit or continue if there are more tests to consider (null? prereq-fails) (null? non-completed)) (debug:print-info 4 *default-log-port* "cond branch - " "ei-4") (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; ;; getting here likely means the system is way overloaded, kill a full minute before continuing (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while."))) (runs:loop-values tal reg reglen regfull reruns) ))) ((and (or (not (null? fails)) (not (null? prereq-fails))) (member 'normal testmode)) (debug:print-info 4 *default-log-port* "cond branch - " "ei-5") (debug:print-info 1 *default-log-port* "test " hed " (mode=" testmode ") has failed prerequisite(s); " (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ") ", removing it from to-do list") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (if (not (null? prereq-fails)) (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_DISCARDED" "Failed to run due to prior failed prerequisites") (begin (debug:print 4 *default-log-port*"BB> set PREQ_FAIL on "hed) (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites"))))) ;; BB: this works, btu equivalent for itemwait mode does not work. (if (or (not (null? reg))(not (null? tal))) (begin (hash-table-set! test-registry hed 'CANNOTRUN) (runs:loop-values tal reg reglen regfull (cons hed reruns)) ) #f)) ;; #f flags do not loop ((and (not (null? fails))(member 'toplevel testmode)) (debug:print-info 4 *default-log-port* "cond branch - " "ei-6") (if (or (not (null? reg))(not (null? tal))) (list (car newtal)(append (cdr newtal) reg) '() reruns) #f)) ((null? runnables) (debug:print-info 4 *default-log-port* "cond branch - " "ei-7") #f) ;; if we get here and non-completed is null then it is all over. (else (debug:print-info 4 *default-log-port* "cond branch - " "ei-8") (debug:print 0 *default-log-port* "WARNING: FAILS or incomplete tests maybe preventing completion of this run. Watch for issues with test " hed ", continuing for now") (list (car newtal)(cdr newtal) reg reruns))))) (define (runs:mixed-list-testname-and-testrec->list-of-strings inlst) (if (null? inlst) '() (map (lambda (t) |
︙ | ︙ | |||
1222 1223 1224 1225 1226 1227 1228 | (debug:print 5 *default-log-port* "test-records: " test-records ", flags: " (hash-table->alist flags)) ;; Do mark-and-find clean up of db before starting runing of quue ;; ;; (rmt:find-and-mark-incomplete) (let* ((run-info (rmt:get-run-info run-id)) | < | 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 | (debug:print 5 *default-log-port* "test-records: " test-records ", flags: " (hash-table->alist flags)) ;; Do mark-and-find clean up of db before starting runing of quue ;; ;; (rmt:find-and-mark-incomplete) (let* ((run-info (rmt:get-run-info run-id)) (tests-info (mt:get-tests-for-run run-id #f '() '())) ;; qryvals: "id,testname,item_path")) (sorted-test-names (tests:sort-by-priority-and-waiton test-records)) (test-registry (make-hash-table)) (registry-mutex (make-mutex)) (num-retries 0) (max-retries (config-lookup *configdat* "setup" "maxretries")) (max-concurrent-jobs (configf:lookup-number *configdat* "setup" "max_concurrent_jobs" default: 50)) |
︙ | ︙ | |||
1277 1278 1279 1280 1281 1282 1283 | (st (db:test-get-state trec))) (if (not (equal? st "DELETED")) (hash-table-set! test-registry (db:test-make-full-name tn ip) (string->symbol st))))) tests-info) (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100)) (let loop ((hed (car sorted-test-names)) | < | 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 | (st (db:test-get-state trec))) (if (not (equal? st "DELETED")) (hash-table-set! test-registry (db:test-make-full-name tn ip) (string->symbol st))))) tests-info) (set! max-retries (if (and max-retries (string->number max-retries))(string->number max-retries) 100)) (let loop ((hed (car sorted-test-names)) (tal (cdr sorted-test-names)) (reg '()) ;; registered, put these at the head of tal (reruns '())) (runs:incremental-print-results run-id) |
︙ | ︙ | |||
1305 1306 1307 1308 1309 1310 1311 | ;; (if (> (current-seconds)(+ last-time-incomplete 900)) (begin (set! last-time-incomplete (current-seconds)) ;; (rmt:find-and-mark-incomplete-all-runs) )) | < < < < | 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 | ;; (if (> (current-seconds)(+ last-time-incomplete 900)) (begin (set! last-time-incomplete (current-seconds)) ;; (rmt:find-and-mark-incomplete-all-runs) )) ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns) (let* ((test-record (hash-table-ref test-records hed)) (test-name (tests:testqueue-get-testname test-record)) (tconfig (tests:testqueue-get-testconfig test-record)) (jobgroup (config-lookup tconfig "test_meta" "jobgroup")) (testmode (let ((m (config-lookup tconfig "requirements" "mode"))) (if m (map string->symbol (string-split m)) '(normal)))) |
︙ | ︙ | |||
1420 1421 1422 1423 1424 1425 1426 | number? (map (lambda (waiton) (if (and (not (member waiton tal)) ;; this waiton is not in the list to be tried to run (not (member waiton reruns))) 1 #f)) waitons))))) ;; could do this more elegantly with a marker.... | | | | | 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 | number? (map (lambda (waiton) (if (and (not (member waiton tal)) ;; this waiton is not in the list to be tried to run (not (member waiton reruns))) 1 #f)) waitons))))) ;; could do this more elegantly with a marker.... (debug:print-info 4 *default-log-port* "cond branch - " "rtq-1") (debug:print 0 *default-log-port* "WARNING: Marking test " tfullname " as not runnable. It is waiting on tests that cannot be run. Giving up now.") (hash-table-set! test-registry tfullname 'removed)) ;; items is #f then the test is ok to be handed off to launch (but not before) ;; ((not items) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-2") (debug:print-info 4 *default-log-port* "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running (let ((loop-list (runs:process-expanded-tests runsdat testdat))) (if loop-list (apply loop loop-list)))) ;; items processed into a list but not came in as a list been processed ;; ((and (list? items) ;; thus we know our items are already calculated (not itemdat)) ;; and not yet expanded into the list of things to be done (debug:print-info 4 *default-log-port* "cond branch - " "rtq-3") (debug:print-info 4 *default-log-port* "OUTER COND: (and (list? items)(not itemdat))") ;; Must determine if the items list is valid. Discard the test if it is not. (if (and (list? items) (> (length items) 0) (and (list? (car items)) (> (length (car items)) 0)) (debug:debug-mode 1)) |
︙ | ︙ | |||
1464 1465 1466 1467 1468 1469 1470 | (let* ((items-in-testpatt (filter (lambda (my-itemdat) (tests:match test-patts hed (item-list->path my-itemdat) required: required-tests)) items) )) (if (null? items-in-testpatt) | < < | > | | < | | | | | | | | 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 | (let* ((items-in-testpatt (filter (lambda (my-itemdat) (tests:match test-patts hed (item-list->path my-itemdat) required: required-tests)) items) )) (if (null? items-in-testpatt) (let ((test-id (rmt:get-test-id run-id test-name ""))) (debug:print-info 0 *default-log-port* "Test " (tests:testqueue-get-testname test-record) " is itemized but has no items matching test pattern -- marking status ZERO_ITEMS") (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "ZERO_ITEMS" "This test has no items which match test pattern."))) (for-each (lambda (my-itemdat) (let* ((new-test-record (let ((newrec (make-tests:testqueue))) (vector-copy! test-record newrec) newrec)) (my-item-path (item-list->path my-itemdat)) (newtestname (db:test-make-full-name hed my-item-path))) ;; test names are unique on testname/item-path (tests:testqueue-set-items! new-test-record #f) (tests:testqueue-set-itemdat! new-test-record my-itemdat) (tests:testqueue-set-item_path! new-test-record my-item-path) (hash-table-set! test-records newtestname new-test-record) (set! tal (append tal (list newtestname))))) ;; since these are itemized create new test names testname/itempath items-in-testpatt))) ;; At this point we have possibly added items to tal but all must be handed off to ;; INNER COND logic. I think loop without rotating the queue ;; (loop hed tal reg reruns)) ;; (let ((newtal (append tal (list hed)))) ;; We should discard hed as it has been expanded into it's items? Yes, but only if this *is* an itemized test ;; (loop (car newtal)(cdr newtal) reg reruns) (if (null? tal) #f (loop (car tal)(cdr tal) reg reruns))) ;; if items is a proc then need to run items:get-items-from-config, get the list and loop ;; - but only do that if resources exist to kick off the job ;; EXPAND ITEMS ((or (procedure? items)(eq? items 'have-procedure)) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-4") (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))) (if (and (list? can-run-more) (car can-run-more)) (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) (if loop-list (apply loop loop-list) (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed) ) ) ;; if can't run more just loop with next possible test (loop (car newtal)(cdr newtal) reg reruns)))) ;; this case should not happen, added to help catch any bugs ((and (list? items) itemdat) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-5") (debug:print-error 0 *default-log-port* "Should not have a list of items in a test and the itemspath set - please report this") (exit 1)) ((not (null? reruns)) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-6") (let* ((newlst (tests:filter-non-runnable run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED, (junked (lset-difference equal? tal newlst))) (debug:print-info 4 *default-log-port* "full drop through, if reruns is less than 100 we will force retry them, reruns=" reruns ", tal=" tal) (if (< num-retries max-retries) (set! newlst (append reruns newlst))) (set! num-retries (+ num-retries 1)) ;; (thread-sleep! (+ 1 *global-delta*)) (if (not (null? newlst)) ;; since reruns have been tacked on to newlst create new reruns from junked (loop (car newlst)(cdr newlst) reg (delete-duplicates junked))))) ((not (null? tal)) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-7") (debug:print-info 4 *default-log-port* "I'm pretty sure I shouldn't get here.")) ((not (null? reg)) ;; could we get here with leftovers? (debug:print-info 4 *default-log-port* "cond branch - " "rtq-8") (debug:print-info 0 *default-log-port* "Have leftovers!") (loop (car reg)(cdr reg) '() reruns)) (else (debug:print-info 4 *default-log-port* "cond branch - " "rtq-9") (debug:print-info 4 *default-log-port* "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns)) ))) ;; end loop on sorted test names ;; now *if* -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) (thread-sleep! 5) ;; I think there is a race condition here. Let states/statuses settle (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id)) |
︙ | ︙ |