Megatest: Diff

Differences From Artifact [9309455d23]:

File runs.scm — part of check-in [78c9e0e0c5] at 2019-09-23 16:14:29 on branch trunk — Escape the period in the whitespace regex for env vars (user: mrwellan, size: 142373) [annotate] [blame] [check-ins using]

To Artifact [81684437ca]:

File runs.scm — part of check-in [8ec43965a7] at 2019-11-13 17:16:12 on branch v1.65 — Added skip script capability in testconfig. Skips if script returns status 0 (user: mmgraham, size: 142941) [annotate] [blame] [check-ins using] [more...]

︙
244 245 246 247 248 249 250 ~~251~~ 252 253 254 255 256 257 258	244 245 246 247 248 249 250 251 252 253 254 255 256 257 258	- +	(if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 default-log-port "waiting for tasks to complete, sleeping briefly ...")) (configf:lookup-number configdat "setup" "inter-test-delay" default: 0.1) ;; was 2 );; obviously haven't had any work to do for a while (else 0))) (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) ~~(job-group-limit (let ((jobg-count (config-lookup configdat "jobgroups" jobgroup)))~~ (job-group-limit (let ((jobg-count (configf:lookup configdat "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) (if (> (+ num-running num-running-in-jobgroup) 0) (runs:inc-can-run-more-tests-count runsdat)) ;; (set! runs:can-run-more-tests-count (+ runs:can-run-more-tests-count 1))) (if (not (eq? last-num-running-tests num-running)) (begin
︙
384 385 386 387 388 389 390 ~~391~~ 392 393 394 395 396 397 398	384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399	- + +	(test-names #f) ;; Generated by a call to (tests:filter-test-names all-test-names test-patts)) (required-tests #f) ;; Put fully qualified test/testpath names in this list to be done (waitors-upon (make-hash-table)) ;; given a test, return list of tests waiting upon this test. (task-key (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id))) ;; (tdbdat (tasks:open-db)) (config-reruns (let ((x (configf:lookup configdat "setup" "reruns"))) (if x (string->number x) #f))) ~~(allowed-tests #f))~~ (allowed-tests #f) (runconf #f)) ;; check if readonly (when readonly-mode (debug:print-error 0 default-log-port "megatest.db is readonly. Cannot proceed.") (exit 1)) ;; per user request. If less than 100Meg space on dbdir partition, bail out with error
︙
544 545 546 547 548 549 550 ~~551~~ 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 ~~572~~ 573 574 575 576 577 578 579	545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580	- + - +	(change-directory toppath) ;; PLEASE OPTIMIZE ME!!! I think this should be a no-op but there are several places where change-directories could be happening. (setenv "MT_TEST_NAME" hed) ;; (let-values (((waitons waitors config) (tests:get-waitons hed all-tests-registry)) ;; NOTE: Have the config - can extract [waitons] section ((hed-mode) ~~(let ((m (config-lookup config "requirements" "mode")))~~ (let ((m (configf:lookup config "requirements" "mode"))) (if m (map string->symbol (string-split m)) '(normal)))) ((hed-itemized-waiton) ;; are items in hed waiting on items of waiton? (not (null? (lset-intersection eq? hed-mode '(itemmatch itemwait))))) ) (debug:print-info 8 default-log-port* "waitons: " waitons) ;; check for hed in waitons => this would be circular, remove it and issue an ;; error (if (or (member hed waitons) (member hed waitors)) (begin (debug:print-error 0 default-log-port "test " hed " has listed itself as a waiton or waitor, please correct this!") (set! waitons (filter (lambda (x)(not (equal? x hed))) waitons)) (set! waitors (filter (lambda (x)(not (equal? x hed))) waitors)))) ;; (items (items:get-items-from-config config))) (if (not (hash-table-ref/default test-records hed #f)) ;; waiton-tconfig below will be #f until that test is visted here at least once (hash-table-set! 
test-records ;; BB: we are doing a manual make-tests:testqueue hed (vector hed ;; 0 ;; testname config ;; 1 waitons ;; 2 ~~(config-lookup config "requirements" "priority") ;; priority 3~~ (configf:lookup config "requirements" "priority") ;; priority 3 (tests:get-items config) ;; 4 ;; expand the [items] and or [itemstable] into explicit items #f ;; itemsdat 5 #f ;; spare - used for item-path waitors ;; ))) ;; update waitors-upon here (for-each
︙
641 642 643 644 645 646 647 ~~648~~ 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 ~~668~~ 669 ~~670~~ 671 672 673 674 675 676 677	642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681	- + - + - + + + +	(debug:print-info 1 default-log-port "Adding \"" (string-intersperse required-tests " ") "\" to the run queue")) ;; NOTE: these are all parent tests, items are not expanded yet. (debug:print-info 4 default-log-port "test-records=" (hash-table->alist test-records)) (let ((reglen (configf:lookup configdat "setup" "runqueue"))) (if (> (length (hash-table-keys test-records)) 0) (let* ((keep-going #t) (run-queue-retries 5) ~~(th1 (make-thread (lambda ()~~ #;(th1 (make-thread (lambda () (handle-exceptions exn (begin (print-call-chain) (print " message: " ((condition-property-accessor 'exn 'message) exn))) (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry))) "runs:run-tests-queue")) (th2 (make-thread (lambda () ;; BBQ: why are we visiting ALL runs here? ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going (handle-exceptions exn (debug:print 0 default-log-port "error in calling find-and-mark-incomplete for run-id " run-id) (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) run-ids))) "runs: mark-incompletes"))) ~~(thread-start! th1)~~ ;; (thread-start! th1) (thread-start! th2) ~~(thread-join! th1)~~ ;; (thread-join! th1) ;; just do the main stuff in the main thread (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry) (set! 
keep-going #f) (thread-join! th2) ;; if run-count > 0 call, set -preclean and -rerun STUCK/DEAD (if (> run-count 0) ;; handle reruns (begin (if (not (hash-table-ref/default flags "-preclean" #f)) (hash-table-set! flags "-preclean" #t))
︙
1341 1342 1343 1344 1345 1346 1347 ~~1348~~ 1349 1350 1351 1352 1353 1354 1355	1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359	- +	(let* ((run-info (rmt:get-run-info run-id)) (tests-info (mt:get-tests-for-run run-id #f '() '())) ;; qryvals: "id,testname,item_path")) (sorted-test-names (tests:sort-by-priority-and-waiton test-records)) (test-registry (make-hash-table)) (registry-mutex (make-mutex)) (num-retries 0) ~~(max-retries (config-lookup configdat "setup" "maxretries"))~~ (max-retries (configf:lookup configdat "setup" "maxretries")) (max-concurrent-jobs (configf:lookup-number configdat "setup" "max_concurrent_jobs" default: 50)) (reglen (if (number? reglen-in) reglen-in 1)) (last-time-incomplete (- (current-seconds) 900)) ;; force at least one clean up cycle (last-time-some-running (current-seconds)) ;; (tdbdat (tasks:open-db)) (runsdat (make-runs:dat ;; hed: hed
︙
1411 1412 1413 1414 1415 1416 1417 ~~1418 1419~~ 1420 1421 1422 1423 1424 1425 1426	1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430	- - + +	;; (rmt:find-and-mark-incomplete-all-runs) )) ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns) (let* ((test-record (hash-table-ref test-records hed)) (test-name (tests:testqueue-get-testname test-record)) (tconfig (tests:testqueue-get-testconfig test-record)) ~~(jobgroup (config-lookup tconfig "test_meta" "jobgroup")) (testmode (let ((m (config-lookup tconfig "requirements" "mode")))~~ (jobgroup (configf:lookup tconfig "test_meta" "jobgroup")) (testmode (let ((m (configf:lookup tconfig "requirements" "mode"))) (if m (map string->symbol (string-split m)) '(normal)))) (itemmaps (tests:get-itemmaps tconfig)) ;; (configf:lookup tconfig "requirements" "itemmap")) (priority (tests:testqueue-get-priority test-record)) (itemdat (tests:testqueue-get-itemdat test-record)) ;; itemdat can be a string, list or #f (items (tests:testqueue-get-items test-record)) (item-path (item-list->path itemdat)) (tfullname (db:test-make-full-name test-name item-path))
︙
1582 1583 1584 1585 1586 1587 1588 ~~1589 1590~~ ~~1591 1592~~ 1593 1594 1595 1596 1597 1598 1599	1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600	- - + - -	(let* ((items-in-testpatt (filter (lambda (my-itemdat) (tests:match test-patts hed (item-list->path my-itemdat) )) ;; was: (tests:match test-patts hed (item-list->path my-itemdat) required: required-tests)) items) )) (if (null? items-in-testpatt) ~~(let ((test-id (rmt:get-test-id run-id test-name "")))~~ (debug:print-~~info~~ 0 default-log-port "Test " (tests:testqueue-get-testname test-record) " is itemized but has no items matching test pattern ~~-- marking status ZERO_ITEMS~~") (debug:print-error 0 default-log-port "Test " (tests:testqueue-get-testname test-record) " is itemized but has no items matching the test pattern") ~~(if test-id~~ ~~(mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "ZERO_ITEMS" "This test has no items which match test pattern.")))~~ (for-each (lambda (my-itemdat) (let* ((new-test-record (let ((newrec (make-tests:testqueue))) (vector-copy! test-record newrec) newrec)) (my-item-path (item-list->path my-itemdat))
︙
1657 1658 1659 1660 1661 1662 1663 ~~1664~~ 1665 1666 1667 1668 1669 1670 1671	1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672	- +	(debug:print-info 4 default-log-port "cond branch - " "rtq-8") (debug:print-info 0 default-log-port "Have leftovers!") (loop (car reg)(cdr reg) '() reruns)) (else (debug:print-info 4 default-log-port "cond branch - " "rtq-9") (debug:print-info 4 default-log-port "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns)) ))) ;; end loop on sorted test names ~~;; this is the point where everything is launced and now you can mark the run in metadata table as all launced~~ ;; this is the point where everything is launched and now you can mark the run in metadata table as all launched (rmt:set-var (conc "lunch-complete-" run-id) "yes") ;; now if -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id)) (prev-num-running 0))
︙
1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895	1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908	+ + + + + + + + + + + +	;; currently running ((and skip-check (configf:lookup test-conf "skip" "prevrunning")) ;; run-ids = #f means all runs (let ((running-tests (rmt:get-tests-for-runs-mindata #f full-test-name '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED") '() #f))) (if (not (null? running-tests)) ;; have to skip (set! skip-test "Skipping due to previous tests running")))) ((and skip-check (configf:lookup test-conf "skip" "fileexists")) (if (common:file-exists? (configf:lookup test-conf "skip" "fileexists")) (set! skip-test (conc "Skipping due to existance of file " (configf:lookup test-conf "skip" "fileexists"))))) ((and skip-check (configf:lookup test-conf "skip" "filenotexists")) (if (not (common:file-exists? (configf:lookup test-conf "skip" "filenotexists"))) (set! skip-test (conc "Skipping due to non existance of file " (configf:lookup test-conf "skip" "filenotexists"))))) ((and skip-check (configf:lookup test-conf "skip" "script")) (if (= (system (configf:lookup test-conf "skip" "script")) 0) (set! skip-test (conc "Skipping due to zero return value of script " (configf:lookup test-conf "skip" "script"))))) ((and skip-check (configf:lookup test-conf "skip" "rundelay")) ;; run-ids = #f means all runs (let* ((numseconds (common:hms-string->seconds (configf:lookup test-conf "skip" "rundelay"))) (running-tests (rmt:get-tests-for-runs-mindata #f full-test-name '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED") '() #f)) (completed-tests (rmt:get-tests-for-runs-mindata #f full-test-name '("COMPLETED" "INCOMPLETE") '("PASS" "FAIL" "ABORT") #f)) ;; ironically INCOMPLETE is same as COMPLETED in this contex (last-run-times (map db:mintest-get-event_time completed-tests)) (time-since-last (- (current-seconds) (if (null? 
last-run-times) 0 (common:max last-run-times)))))
︙
2539 2540 2541 2542 2543 2544 2545 ~~2546~~ 2547 2548 2549 2550 2551 2552 2553	2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566	- +	(begin (set! currrecord (make-vector 11 #f)) (rmt:testmeta-add-record test-name))) (for-each (lambda (key) (let* ((idx (cadr key)) (fld (car key)) ~~(val (config-lookup test-conf "test_meta" fld)))~~ (val (configf:lookup test-conf "test_meta" fld))) ;; (debug:print 5 default-log-port "idx: " idx " fld: " fld " val: " val) (if (and val (not (equal? (vector-ref currrecord idx) val))) (begin (print "Updating " test-name " " fld " to " val) (rmt:testmeta-update-field test-name fld val))))) '(("author" 2)("owner" 3)("description" 4)("reviewed" 5)("tags" 9)("jobgroup" 10)))))
︙
2574 2575 2576 2577 2578 2579 2580 ~~2581~~ 2582 2583 2584 2585 2586 2587 2588	2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601	- +	(let* ((test-conf (mt:lazy-read-test-config test-name))) (if test-conf (runs:update-test_meta test-name test-conf)))) (hash-table-keys test-names)))) ;; This could probably be refactored into one complex query ... ;; NOT PORTED - DO NOT USE YET ;; ~~(define (runs:rollup-run keys runname user keyvals)~~ #;(define (runs:rollup-run keys runname user keyvals) (debug:print 4 default-log-port "runs:rollup-run, keys: " keys " -runname " runname " user: " user) (let* ((db #f) ;; register run operates on the main db (new-run-id (rmt:register-run keyvals runname "new" "n/a" user (args:get-arg "-contour"))) (prev-tests (rmt:get-matching-previous-test-run-records new-run-id "%" "%")) (curr-tests (mt:get-tests-for-run new-run-id "%/%" '() '())) (curr-tests-hash (make-hash-table)))
︙