Overview
Comment: | Ease load on pre-launch checks if there are more than 25 job slots available. NOTE: Unclear if the last few commits are genuinely useful. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65-cleanup |
Files: | files | file ages | folders |
SHA1: | c141e775339bab6d139ce6629cf88b84 |
User & Date: | matt on 2020-08-30 22:02:10 |
Other Links: | branch diff | manifest | tags |
Context
2020-08-30
| ||
22:12 | fixed typo ==11.11/1.5/WARN/1206/orion== check-in: 80394e6b0d user: matt tags: v1.65-cleanup | |
22:02 | Ease load on pre-launch checks if there are more than 25 job slots available. NOTE: Unclear if the last few commits are genuinely useful. check-in: c141e77533 user: matt tags: v1.65-cleanup | |
17:09 | corrected typo dat-wait-for-jobs-funcion ==/18.1/1.9/WARN/1206/mars/== check-in: 64d8372f85 user: mmgraham tags: v1.65-cleanup, v1.6566 | |
Changes
Modified runs.scm from [94cebd335e] to [9c5af76de5].
︙ | ︙ | |||
44 45 46 47 48 49 50 | ;; (defstruct runs:dat reglen regfull runname max-concurrent-jobs run-id test-patts required-tests test-registry registry-mutex flags keyvals run-info all-tests-registry | | > > > | 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | ;; (defstruct runs:dat reglen regfull runname max-concurrent-jobs run-id test-patts required-tests test-registry registry-mutex flags keyvals run-info all-tests-registry ;; stores results from last runs:can-run-more-tests (can-run-more-tests #f) ;; (list can-run-more-flag num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit) ((can-run-more-tests-count 0) : fixnum) (last-fuel-check 0) ;; time when we last checked fuel (beginning-of-time (current-seconds)) (load-mgmt-function #f) (wait-for-jobs-function #f) (last-load-check-time 0) (last-jobs-check-time 0) ) (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup waitons testmode newtal itemmaps prereqs-not-met) |
︙ | ︙ | |||
316 317 318 319 320 321 322 | ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) | < < < < < < < < < < < < < < < | 319 320 321 322 323 324 325 326 327 328 329 330 331 332 | ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) (if (> (+ num-running num-running-in-jobgroup) 0) |
︙ | ︙ | |||
1237 1238 1239 1240 1241 1242 1243 | (null? non-completed) (not (member 'exclusive testmode))))) ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path)) ;; we are going to reset all the counters for test retries by setting a new hash table ;; this means they will increment only when nothing can be run (set! *max-tries-hash* (make-hash-table)) | | | 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 | (null? non-completed) (not (member 'exclusive testmode))))) ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path)) ;; we are going to reset all the counters for test retries by setting a new hash table ;; this means they will increment only when nothing can be run (set! *max-tries-hash* (make-hash-table)) (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat testdat) (runs:incremental-print-results run-id) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) (if (or (not (null? tal))(not (null? reg))) (runs:loop-values tal reg reglen regfull reruns) #f)) |
︙ | ︙ | |||
1678 1679 1680 1681 1682 1683 1684 | (debug:print-info 4 *default-log-port* "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) ;; gonna try a strategy change here. ;; | | < | > > > > > > > > > > > > > > | | | | | | | | | | | | | | | | | | | | | | | 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 | (debug:print-info 4 *default-log-port* "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) ;; gonna try a strategy change here. ;; ;; check if can run more tests. if yes, continue, if no, rest until can run more ;; ;; look at the test jobgroup and tot jobs running (if (not (runs:dat-wait-for-jobs-function runsdat)) (runs:dat-wait-for-jobs-function-set! runsdat (lambda (testdat-in) (let* ((jobgroup (runs:testdat-jobgroup testdat-in)) (can-run-more-tests (runs:dat-can-run-more-tests runsdat)) (last-jobs-check-time (runs:dat-last-jobs-check-time runsdat)) (should-check-jobs (match can-run-more-tests ((can-run-more-flag num-running nr-in-jobgroup max-concurrent-jobs . params) (if (< (- max-concurrent-jobs num-running) 25) (begin (debug:print-info 0 *default-log-port* "less than 20 jobs headroom, ("max-concurrent "-"num-running")>20. Forcing prelaunch check.") #t) #f)) (else #f)))) ;; no record yet (if should-check-jobs (let loop-can-run-more ((res (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) (remtries 1440)) ;; we can wait for up to two hours for jobs to get done (match res ((run-more num-running . 
rem) (if (or run-more (< remtries 1)) (begin (if (runs:lownoise "num-running" 30) (debug:print-info 0 *default-log-port* "Have "num-running" tests of max " max-concurrent-jobs)) (runs:dat-can-run-more-tests-set! runsdat res)) ;; capture the result and then drop through (begin (if (runs:lownoise "num-running" 10) (debug:print-info 0 *default-log-port* "Can't run more tests, have "num-running" tests of " max-concurrent-jobs " allowed.")) (thread-sleep! 5) ;; if we've hit max concurrent jobs take a breather, nb// make this configurable ;; wait for load here (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) (- remtries 1))))))) ))))) ;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) |
︙ | ︙ | |||
1904 1905 1906 1907 1908 1909 1910 | (if (not (vector? t)) (conc t) (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t)))) lst)) ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step ;; | | | 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 | (if (not (vector? t)) (conc t) (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t)))) lst)) ;; parent-test is there as a placeholder for when parent-tests can be run as a setup step ;; (define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry runsdat testdat-rec) ;; All these vars might be referenced by the testconfig file reader ;; ;; NEED to reprocess testconfig here, ensuring that item variables are available. ;; This is for Tal's issue with item-specific env vars not being set for use in skip. ;; HSD https://hsdes.intel.com/appstore/icf/index.html#/article?articleId=1408763273 ;; (let* ((test-name (tests:testqueue-get-testname test-record)) |
︙ | ︙ | |||
2098 2099 2100 2101 2102 2103 2104 | (debug:print-info 1 *default-log-port* "SKIPPING Test " full-test-name " due to " skip-test)) ;; ;; Here the test is handed off to launch.scm for launch-test to complete the launch process ;; (begin ;; wait for less than max jobs here (if (runs:dat-wait-for-jobs-function runsdat) | | | 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 | (debug:print-info 1 *default-log-port* "SKIPPING Test " full-test-name " due to " skip-test)) ;; ;; Here the test is handed off to launch.scm for launch-test to complete the launch process ;; (begin ;; wait for less than max jobs here (if (runs:dat-wait-for-jobs-function runsdat) ((runs:dat-wait-for-jobs-function runsdat) testdat-rec)) (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags)) (begin (print "ERROR: Failed to launch the test. Exiting as soon as possible") (set! *globalexitstatus* 1) ;; (process-signal (current-process-id) signal/kill)) ) |
︙ | ︙ |