Overview
Comment: | Added speculative dynamically calculated delay to reduce db hammering |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65 |
Files: | files | file ages | folders |
SHA1: |
41eeb0e45fe329b8380a77cd2c27abb7 |
User & Date: | matt on 2020-10-17 00:46:02 |
Other Links: | branch diff | manifest | tags |
Context
2020-10-17
| ||
23:06 | Merged in side2 check-in: a33d7eb0be user: matt tags: v1.65 | |
14:35 | Try query rest Leaf check-in: dea5af2fba user: matt tags: v1.65-side3 | |
00:46 | Added speculative dynamically calculated delay to reduce db hammering check-in: 41eeb0e45f user: matt tags: v1.65 | |
2020-10-16
| ||
18:09 | variable propogation fix check-in: b156cc02a8 user: pjhatwal tags: v1.65 | |
Changes
Modified db.scm from [fb3a18f52f] to [055cfbeff9].
︙ | ︙ | |||
3213 3214 3215 3216 3217 3218 3219 | (if newstatus (sqlite3:execute db "UPDATE tests SET status=? WHERE id=?;" newstatus test-id)) (if newcomment (sqlite3:execute db "UPDATE tests SET comment=? WHERE id=?;" newcomment ;; (sdb:qry 'getid newcomment) test-id)))))) (mt:process-triggers dbstruct run-id test-id newstate newstatus)) ;; NEW BEHAVIOR: Count tests running in all runs! ;; | | < < | | | | | | | | | < < | | 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 | (if newstatus (sqlite3:execute db "UPDATE tests SET status=? WHERE id=?;" newstatus test-id)) (if newcomment (sqlite3:execute db "UPDATE tests SET comment=? WHERE id=?;" newcomment ;; (sdb:qry 'getid newcomment) test-id)))))) (mt:process-triggers dbstruct run-id test-id newstate newstatus)) ;; NEW BEHAVIOR: Count tests running in all runs! ;; (define (db:get-count-tests-running dbstruct run-id) (let* ((qry "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '');")) (db:with-db dbstruct run-id #f (lambda (db) (let* ((stmth (db:get-cache-stmth dbstruct db qry))) (sqlite3:first-result stmth)))))) ;; NEW BEHAVIOR: Count tests running in only one run! ;; (define (db:get-count-tests-actually-running dbstruct run-id) (db:with-db dbstruct run-id #f (lambda (db) (sqlite3:first-result db ;; WARNING BUG EDIT ME - merged from v1.55 - not sure what is right here ... ;; "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id NOT IN (SELECT id FROM runs WHERE state='deleted') AND NOT (uname = 'n/a' AND item_path = '');") "SELECT count(id) FROM tests WHERE state in ('RUNNING','REMOTEHOSTSTART','LAUNCHED') AND run_id=?;" run-id)))) ;; NOT IN (SELECT id FROM runs WHERE state='deleted');") ;; NEW BEHAVIOR: Look only at single run with run-id ;; ;; (define (db:get-running-stats dbstruct run-id) (define (db:get-count-tests-running-for-run-id dbstruct run-id) (let* ((qry "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;")) (db:with-db dbstruct run-id #f (lambda (db) (let* ((stmth (db:get-cache-stmth dbstruct db qry))) (sqlite3:first-result stmth run-id)))))) |
︙ | ︙ |
Modified launch.scm from [d0067277fa] to [e8093b3e63].
︙ | ︙ | |||
729 730 731 732 733 734 735 | ;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup ;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na ;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED ;; 0 RUNNING ==> this is actually the first condition, should not get here (define (launch:end-of-run-check run-id ) (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id)) | | | 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 | ;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup ;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na ;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED ;; 0 RUNNING ==> this is actually the first condition, should not get here (define (launch:end-of-run-check run-id ) (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id)) (running-cnt (rmt:get-count-tests-running-for-run-id run-id)) (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id))) (current-state (rmt:get-run-state run-id)) (current-status (rmt:get-run-status run-id))) ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing (debug:print 0 *default-log-port* "Running test cnt :" running-cnt) (rmt:set-state-status-and-roll-up-run run-id current-state current-status) (runs:update-junit-test-reporter-xml run-id) |
︙ | ︙ |
Modified rmt.scm from [39d97c528a] to [342d9c7e08].
︙ | ︙ | |||
659 660 661 662 663 664 665 | (map (lambda (run-id) (rmt:send-receive 'test-get-paths-matching-keynames-target-new run-id (list run-id keynames target res testpatt statepatt statuspatt runname))) run-ids)))) (define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path #!key (mode '(normal))(itemmaps #f)) (rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps))) | | > | > | | | 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 | (map (lambda (run-id) (rmt:send-receive 'test-get-paths-matching-keynames-target-new run-id (list run-id keynames target res testpatt statepatt statuspatt runname))) run-ids)))) (define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path #!key (mode '(normal))(itemmaps #f)) (rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps))) (define (rmt:get-count-tests-running-for-run-id run-id) (if (number? run-id) (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id)) 0)) (define (rmt:get-not-completed-cnt run-id) (rmt:send-receive 'get-not-completed-cnt run-id (list run-id))) ;; Statistical queries (define (rmt:get-count-tests-running run-id) (rmt:send-receive 'get-count-tests-running run-id (list run-id))) (define (rmt:get-count-tests-running-for-testname run-id testname) (rmt:send-receive 'get-count-tests-running-for-testname run-id (list run-id testname))) (define (rmt:get-count-tests-running-in-jobgroup run-id jobgroup) (rmt:send-receive 'get-count-tests-running-in-jobgroup run-id (list run-id jobgroup))) |
︙ | ︙ |
Modified runs.scm from [030b929939] to [24707790d1].
︙ | ︙ | |||
59 60 61 62 63 64 65 | (last-jobs-check-time 0) ) (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup waitons testmode newtal itemmaps prereqs-not-met) | | > > > > > > > > > > > > > > > > > > > > > > > > > > > | 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 | (last-jobs-check-time 0) ) (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup waitons testmode newtal itemmaps prereqs-not-met) (module runsmod ( runs:wait-if-seen-recently ) (import scheme chicken data-structures extras files) (import posix typed-records srfi-18 srfi-69 md5 message-digest regex srfi-1) (define *last-seen-ht* (make-hash-table)) (define (runs:wait-if-seen-recently wait-until . keys) (let* ((full-key (string-intersperse keys "-")) (last-seen (hash-table-ref/default *last-seen-ht* full-key 0)) (now (current-seconds)) (delta (- now last-seen)) (needed (if (< delta wait-until) 0 (- wait-until delta)))) (if (> needed 0)(thread-sleep! needed)) (hash-table-set! *last-seen-ht* full-key (current-seconds)) needed)) ) (import runsmod) ;; look in the $MT_RUN_AREA_HOME/.softlocks directory for key-host-pid.softlock files ;; - remove any that are over 3600 seconds old ;; - if there are any that are younger than 10 seconds ;; * sleep 10 seconds ;; * touch my key-host-pid.softlock file ;; * return ;; - if there are no files younger than 10 seconds |
︙ | ︙ | |||
319 320 321 322 323 324 325 | ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) | | | 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 | ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) (if (> (+ num-running num-running-in-jobgroup) 0) (runs:inc-can-run-more-tests-count runsdat)) ;; (set! *runs:can-run-more-tests-count* (+ *runs:can-run-more-tests-count* 1))) |
︙ | ︙ | |||
1563 1564 1565 1566 1567 1568 1569 | )) extras) extras) '()))) (waitons (delete-duplicates (append (tests:testqueue-get-waitons test-record) extra-waits) equal?)) (newtal (append tal (list hed))) (regfull (>= (length reg) reglen)) | | | 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 | )) extras) extras) '()))) (waitons (delete-duplicates (append (tests:testqueue-get-waitons test-record) extra-waits) equal?)) (newtal (append tal (list hed))) (regfull (>= (length reg) reglen)) (num-running (rmt:get-count-tests-running-for-run-id run-id)) (testdat (make-runs:testdat hed: hed tal: tal reg: reg reruns: reruns test-record: test-record test-name: test-name |
︙ | ︙ | |||
1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 | ;; wait for load here (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) (- remtries 1))))))) ))))) ;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) (let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running (if loop-list (apply loop loop-list)))) | > > > | 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 | ;; wait for load here (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) (- remtries 1))))))) ))))) ;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed (let ((waited (runs:wait-if-seen-recently 5 "prereqs-not-met" hed item-path))) ;; if we've been down this path in the past 5 seconds - wait out the difference (if (> waited 0)(debug:print 0 *default-log-port* "Waited for prereqs-not-met-"hed"-"item-path" for " waited "seconds."))) (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) (let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running (if loop-list (apply loop loop-list)))) |
︙ | ︙ | |||
1829 1830 1831 1832 1833 1834 1835 | (rmt:set-var (conc "lunch-complete-" run-id) "yes") ;; now *if* -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle | | | | | 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 | (rmt:set-var (conc "lunch-complete-" run-id) "yes") ;; now *if* -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id)) (prev-num-running 0)) ;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") (equal? (configf:lookup *configdat* "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) (if (> (current-seconds)(+ last-time-incomplete 900)) (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id))) (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id ". Running as pid " (current-process-id) " on " (get-host-name)) (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! (rmt:find-and-mark-incomplete run-id #f) (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " (time->string (seconds->local-time (current-seconds)))))) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) (wait-loop (rmt:get-count-tests-running-for-run-id run-id) num-running)))) ;; LET* ((test-record ;; we get here on "drop through". All done! ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. ;; (debug:print-info 0 *default-log-port* "Calling Post Hook") ;; (runs:run-post-hook run-id) (debug:print-info 1 *default-log-port* "All tests launched"))) |
︙ | ︙ |