Changes In Branch v1.80-dbperformance Excluding Merge-Ins
This is equivalent to a diff from 19861e6399 to 7b86032d25
2023-02-10
| ||
21:06 | Merging pretty good branch v1.80-dbperf to v1.80 check-in: 7170e5f43b user: matt tags: v1.80 | |
20:19 | Use debugprint module in dbfile module as stepping stone to replacing old debug:print calls with new. Closed-Leaf check-in: 0e8fa15f1d user: matt tags: v1.80-debugprint | |
2023-02-06
| ||
19:35 | Squashed v1.80-dbperformance into one commit check-in: 6c7b8be468 user: matt tags: v1.80-dbperf | |
2023-02-05
| ||
11:47 | Minor clean up. There were a couple communication errors in sixtyfivek but they looked likely to be host related. Closed-Leaf check-in: e973b1fb77 user: matt tags: v1.80-cleanup | |
08:36 | wip, close idle db connections check-in: 97a3c4ad11 user: matt tags: v1.80-close-idle-connections | |
2023-02-04
| ||
20:00 | Change default number of db's to 10, couple minor fixes. Closed-Leaf check-in: 7b86032d25 user: matt tags: v1.80-dbperformance | |
2023-02-03
| ||
21:51 | Reduce zombie cleanup, not clear it helps check-in: c8ec459e79 user: matt tags: v1.80-dbperformance | |
02:16 | Reduce load from get-state-status-and-roll-up-run. check-in: 4e634eb46a user: matt tags: v1.80-dbperformance | |
2023-02-02
| ||
12:54 | Use an actual droop check-in: 19861e6399 user: matt tags: v1.80 | |
09:27 | Change couple queries to use prepared statements check-in: a2e41e0613 user: matt tags: v1.80 | |
Modified api.scm from [0676a2f9d1] to [f3cc459c57].
︙ | ︙ | |||
37 38 39 40 41 42 43 44 45 46 47 48 49 50 | (define api:read-only-queries '(get-key-val-pairs get-var get-keys get-key-vals test-toplevel-num-items get-test-info-by-id get-steps-info-by-id get-data-info-by-id test-get-rundir-from-test-id get-count-tests-running-for-testname get-count-tests-running get-count-tests-running-in-jobgroup get-previous-test-run-record | > | 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | (define api:read-only-queries '(get-key-val-pairs get-var get-keys get-key-vals test-toplevel-num-items get-test-info-by-id get-test-state-status-by-id get-steps-info-by-id get-data-info-by-id test-get-rundir-from-test-id get-count-tests-running-for-testname get-count-tests-running get-count-tests-running-in-jobgroup get-previous-test-run-record |
︙ | ︙ | |||
326 327 328 329 330 331 332 333 334 335 336 337 338 339 | ;; ARCHIVES ((test-get-archive-block-info) (apply db:test-get-archive-block-info dbstruct params)) ;; TESTS ((test-toplevel-num-items) (apply db:test-toplevel-num-items dbstruct params)) ((get-test-info-by-id) (apply db:get-test-info-by-id dbstruct params)) ((test-get-rundir-from-test-id) (apply db:test-get-rundir-from-test-id dbstruct params)) ((get-count-tests-running-for-testname) (apply db:get-count-tests-running-for-testname dbstruct params)) ((get-count-tests-running) (apply db:get-count-tests-running dbstruct params)) ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params)) ;; ((delete-test-step-records) (apply db:delete-test-step-records dbstruct params)) ;; ((get-previous-test-run-record) (apply db:get-previous-test-run-record dbstruct params)) ((get-matching-previous-test-run-records)(apply db:get-matching-previous-test-run-records dbstruct params)) | > | 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 | ;; ARCHIVES ((test-get-archive-block-info) (apply db:test-get-archive-block-info dbstruct params)) ;; TESTS ((test-toplevel-num-items) (apply db:test-toplevel-num-items dbstruct params)) ((get-test-info-by-id) (apply db:get-test-info-by-id dbstruct params)) ((get-test-state-status-by-id) (apply db:get-test-state-status-by-id dbstruct params)) ((test-get-rundir-from-test-id) (apply db:test-get-rundir-from-test-id dbstruct params)) ((get-count-tests-running-for-testname) (apply db:get-count-tests-running-for-testname dbstruct params)) ((get-count-tests-running) (apply db:get-count-tests-running dbstruct params)) ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params)) ;; ((delete-test-step-records) (apply db:delete-test-step-records dbstruct params)) ;; ((get-previous-test-run-record) (apply db:get-previous-test-run-record dbstruct params)) ((get-matching-previous-test-run-records)(apply db:get-matching-previous-test-run-records dbstruct params)) |
︙ | ︙ | |||
349 350 351 352 353 354 355 356 357 358 359 360 361 362 | ((get-raw-run-stats) (apply db:get-raw-run-stats dbstruct params)) ((get-test-times) (apply db:get-test-times dbstruct params)) ;; RUNS ((get-run-info) (apply db:get-run-info dbstruct params)) ((get-run-status) (apply db:get-run-status dbstruct params)) ((get-run-state) (apply db:get-run-state dbstruct params)) ((set-run-status) (apply db:set-run-status dbstruct params)) ((set-run-state-status) (apply db:set-run-state-status dbstruct params)) ((update-tesdata-on-repilcate-db) (apply db:update-tesdata-on-repilcate-db dbstruct params)) ((get-tests-for-run) (apply db:get-tests-for-run dbstruct params)) ((get-tests-for-run-state-status) (apply db:get-tests-for-run-state-status dbstruct params)) ((get-test-id) (apply db:get-test-id dbstruct params)) ((get-tests-for-run-mindata) (apply db:get-tests-for-run-mindata dbstruct params)) | > | 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 | ((get-raw-run-stats) (apply db:get-raw-run-stats dbstruct params)) ((get-test-times) (apply db:get-test-times dbstruct params)) ;; RUNS ((get-run-info) (apply db:get-run-info dbstruct params)) ((get-run-status) (apply db:get-run-status dbstruct params)) ((get-run-state) (apply db:get-run-state dbstruct params)) ((get-run-state-status) (apply db:get-run-state-status dbstruct params)) ((set-run-status) (apply db:set-run-status dbstruct params)) ((set-run-state-status) (apply db:set-run-state-status dbstruct params)) ((update-tesdata-on-repilcate-db) (apply db:update-tesdata-on-repilcate-db dbstruct params)) ((get-tests-for-run) (apply db:get-tests-for-run dbstruct params)) ((get-tests-for-run-state-status) (apply db:get-tests-for-run-state-status dbstruct params)) ((get-test-id) (apply db:get-test-id dbstruct params)) ((get-tests-for-run-mindata) (apply db:get-tests-for-run-mindata dbstruct params)) |
︙ | ︙ |
Modified db.scm from [66cca5d3c4] to [ffd8a763fc].
︙ | ︙ | |||
466 467 468 469 470 471 472 | fname", delta: " (- time1 time2) " seconds, reason: "(cdr do-cp)) (db:lock-and-delta-sync no-sync-db dbstruct fname runid (db:get-keys dbstruct) db:initialize-main-db) (hash-table-set! sync-durations (conc fname".db") (- (current-milliseconds) start-time))) (debug:print-info 3 *default-log-port* "skipping sync. " file " is up to date") ))) dbfiles) | > | > | 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 | fname", delta: " (- time1 time2) " seconds, reason: "(cdr do-cp)) (db:lock-and-delta-sync no-sync-db dbstruct fname runid (db:get-keys dbstruct) db:initialize-main-db) (hash-table-set! sync-durations (conc fname".db") (- (current-milliseconds) start-time))) (debug:print-info 3 *default-log-port* "skipping sync. " file " is up to date") ))) dbfiles) ;; WHY does the dbdat need to be added back? (if dbdat (dbfile:add-dbdat dbstruct #f dbdat)) ) #t) ;; options: ;; ;; 'killservers - kills all servers ;; 'dejunk - removes junk records ;; 'adj-testids - move test-ids into correct ranges |
︙ | ︙ | |||
596 597 598 599 600 601 602 603 604 605 606 607 608 609 | (lambda (subdb) (let* ((dbname (db:run-id->dbname run-id)) (mtdb (dbr:subdb-mtdb subdb)) (tmpdb (db:get-subdb dbstruct run-id)) (refndb (dbr:subdb-refndb subdb)) (newres (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) last-update tmpdb refndb mtdb))) ;; (stack-push! (dbr:subdb-dbstack subdb) tmpdb) (dbfile:add-dbdat dbstruct run-id tmpdb) (set! res (cons newres res)))) subdbs) res)) ;;;; run-ids ;; if #f use *db-local-sync* : or 'local-sync-flags | > | 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 | (lambda (subdb) (let* ((dbname (db:run-id->dbname run-id)) (mtdb (dbr:subdb-mtdb subdb)) (tmpdb (db:get-subdb dbstruct run-id)) (refndb (dbr:subdb-refndb subdb)) (newres (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) last-update tmpdb refndb mtdb))) ;; (stack-push! (dbr:subdb-dbstack subdb) tmpdb) ;; BUG: verify this is really needed (dbfile:add-dbdat dbstruct run-id tmpdb) (set! res (cons newres res)))) subdbs) res)) ;;;; run-ids ;; if #f use *db-local-sync* : or 'local-sync-flags |
︙ | ︙ | |||
902 903 904 905 906 907 908 909 910 911 912 913 914 915 | db (conc "SELECT d.id,d.archive_area_name,disk_path,last_df,last_df_time FROM archive_disks AS d INNER JOIN archive_blocks AS b ON d.id=b.archive_disk_id WHERE b.id IN (" (string-intersperse (map conc res) ",") ") AND last_df > ?;") dneeded)) (dbfile:add-dbdat dbstruct #f dbdat) blocks)) ;; returns id of the record, register a disk allocated to archiving and record it's last known ;; available space ;; (define (db:archive-register-disk dbstruct bdisk-name bdisk-path df) | > | 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 | db (conc "SELECT d.id,d.archive_area_name,disk_path,last_df,last_df_time FROM archive_disks AS d INNER JOIN archive_blocks AS b ON d.id=b.archive_disk_id WHERE b.id IN (" (string-intersperse (map conc res) ",") ") AND last_df > ?;") dneeded)) ;; BUG: Verfify this is really needed (dbfile:add-dbdat dbstruct #f dbdat) blocks)) ;; returns id of the record, register a disk allocated to archiving and record it's last known ;; available space ;; (define (db:archive-register-disk dbstruct bdisk-name bdisk-path df) |
︙ | ︙ | |||
2064 2065 2066 2067 2068 2069 2070 | (let ((res "n/a")) (db:with-db dbstruct #f #f (lambda (dbdat db) (sqlite3:for-each-row (lambda (status) (set! res status)) | > | | > | | > > > > > > > > > > > > > > | 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 | (let ((res "n/a")) (db:with-db dbstruct #f #f (lambda (dbdat db) (sqlite3:for-each-row (lambda (status) (set! res status)) (db:get-cache-stmth dbdat db "SELECT status FROM runs WHERE id=?;" ) run-id) res)))) (define (db:get-run-state dbstruct run-id) (let ((res "n/a")) (db:with-db dbstruct #f #f (lambda (dbdat db) (sqlite3:for-each-row (lambda (status) (set! res status)) (db:get-cache-stmth dbdat db "SELECT state FROM runs WHERE id=?;" ) run-id) res)))) (define (db:get-run-state-status dbstruct run-id) (let ((res (cons "n/a" "n/a"))) (db:with-db dbstruct #f #f (lambda (dbdat db) (sqlite3:for-each-row (lambda (state status) (set! res (cons state status))) (db:get-cache-stmth dbdat db "SELECT state,status FROM runs WHERE id=?;" ) run-id) res)))) ;;====================================================================== ;; K E Y S ;;====================================================================== |
︙ | ︙ | |||
2694 2695 2696 2697 2698 2699 2700 | (let* ((run-ids (db:get-all-run-ids mtdb))) (for-each (lambda (run-id) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))) (db:prep-megatest.db-adj-test-ids (dbr:dbdat-dbh mtdb) run-id testrecs))) run-ids))) | | > > > > > > > > > > > > > > > > | 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 | (let* ((run-ids (db:get-all-run-ids mtdb))) (for-each (lambda (run-id) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))) (db:prep-megatest.db-adj-test-ids (dbr:dbdat-dbh mtdb) run-id testrecs))) run-ids))) ;; Get test data using test_id ;; (define (db:get-test-info-by-id dbstruct run-id test-id) (db:with-db dbstruct run-id #f (lambda (dbdat db) (let ((res #f)) (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test (lambda (id run-id testname state status event-time host cpuload diskfree uname rundir-id item-path run_duration final-logf-id comment short-dir-id attemptnum archived last-update) ;; 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 (set! res (vector id run-id testname state status event-time host cpuload diskfree uname rundir-id item-path run_duration final-logf-id comment short-dir-id attemptnum archived last-update))) (db:get-cache-stmth dbdat db (conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id=?;")) test-id) res)))) ;; Get test state, status using test_id ;; (define (db:get-test-state-status-by-id dbstruct run-id test-id) (db:with-db dbstruct run-id #f (lambda (dbdat db) (let ((res (cons #f #f))) (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test (lambda (state status) (cons state status)) (db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=?;") test-id) res)))) ;; Use db:test-get* to access ;; Get test data using test_ids. NB// Only works within a single run!! ;; (define (db:get-test-info-by-ids dbstruct run-id test-ids) (db:with-db dbstruct |
︙ | ︙ |
Modified dbfile.scm from [25f8271ef2] to [2ad26c948f].
︙ | ︙ | |||
35 36 37 38 39 40 41 42 43 | posix typed-records srfi-18 srfi-1 srfi-69 stack files ports commonmod ) | > < > > | 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 | posix typed-records srfi-18 srfi-1 srfi-69 stack files ports commonmod ;; debugprint ) (define keep-age-param (make-parameter 10)) ;; qif file age, if over move to attic (define num-run-dbs (make-parameter 10)) ;; number of db's in .megatest ;;====================================================================== ;; R E C O R D S ;;====================================================================== ;; a single Megatest area with it's multiple dbs is ;; managed in a dbstruct |
︙ | ︙ | |||
189 190 191 192 193 194 195 196 197 198 | ;; (abandoned the idea of num/db) ;; (define (dbfile:run-id->path apath run-id) (conc apath"/"(dbfile:run-id->dbname run-id))) (define (db:dbname->path apath dbname) (conc apath"/"dbname)) ;; POTENTIAL BUG: this implementation could produce a db file if run-id is neither #f or a number (define (dbfile:run-id->dbname run-id) | > > > > > > > < < | < | 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 | ;; (abandoned the idea of num/db) ;; (define (dbfile:run-id->path apath run-id) (conc apath"/"(dbfile:run-id->dbname run-id))) (define (db:dbname->path apath dbname) (conc apath"/"dbname)) (define (dbfile:run-id->dbnum run-id) (cond ((number? run-id) (modulo run-id (num-run-dbs))) ((not run-id) "main") ;; 0 or main? (else run-id))) ;; POTENTIAL BUG: this implementation could produce a db file if run-id is neither #f or a number (define (dbfile:run-id->dbname run-id) (conc ".megatest/"(dbfile:run-id->dbnum run-id)".db")) ;; Make the dbstruct, setup up auxillary db's and call for main db at least once ;; ;; called in http-transport and replicated in rmt.scm for *local* access. ;; (define (dbfile:setup do-sync areapath tmppath) (cond |
︙ | ︙ | |||
239 240 241 242 243 244 245 | (if (stack-empty? (dbr:subdb-dbstack subdb)) #f (begin (stack-pop! (dbr:subdb-dbstack subdb)))))) ;; return a previously opened db handle to the stack of available handles (define (dbfile:add-dbdat dbstruct run-id dbdat) | | | > > > > | 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 | (if (stack-empty? (dbr:subdb-dbstack subdb)) #f (begin (stack-pop! (dbr:subdb-dbstack subdb)))))) ;; return a previously opened db handle to the stack of available handles (define (dbfile:add-dbdat dbstruct run-id dbdat) (let* ((subdb (dbfile:get-subdb dbstruct run-id)) (dbstk (dbr:subdb-dbstack subdb)) (count (stack-count dbstk))) (if (> count 15) (dbfile:print-err "WARNING: stack for "run-id".db is "count".")) (stack-push! dbstk dbdat) dbdat)) ;; set up a subdb ;; (define (dbfile:init-subdb dbstruct run-id init-proc) (let* ((dbname (dbfile:run-id->dbname run-id)) (areapath (dbr:dbstruct-areapath dbstruct)) |
︙ | ︙ | |||
884 885 886 887 888 889 890 | (append (list todb) slave-dbs) ) ) ) tbls) (let* ((runtime (- (current-milliseconds) start-time)) (should-print (or ;; (debug:debug-mode 12) | | > | 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 | (append (list todb) slave-dbs) ) ) ) tbls) (let* ((runtime (- (current-milliseconds) start-time)) (should-print (or ;; (debug:debug-mode 12) (common:low-noise-print 120 "db sync") (> runtime 500)))) ;; low and high sync times treated as separate. (for-each (lambda (dat) (let ((tblname (car dat)) (count (cdr dat))) (set! tot-count (+ tot-count count)) )) (sort (hash-table->alist numrecs)(lambda (a b)(> (cdr a)(cdr b)))))) |
︙ | ︙ | |||
1002 1003 1004 1005 1006 1007 1008 | (let* ((dbdat (dbfile:open-db dbstruct run-id dbinit))) (set! *db-write-access* (not (dbr:dbdat-read-only dbdat))) ;; (mutex-unlock! *db-open-mutex*) dbdat)) (define dbfile:db-init-proc (make-parameter #f)) | > > > > | > | > | | | < | | | | | | | | | | | | | | | | | | < < > | < < < < | 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 | (let* ((dbdat (dbfile:open-db dbstruct run-id dbinit))) (set! *db-write-access* (not (dbr:dbdat-read-only dbdat))) ;; (mutex-unlock! *db-open-mutex*) dbdat)) (define dbfile:db-init-proc (make-parameter #f)) ;; in xmaxima this gives a curve close to what I want: ;; plot2d ((exp(x/1.2)-1)/300, [x, 0, 10])$ ;; plot2d ((exp(x/1.5)-1)/40, [x, 0, 10])$ ;; plot2d ((exp(x/5)-1)/40, [x, 0, 20])$ (define (dbfile:droop x) (/ (- (exp (/ x 5)) 1) 40)) ;; (* numqrys (/ 1 (qif-slope)))) ;; create a dropping near the db file in a qif dir ;; use count of such files to gate queries (queries in flight) ;; (define (dbfile:wait-for-qif fname run-id params) (let* ((thedir (pathname-directory fname)) (dbnum (dbfile:run-id->dbnum run-id)) (destdir (conc thedir"/qif-"dbnum)) (uniqn (get-area-path-signature (conc dbnum params))) (crumbn (conc destdir"/"(current-seconds)"-"uniqn"."(current-process-id)))) (if (not (file-exists? destdir))(create-directory (conc destdir"/attic") #t)) (let loop ((count 0)) (let* ((currlks (glob (conc destdir"/*"))) (numqrys (length currlks)) (delayval (cond ;; do a droopish curve ((> numqrys 25) (for-each (lambda (f) (if (> (- (current-seconds) (handle-exceptions exn (current-seconds) ;; file is likely gone, just fake out (file-modification-time f))) (keep-age-param)) (let* ((basedir (pathname-directory f)) (filen (pathname-file f)) (destf (conc basedir"/attic/"filen))) (dbfile:print-err "Moving qif file "f" older than 10 seconds to "destf) ;; (delete-file* f) (handle-exceptions exn #t (file-move f destf #t))))) currlks) 4) ((> numqrys 0) (dbfile:droop numqrys)) ;; slope of 1/100 (else #f)))) (if (and delayval (< count 5)) (begin (thread-sleep! delayval) (loop (+ count 1)))))) (with-output-to-file crumbn |
︙ | ︙ |
Modified launch.scm from [34af87f4bd] to [a705b2e0b2].
︙ | ︙ | |||
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 | (import (prefix base64 base64:)) (import (prefix sqlite3 sqlite3:)) (declare (unit launch)) (declare (uses subrun)) (declare (uses common)) (declare (uses configf)) (declare (uses db)) (declare (uses ezsteps)) (include "common_records.scm") (include "key_records.scm") (include "db_records.scm") (include "megatest-fossil-hash.scm") ;;====================================================================== ;; ezsteps ;;====================================================================== ;; ezsteps were going to be coded as ;; stepname[,predstep1,predstep2 ...] [{VAR1=first,second,third}] command to execute | > > > | 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | (import (prefix base64 base64:)) (import (prefix sqlite3 sqlite3:)) (declare (unit launch)) (declare (uses subrun)) (declare (uses common)) (declare (uses commonmod)) (declare (uses configf)) (declare (uses db)) (declare (uses ezsteps)) (include "common_records.scm") (include "key_records.scm") (include "db_records.scm") (include "megatest-fossil-hash.scm") (import commonmod) ;;====================================================================== ;; ezsteps ;;====================================================================== ;; ezsteps were going to be coded as ;; stepname[,predstep1,predstep2 ...] [{VAR1=first,second,third}] command to execute |
︙ | ︙ | |||
203 204 205 206 207 208 209 | (debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep))) (file-close status-file) ) )))))) (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) | | | 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 | (debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep))) (file-close status-file) ) )))))) (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "60"))) (start-seconds (current-seconds)) (calc-minutes (lambda () (inexact->exact (round (- (current-seconds) start-seconds))))) |
︙ | ︙ | |||
237 238 239 240 241 242 243 | (delta (abs (- df disk-free)))) (if (and (> df 0) (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg df #f))) (do-sync (or new-cpu-load new-disk-free over-time)) | | | | > | | 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 | (delta (abs (- df disk-free)))) (if (and (> df 0) (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg df #f))) (do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-state-status-by-id run-id test-id)) (state (car test-info));; (db:test-get-state test-info)) (status (cdr test-info));; (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty (launch:handle-zombie-tests run-id)) (when do-sync ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))) ) |
︙ | ︙ | |||
763 764 765 766 767 768 769 | ;; if dead safe to mark the test as killed in the db ;; State/status table ;; new ;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup ;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na ;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED ;; 0 RUNNING ==> this is actually the first condition, should not get here | | | > | | | | > > > > > > | > | 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 | ;; if dead safe to mark the test as killed in the db ;; State/status table ;; new ;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup ;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na ;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED ;; 0 RUNNING ==> this is actually the first condition, should not get here (define *last-rollup* 0) (define (launch:end-of-run-check run-id ) (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id)) (running-cnt (rmt:get-count-tests-running-for-run-id run-id)) (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id))) (current-state-status (rmt:get-run-state-status run-id)) (current-state (car current-state-status)) ;; (rmt:get-run-state run-id)) (current-status (cdr current-state-status))) ;; (rmt:get-run-status run-id))) ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing (debug:print 0 *default-log-port* "Running test cnt :" running-cnt) ;; ;; TODO: add a final rollup when run is done (if there isn't one already) ;; (if (or (< running-cnt 3) ;; have only few running (> (- (current-seconds) *last-rollup*) 10)) ;; or haven't rolled up in past ten seconds (begin (rmt:set-state-status-and-roll-up-run run-id current-state current-status) (set! *last-rollup* (current-seconds)))) (runs:update-junit-test-reporter-xml run-id) (cond ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" )) (if (and (equal? (rmt:get-var (conc "end-of-run-" run-id)) "no") (common:simple-lock (conc "endOfRun" run-id))) (begin (debug:print 4 *default-log-port* "look for post hook. currseconds: " (current-seconds) " EOR " (rmt:get-var (conc "end-of-run-" run-id))) (debug:print 0 *default-log-port* "End of Run Detected.") |
︙ | ︙ |
Modified rmt.scm from [771d7d8ec4] to [22216f2b37].
︙ | ︙ | |||
535 536 537 538 539 540 541 | (assert (number? run-id) "FATAL: Run id required.") (rmt:general-call 'register-test run-id run-id test-name item-path)) (define (rmt:get-test-id run-id testname item-path) (assert (number? run-id) "FATAL: Run id required.") (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) | < < > > > | 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 | (assert (number? run-id) "FATAL: Run id required.") (rmt:general-call 'register-test run-id run-id test-name item-path)) (define (rmt:get-test-id run-id testname item-path) (assert (number? run-id) "FATAL: Run id required.") (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) (define (rmt:get-test-info-by-id run-id test-id) (if (number? test-id) (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)) (begin (debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) (print-call-chain (current-error-port)) #f))) (define (rmt:get-test-state-status-by-id run-id test-id) (rmt:send-receive 'get-test-state-status-by-id run-id (list run-id test-id))) (define (rmt:test-get-rundir-from-test-id run-id test-id) (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id))) (define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f)) (assert (number? run-id) "FATAL: Run id required.") (let* ((test-path (if (string? work-area) work-area |
︙ | ︙ | |||
797 798 799 800 801 802 803 804 805 806 807 808 809 810 | (assert (number? run-id) "FATAL: Run id required.") (rmt:send-receive 'get-run-status #f (list run-id))) (define (rmt:get-run-state run-id) (assert (number? run-id) "FATAL: Run id required.") (rmt:send-receive 'get-run-state #f (list run-id))) (define (rmt:set-run-status run-id run-status #!key (msg #f)) (assert (number? run-id) "FATAL: Run id required.") (rmt:send-receive 'set-run-status #f (list run-id run-status msg))) (define (rmt:set-run-state-status run-id state status ) (assert (number? run-id) "FATAL: Run id required.") | > > > | 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 | (assert (number? run-id) "FATAL: Run id required.") (rmt:send-receive 'get-run-status #f (list run-id))) (define (rmt:get-run-state run-id) (assert (number? run-id) "FATAL: Run id required.") (rmt:send-receive 'get-run-state #f (list run-id))) (define (rmt:get-run-state-status run-id) (assert (number? run-id) "FATAL: Run id required.") (rmt:send-receive 'get-run-state-status #f (list run-id))) (define (rmt:set-run-status run-id run-status #!key (msg #f)) (assert (number? run-id) "FATAL: Run id required.") (rmt:send-receive 'set-run-status #f (list run-id run-status msg))) (define (rmt:set-run-state-status run-id state status ) (assert (number? run-id) "FATAL: Run id required.") |
︙ | ︙ |
Modified runs.scm from [dd48ee309d] to [e0e242ce63].
︙ | ︙ | |||
1861 1862 1863 1864 1865 1866 1867 | (my-item-path (item-list->path my-itemdat)) (newtestname (db:test-make-full-name hed my-item-path))) ;; test names are unique on testname/item-path (tests:testqueue-set-items! new-test-record #f) (tests:testqueue-set-itemdat! new-test-record my-itemdat) (tests:testqueue-set-item_path! new-test-record my-item-path) (hash-table-set! test-records newtestname new-test-record) | > | | 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 | (my-item-path (item-list->path my-itemdat)) (newtestname (db:test-make-full-name hed my-item-path))) ;; test names are unique on testname/item-path (tests:testqueue-set-items! new-test-record #f) (tests:testqueue-set-itemdat! new-test-record my-itemdat) (tests:testqueue-set-item_path! new-test-record my-item-path) (hash-table-set! test-records newtestname new-test-record) ;; BUG: This next line sucks up a lot of horsepower (set! tal (append tal (list newtestname))))) ;; since these are itemized create new test names testname/itempath items-in-testpatt))) ;; At this point we have possibly added items to tal but all must be handed off to ;; INNER COND logic. I think loop without rotating the queue ;; (loop hed tal reg reruns)) |
︙ | ︙ |
Modified tests.scm from [5c2006972a] to [b17b9fa8b7].
︙ | ︙ | |||
1964 1965 1966 1967 1968 1969 1970 | ;;====================================================================== ;; test steps ;;====================================================================== ;; teststep-set-status! used to be here (define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat) | | | | 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 | ;;====================================================================== ;; test steps ;;====================================================================== ;; teststep-set-status! used to be here (define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat) (let* ((testdat (rmt:get-test-state-status-by-id run-id test-id))) (and testdat (equal? (car testdat) "KILLREQ")))) (define (test:tdb-get-rundat-count tdb) (if tdb (let ((res 0)) (sqlite3:for-each-row (lambda (count) (set! res count)) |
︙ | ︙ |