Overview
Comment: | Removed constant updating of test_rundat records. This should speed up big testsuite areas.==/FAIL/orion,mars/== |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65-test-rundat |
Files: | files | file ages | folders |
SHA1: |
e3040653b8ee4f779500bd58cf358c3a |
User & Date: | matt on 2020-09-20 22:02:45 |
Original Comment: | Removed constant updating of test_rundat records. This should speed up big testsuite areas. |
Other Links: | branch diff | manifest | tags |
Context
2020-10-04
| ||
23:03 | Trial 5. Revert code that gates test start based on state/status. Closed-Leaf check-in: 89de0f530c user: matt tags: v1.65-cleanup-try-5 | |
2020-09-21
| ||
15:36 | merged in 1.65-test-rundat branch ==/FAIL/orion,mars/== check-in: cfd25d66e9 user: mmgraham tags: v1.6571, v1.65-failed-testdat | |
15:09 | Change the number of tests to get in each round trip to 50 from 200 ==/FAIL/orion,mars/== check-in: 1e9537e6d4 user: matt tags: v1.65-test-rundat | |
2020-09-20
| ||
22:02 | Removed constant updating of test_rundat records. This should speed up big testsuite areas.==/FAIL/orion,mars/== check-in: e3040653b8 user: matt tags: v1.65-test-rundat | |
2020-09-19
| ||
04:21 | Start moving test_rundat to no-sync db. ==/20/2/WARN/1203/mars/== check-in: abfabdb839 user: matt tags: v1.65-test-rundat | |
Changes
Modified TODO from [0ecebc45ef] to [0885dee1e5].
︙ | ︙ | |||
15 16 17 18 19 20 21 | # You should have received a copy of the GNU General Public License # along with Megatest. If not, see <http://www.gnu.org/licenses/>. TODO ==== WW38 | | > < > | 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 | # You should have received a copy of the GNU General Public License # along with Megatest. If not, see <http://www.gnu.org/licenses/>. TODO ==== WW38 . Add test_rundat to no-sync ==> correction, put in <testdir>/.meta/test-run.dat . Add STATE/STATUS transitions to .meta/test-run.dat or similar . Swizzle update-test-rundat to operate on no-sync . Swizzle update-run-duration, -uname-host and cpuload-diskfree to no-sync . On state/status change update tests table with duration WW15 . fill newview matrix with data, filter pipeline gui elements . improve [script], especially indent handling WW16 . split db into megatest.db (runs etc.) db/<something>.db |
︙ | ︙ |
Modified db.scm from [fb3a18f52f] to [d7c987aadc].
︙ | ︙ | |||
3465 3466 3467 3468 3469 3470 3471 | (let* ((run-ids (db:get-all-run-ids mtdb))) (for-each (lambda (run-id) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))) (db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs))) run-ids))) | | | 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 | (let* ((run-ids (db:get-all-run-ids mtdb))) (for-each (lambda (run-id) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))) (db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs))) run-ids))) ;; Get test data using test_id, run-id is not used - but it will be! ;; (define (db:get-test-info-by-id dbstruct run-id test-id) (db:with-db dbstruct #f ;; run-id #f (lambda (db) |
︙ | ︙ |
Modified launch.scm from [d0067277fa] to [e9ff9ffdfc].
︙ | ︙ | |||
203 204 205 206 207 208 209 | (round (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) | | | > > | 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 | (round (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - ;; top of loop encountered at "(current-seconds)" with ;; last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds |
︙ | ︙ | |||
231 232 233 234 235 236 237 | (do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) | | | | < < < | < < | 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 | (do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) (if do-sync ;; save meta data about the running of this test (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries? |
︙ | ︙ | |||
310 311 312 313 314 315 316 | (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) | | | 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 | (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) (setenv "MT_CMDINFO" encoded-cmd) ;;(bb-check-path msg: "launch:execute incoming") |
︙ | ︙ | |||
463 464 465 466 467 468 469 | (db:test-get-host test-info) (begin (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.") (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. | | > > | 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 | (db:test-get-host test-info) (begin (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.") (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. ((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (and (equal? (db:test-get-state test-info) "COMPLETED") ;; completed/abort => rerun if asked (member (db:test-get-status test-info) '("ABORT")))) (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) ;; prime it for running ((member (db:test-get-state test-info) '("RUNNING" "REMOTEHOSTSTART")) |
︙ | ︙ |
Modified rmt.scm from [39d97c528a] to [29d7593e43].
︙ | ︙ | |||
20 21 22 23 24 25 26 27 28 29 30 31 32 33 | (use format typed-records) ;; RADT => purpose of json format?? (declare (unit rmt)) (declare (uses api)) (declare (uses http-transport)) (include "common_records.scm") ;; (declare (uses rmtmod)) ;; (import rmtmod) ;; ;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!! ;; | > > | 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | (use format typed-records) ;; RADT => purpose of json format?? (declare (unit rmt)) (declare (uses api)) (declare (uses http-transport)) (include "common_records.scm") (include "db_records.scm") ;; (declare (uses rmtmod)) ;; (import rmtmod) ;; ;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!! ;; |
︙ | ︙ | |||
523 524 525 526 527 528 529 | ;; Just some syntatic sugar (define (rmt:register-test run-id test-name item-path) (rmt:general-call 'register-test run-id run-id test-name item-path)) (define (rmt:get-test-id run-id testname item-path) (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) | | | > > > > > > > > > | 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 | ;; Just some syntatic sugar (define (rmt:register-test run-id test-name item-path) (rmt:general-call 'register-test run-id run-id test-name item-path)) (define (rmt:get-test-id run-id testname item-path) (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) ;; run-id is NOT used - but it will be! ;; (define (rmt:get-test-info-by-id run-id test-id) (if (number? test-id) (let* ((testdat (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))) (trundatf (conc (db:test-get-rundir testdat) "/.mt_data/test-run.dat"))) ;; now we can update a couple fields from the filesystem (if (and (db:test-get-rundir testdat) (file-exists? trundatf)) (let* ((duration (db:test-get-run_duration testdat)) (event-time (db:test-get-event_time testdat)) (last-touch (file-modification-time trundatf))) (db:test-set-run_duration! testdat (max duration (- last-touch event-time))))) testdat) (begin (debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) (print-call-chain (current-error-port)) #f))) (define (rmt:test-get-rundir-from-test-id run-id test-id) (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id))) |
︙ | ︙ |
Modified tests.scm from [0094b671e6] to [ef396dcaf8].
︙ | ︙ | |||
1942 1943 1944 1945 1946 1947 1948 | (lambda (count) (set! res count)) tdb "SELECT count(id) FROM test_rundat;") res)) 0) | > | > > > > > > > > > > > > > | > > | | | | | | | | < < < < < < < < < < < < < < < < < < < < < < < | 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 | (lambda (count) (set! res count)) tdb "SELECT count(id) FROM test_rundat;") res)) 0) ;; (define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname #!key (update-db #f)) (if (get-environment-variable "MT_TEST_RUN_DIR") (let* ((dest-dir (conc (get-environment-variable "MT_TEST_RUN_DIR") "/.mt_data")) (or-dash (lambda (instr)(if instr instr "-")))) (if (not (directory-exists? dest-dir))(create-directory dest-dir #t)) (let* ((outp (open-output-file (conc dest-dir "/test-run.dat") #:append))) (with-output-to-port outp (lambda () (print (current-seconds) " " (or-dash run-id) " " (or-dash test-id) " " (or-dash cpuload) " " (or-dash diskfree) " " (or-dash minutes) " " (or-dash hostname) " " (or-dash uname)))) ;; put uname last as it has spaces in it (close-output-port outp))) (begin (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1)))) (if update-db (begin (if (and cpuload diskfree) (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id)) (if minutes (rmt:general-call 'update-run-duration run-id minutes test-id)) (if (and uname hostname) (rmt:general-call 'update-uname-host run-id uname hostname test-id))))) ;; This one is for running with no db access (i.e. via rmt: internally) (define (tests:set-full-meta-info db test-id run-id minutes work-area remtries #!key (update-db #f)) ;; (define (tests:set-full-meta-info test-id run-id minutes work-area) ;; (let ((remtries 10)) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) (uname (get-uname "-srvpio")) (hostname (get-host-name))) (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname update-db: update-db))) ;;====================================================================== ;; A R C H I V I N G ;;====================================================================== (define (test:archive db test-id) #f) |
︙ | ︙ |