Changes In Branch v1.65-diet2 Through [f6629c6cb4] Excluding Merge-Ins
This is equivalent to a diff from 275adb0d10 to f6629c6cb4
2021-01-21
| ||
23:10 | Sketched out a possible speed-up with short-circuit rollup of items check-in: 9809735c40 user: matt tags: v1.65-diet2 | |
2021-01-19
| ||
12:14 | Commented out a few telemetry calls check-in: f6629c6cb4 user: mrwellan tags: v1.65-diet2 | |
2021-01-18
| ||
15:29 | Fixed wrong use of optional (should have been key) check-in: 2b81179a9b user: matt tags: v1.65-diet2 | |
2021-01-17
| ||
23:34 | Start from low load node and add diet one by one check-in: f462c25d37 user: matt tags: v1.65-diet2 | |
14:29 | merged diet branch Closed-Leaf check-in: b6a618cc23 user: matt tags: v1.65-pre-69-diet | |
2020-10-03
| ||
17:58 | Bad logic on re-calc of prereq needed. The (runs:testdat-prereqs-not-met testdat) is telling you that this needs recalc as it was previously not met. Thus can bypass if *was* met previously (although why would we reach here if it was met previously?). Closed-Leaf check-in: 4e90b83f0b user: matt tags: v1.65-big-refactor (unpublished) | |
2020-09-05
| ||
21:50 | Fixed wrong use of optional that should have been key. Closed-Leaf check-in: ff41f9d1e7 user: matt tags: v1.65-prereq-qry-freq | |
13:41 | Try reduced frequency queries for prereq not met. ==/3.5/0.83/PASS/1201/mars/== check-in: 275adb0d10 user: matt tags: v1.65-prereq-qry-freq | |
11:17 | Merged cleanup branch back to v1.65 ==9.4/2.2/1201/WARN/mars== check-in: 0cbf1a0b26 user: matt tags: v1.65 | |
Modified api.scm from [4fa67bb6bd] to [280d15e219].
︙ | ︙ | |||
161 162 163 164 165 166 167 | cmd-in (string->symbol cmd-in))) (params (vector-ref dat 1)) (start-t (current-milliseconds)) (readonly-mode (dbr:dbstruct-read-only dbstruct)) (readonly-command (member cmd api:read-only-queries)) (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) | | | 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 | cmd-in (string->symbol cmd-in))) (params (vector-ref dat 1)) (start-t (current-milliseconds)) (readonly-mode (dbr:dbstruct-read-only dbstruct)) (readonly-command (member cmd api:read-only-queries)) (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) #;(foo (begin (common:telemetry-log (conc "api-in:"(->string cmd)) payload: `((params . ,params))) #t)) (res (if writecmd-in-readonly-mode (conc "attempt to run write command "cmd" on a read-only database") |
︙ | ︙ | |||
357 358 359 360 361 362 363 | ;; save all stats (let ((delta-t (- (current-milliseconds) start-t))) (hash-table-set! *db-api-call-time* cmd (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if writecmd-in-readonly-mode (begin | | | | 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 | ;; save all stats (let ((delta-t (- (current-milliseconds) start-t))) (hash-table-set! *db-api-call-time* cmd (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if writecmd-in-readonly-mode (begin #;(common:telemetry-log (conc "api-out:"(->string cmd)) payload: `((params . ,params) (ok-res . #t))) (vector #f res)) (begin #;(common:telemetry-log (conc "api-out:"(->string cmd)) payload: `((params . ,params) (ok-res . #f))) (vector #t res)))))))) ;; http-server send-response ;; api:process-request ;; db:* |
︙ | ︙ |
Modified common.scm from [a82c407907] to [5791123b30].
︙ | ︙ | |||
3544 3545 3546 3547 3548 3549 3550 | (begin (debug:print 0 *default-log-port* "joining threads failed. exn=" exn) #t) ;; just ignore it, it might have died in the meantime so joining it will throw an exception (thread-join! thread)) ))) (hash-table-keys *common:thread-punchlist*))) | | | | | | 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 | (begin (debug:print 0 *default-log-port* "joining threads failed. exn=" exn) #t) ;; just ignore it, it might have died in the meantime so joining it will throw an exception (thread-join! thread)) ))) (hash-table-keys *common:thread-punchlist*))) #;(define *common:telemetry-log-state* 'startup) #;(define *common:telemetry-log-socket* #f) #;(define (common:telemetry-log-open) (if (eq? *common:telemetry-log-state* 'startup) (let* ((serverhost (configf:lookup *configdat* "telemetry" "host")) (serverport (configf:lookup-number *configdat* "telemetry" "port")) (user (or (get-environment-variable "USER") "unknown")) (host (or (get-environment-variable "HOST") "unknown"))) (set! *common:telemetry-log-state* (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "common-telemetry-log open udp port failure") 'broken) (if (and serverhost serverport user host) (let* ((s (udp-open-socket))) ;;(udp-bind! s #f 0) (udp-connect! s serverhost serverport) (set! *common:telemetry-log-socket* s) 'open) 'not-needed)))))) #;(define (common:telemetry-log event #!key (payload '())) (if (eq? *common:telemetry-log-state* 'startup) (common:telemetry-log-open)) (if (eq? 'open *common:telemetry-log-state*) (handle-exceptions exn (begin |
︙ | ︙ | |||
3594 3595 3596 3597 3598 3599 3600 | (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda () (pp payload)))))) (msg (conc user":"host":"start":"(current-process-id)":"(car (argv))":" toppath":"payload-serialized))) (udp-send *common:telemetry-log-socket* msg)))))) | | | 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 | (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda () (pp payload)))))) (msg (conc user":"host":"start":"(current-process-id)":"(car (argv))":" toppath":"payload-serialized))) (udp-send *common:telemetry-log-socket* msg)))))) #;(define (common:telemetry-log-close) (when (or (member *common:telemetry-log-state* '(broken-or-no-server-preclose open)) *common:telemetry-log-socket*) (handle-exceptions exn (begin (define *common:telemetry-log-state* 'closed-fail) (debug:print-info 0 *default-log-port* "common-telemetry-log closure failure") ) (begin (define *common:telemetry-log-state* 'closed) (udp-close-socket *common:telemetry-log-socket*) (set! *common:telemetry-log-socket* #f))))) |
Modified launch.scm from [d0067277fa] to [4cbc0e59d3].
︙ | ︙ | |||
209 210 211 212 213 214 215 | ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) | | | 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 | ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds |
︙ | ︙ | |||
231 232 233 234 235 236 237 | (do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) | | | | | 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 | (do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? 
#f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) (when do-sync ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) #;(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second |
︙ | ︙ |
Modified mt.scm from [e9055c2687] to [283ae4be89].
︙ | ︙ | |||
100 101 102 103 104 105 106 | (if last-time (< (current-seconds)(+ last-time 5)) #f)))) (if useres (let ((result (vector-ref res 1))) (debug:print 4 *default-log-port* "Using lazy value res: " result) result) | | | 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | (if last-time (< (current-seconds)(+ last-time 5)) #f)))) (if useres (let ((result (vector-ref res 1))) (debug:print 4 *default-log-port* "Using lazy value res: " result) result) (let ((newres (rmt:get-prereqs-not-met run-id waitons ref-item-path mode itemmaps))) (hash-table-set! *pre-reqs-met-cache* key (vector (current-seconds) newres)) newres)))) (define (mt:get-run-stats dbstruct run-id) ;; Get run stats from local access, move this ... but where? (db:get-run-stats dbstruct run-id)) |
︙ | ︙ |
Modified rmt.scm from [39d97c528a] to [46b655b2eb].
︙ | ︙ | |||
656 657 658 659 660 661 662 | (define (rmt:test-get-paths-matching-keynames-target-new keynames target res testpatt statepatt statuspatt runname) (let ((run-ids (rmt:get-run-ids-matching-target keynames target res runname testpatt statepatt statuspatt))) (apply append (map (lambda (run-id) (rmt:send-receive 'test-get-paths-matching-keynames-target-new run-id (list run-id keynames target res testpatt statepatt statuspatt runname))) run-ids)))) | > | | 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 | (define (rmt:test-get-paths-matching-keynames-target-new keynames target res testpatt statepatt statuspatt runname) (let ((run-ids (rmt:get-run-ids-matching-target keynames target res runname testpatt statepatt statuspatt))) (apply append (map (lambda (run-id) (rmt:send-receive 'test-get-paths-matching-keynames-target-new run-id (list run-id keynames target res testpatt statepatt statuspatt runname))) run-ids)))) ;; NOTE: rmt functions can NEVER have key params as they might be called as local (define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path mode itemmaps) ;; #!key (mode '(normal))(itemmaps #f)) (rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps))) (define (rmt:get-count-tests-running-for-run-id run-id fastmode) (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id fastmode))) (define (rmt:get-not-completed-cnt run-id) (rmt:send-receive 'get-not-completed-cnt run-id (list run-id))) |
︙ | ︙ |
Modified runs.scm from [7d76065ab9] to [4de72ed2f2].
︙ | ︙ | |||
831 832 833 834 835 836 837 | ;; => review of a previously seen test is higher priority of never visited test ;; reg - list of previously visited tests ;; tal - list of never visited tests ;; prefer next hed to be from reg than tal. (define runs:nothing-left-in-queue-count 0) | | | | 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 | ;; => review of a previously seen test is higher priority of never visited test ;; reg - list of previously visited tests ;; tal - list of never visited tests ;; prefer next hed to be from reg than tal. (define runs:nothing-left-in-queue-count 0) (define (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path #!key (mode '(normal))(itemmaps #f)) ;; mode: testmode itemmaps: itemmaps) (if (and (runs:testdat-prereqs-not-met testdat) (< (- (current-seconds) (runs:testdat-last-update testdat)) 10)) ;; only refresh for this test if it has been at least 10 seconds (runs:testdat-prereqs-not-met testdat) (let* ((res (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode itemmaps))) (if (list? res) res (begin (debug:print 0 *default-log-port* "ERROR: rmt:get-prereqs-not-met returned non-list!\n" " res=" res " run-id=" run-id " waitons=" waitons " hed=" hed " item-path=" item-path " testmode=" mode " itemmaps=" itemmaps) '()))))) |
︙ | ︙ | |||
860 861 862 863 864 865 866 | ;; (tal (cdr sorted-test-names)) ;; (reg '()) ;; registered, put these at the head of tal ;; (reruns '())) (define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps testdat) (let* ((loop-list (list hed tal reg reruns)) (prereqs-not-met (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) | < < < < < < < < < | 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 | ;; (tal (cdr sorted-test-names)) ;; (reg '()) ;; registered, put these at the head of tal ;; (reruns '())) (define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps testdat) (let* ((loop-list (list hed tal reg reruns)) (prereqs-not-met (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) (have-itemized (not (null? (lset-intersection eq? testmode '(itemmatch itemwait))))) (fails (runs:calc-fails prereqs-not-met)) (prereq-fails (runs:calc-prereq-fail prereqs-not-met)) (non-completed (runs:calc-not-completed prereqs-not-met)) (runnables (runs:calc-runnable prereqs-not-met)) (unexpanded-prereqs (filter (lambda (testname) (let* ((test-rec (hash-table-ref test-records testname)) |
︙ | ︙ | |||
1115 1116 1117 1118 1119 1120 1121 | (run-limits-info (runs:dat-can-run-more-tests runsdat)) ;; (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running (have-resources (car run-limits-info)) (num-running (list-ref run-limits-info 1)) (num-running-in-jobgroup(list-ref run-limits-info 2)) (max-concurrent-jobs (list-ref run-limits-info 3)) (job-group-limit (list-ref run-limits-info 4)) | < < | 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 | (run-limits-info (runs:dat-can-run-more-tests runsdat)) ;; (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running (have-resources (car run-limits-info)) (num-running (list-ref run-limits-info 1)) (num-running-in-jobgroup(list-ref run-limits-info 2)) (max-concurrent-jobs (list-ref run-limits-info 3)) (job-group-limit (list-ref run-limits-info 4)) (fails (if (list? prereqs-not-met) ;; TODO: rename fails to failed-prereqs (runs:calc-fails prereqs-not-met) (begin (debug:print-error 0 *default-log-port* "prereqs-not-met is not a list! " prereqs-not-met) '()))) (non-completed (filter (lambda (x) ;; remove hed from not completed list, duh, of course it is not completed! (not (equal? x hed))) |
︙ | ︙ | |||
1516 1517 1518 1519 1520 1521 1522 | registry-mutex: registry-mutex flags: flags keyvals: keyvals run-info: run-info ;; newtal: newtal all-tests-registry: all-tests-registry ;; itemmaps: itemmaps | < | 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 | registry-mutex: registry-mutex flags: flags keyvals: keyvals run-info: run-info ;; newtal: newtal all-tests-registry: all-tests-registry ;; itemmaps: itemmaps ;; can-run-more-tests: (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) ;; look at the test jobgroup and tot jobs running ))) ;; Initialize the test-registery hash with tests that already have a record ;; convert state to symbol and use that as the hash value (for-each (lambda (trec) (let ((id (db:test-get-id trec)) |
︙ | ︙ |
Modified tests.scm from [0094b671e6] to [6adeda2896].
︙ | ︙ | |||
19 20 21 22 23 24 25 | ;;====================================================================== ;;====================================================================== ;; Tests ;;====================================================================== (declare (unit tests)) | | | 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 | ;;====================================================================== ;;====================================================================== ;; Tests ;;====================================================================== (declare (unit tests)) ;;(declare (uses lock-queue)) (declare (uses db)) (declare (uses tdb)) (declare (uses common)) ;; (declare (uses dcommon)) ;; needed for the steps processing (declare (uses items)) (declare (uses runconfig)) ;; (declare (uses sdb)) |
︙ | ︙ |