Overview
Comment: | shuffling stuff around. Currently fails to run: (rmt:get-runs-by-patt '(contour_name release iteration testsuite_mode) #f all/v1.70/1.7001/dev 0 500 #f 0 asc) but does run (db:get-runs-by-patt *db* '(contour_name release iteration testsuite_mode) % all/v1.70/1.7001/dev 0 500 #f 0 asc) |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.70-refactor01 | v1.70-defunct-try |
Files: | files | file ages | folders |
SHA1: |
ba4a9f34c8f708ad1d28fc18805a4362 |
User & Date: | matt on 2019-12-11 22:14:49 |
Other Links: | branch diff | manifest | tags |
Context
2019-12-12
| ||
20:58 | Added or to line 2464 in runs-inc.scm to keep going if no runname. This bypassess the real issue where runname is being lost. Also bypassed all server/network code. check-in: 3f5da1ddca user: matt tags: v1.70-refactor01, v1.70-defunct-try | |
2019-12-11
| ||
22:14 | shuffling stuff around. Currently fails to run: (rmt:get-runs-by-patt '(contour_name release iteration testsuite_mode) #f all/v1.70/1.7001/dev 0 500 #f 0 asc) but does run (db:get-runs-by-patt *db* '(contour_name release iteration testsuite_mode) % all/v1.70/1.7001/dev 0 500 #f 0 asc) check-in: ba4a9f34c8 user: matt tags: v1.70-refactor01, v1.70-defunct-try | |
21:57 | shuffling stuff around. Currently fails to run: (rmt:get-runs-by-patt '(contour_name release iteration testsuite_mode) #f all/v1.70/1.7001/dev 0 500 #f 0 asc) check-in: 9dfe7d1083 user: matt tags: v1.70-refactor01, v1.70-defunct-try | |
Changes
Modified api-inc.scm from [7dfa5ca729] to [d2c2cccd89].
︙ | ︙ | |||
154 155 156 157 158 159 160 | cmd-in (string->symbol cmd-in))) (params (vector-ref dat 1)) (start-t (current-milliseconds)) (readonly-mode (dbr:dbstruct-read-only dbstruct)) (readonly-command (member cmd api:read-only-queries)) (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) | | | 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 | cmd-in (string->symbol cmd-in))) (params (vector-ref dat 1)) (start-t (current-milliseconds)) (readonly-mode (dbr:dbstruct-read-only dbstruct)) (readonly-command (member cmd api:read-only-queries)) (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) #;(foo (begin (common:telemetry-log (conc "api-in:"(->string cmd)) payload: `((params . ,params))) #t)) (res (if writecmd-in-readonly-mode (conc "attempt to run write command "cmd" on a read-only database") |
︙ | ︙ | |||
343 344 345 346 347 348 349 | ;; save all stats (let ((delta-t (- (current-milliseconds) start-t))) (hash-table-set! *db-api-call-time* cmd (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if writecmd-in-readonly-mode (begin | | | | 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 | ;; save all stats (let ((delta-t (- (current-milliseconds) start-t))) (hash-table-set! *db-api-call-time* cmd (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if writecmd-in-readonly-mode (begin #;(common:telemetry-log (conc "api-out:"(->string cmd)) payload: `((params . ,params) (ok-res . #t))) (vector #f res)) (begin #;(common:telemetry-log (conc "api-out:"(->string cmd)) payload: `((params . ,params) (ok-res . #f))) (vector #t res)))))))) ;; http-server send-response ;; api:process-request ;; db:* |
︙ | ︙ |
Modified common-inc.scm from [bede31db09] to [7c8ac9f1a4].
︙ | ︙ | |||
2390 2391 2392 2393 2394 2395 2396 | (handle-exceptions exn #t ;; just ignore it, it might have died in the meantime so joining it will throw an exception (thread-join! thread)) ))) (hash-table-keys *common:thread-punchlist*))) | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | < > | | | | | | | | | | | | | | | | | | | | < > | | | | | 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 | (handle-exceptions exn #t ;; just ignore it, it might have died in the meantime so joining it will throw an exception (thread-join! thread)) ))) (hash-table-keys *common:thread-punchlist*))) ;; (define *common:telemetry-log-state* 'startup) ;; (define *common:telemetry-log-socket* #f) ;; ;; (define (common:telemetry-log-open) ;; (if (eq? *common:telemetry-log-state* 'startup) ;; (let* ((serverhost (configf:lookup *configdat* "telemetry" "host")) ;; (serverport (configf:lookup-number *configdat* "telemetry" "port")) ;; (user (or (get-environment-variable "USER") "unknown")) ;; (host (or (get-environment-variable "HOST") "unknown"))) ;; (set! *common:telemetry-log-state* ;; (handle-exceptions ;; exn ;; (begin ;; (debug:print-info 0 *default-log-port* "common-telemetry-log open udp port failure") ;; 'broken) ;; (if (and serverhost serverport user host) ;; (let* ((s (udp-open-socket))) ;; ;;(udp-bind! s #f 0) ;; (udp-connect! s serverhost serverport) ;; (set! *common:telemetry-log-socket* s) ;; 'open) ;; 'not-needed)))))) ;; ;; (define (common:telemetry-log event #!key (payload '())) ;; (if (eq? *common:telemetry-log-state* 'startup) ;; (common:telemetry-log-open)) ;; ;; (if (eq? 'open *common:telemetry-log-state*) ;; (handle-exceptions ;; exn ;; (begin ;; (debug:print-info 0 *default-log-port* "common-telemetry-log comms failure ; disabled (no server?)") ;; ;;(define *common:telemetry-log-state* 'broken-or-no-server-preclose) ;; ;;(common:telemetry-log-close) ;; (define *common:telemetry-log-state* 'broken-or-no-server) ;; (set! *common:telemetry-log-socket* #f) ;; ) ;; (if (and *common:telemetry-log-socket* event) ;; TODO - filter on event against telemetry.want-events ;; (let* ((user (or (get-environment-variable "USER") "unknown")) ;; (host (or (get-environment-variable "HOST") "unknown")) ;; (start (conc "[megatest "event"]")) ;; (toppath (or *toppath* "/dev/null")) ;; (payload-serialized ;; (base64:base64-encode ;; (z3:encode-buffer ;; (with-output-to-string (lambda () (pp payload)))))) ;; (msg (conc user":"host":"start":"(current-process-id)":"(car (argv))":" ;; toppath":"payload-serialized))) ;; (udp-send *common:telemetry-log-socket* msg)))))) ;; ;; (define (common:telemetry-log-close) ;; (when (or (member *common:telemetry-log-state* '(broken-or-no-server-preclose open)) *common:telemetry-log-socket*) ;; (handle-exceptions ;; exn ;; (begin ;; (define *common:telemetry-log-state* 'closed-fail) ;; (debug:print-info 0 *default-log-port* "common-telemetry-log closure failure") ;; ) ;; (begin ;; (define *common:telemetry-log-state* 'closed) ;; (udp-close-socket *common:telemetry-log-socket*) ;; (set! *common:telemetry-log-socket* #f))))) |
Modified launch-inc.scm from [476c5d50a8] to [1f775f156c].
︙ | ︙ | |||
365 366 367 368 369 370 371 | ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) | | | 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 | ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) #;(common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds |
︙ | ︙ | |||
387 388 389 390 391 392 393 | (do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) | | | | | 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 | (do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) (when do-sync ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) #;(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) #;(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second |
︙ | ︙ |
Modified rmt-inc.scm from [4a6d831f46] to [15a54ab90a].
︙ | ︙ | |||
70 71 72 73 74 75 76 | (client:setup areapath) #f)))) (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected | | | 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 | (client:setup areapath) #f)))) (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected #;(common:telemetry-log (conc "rmt:"(->string cmd)) payload: `((rid . ,rid) (params . ,params))) ;; do all the prep locked under the rmt-mutex (mutex-lock! *rmt-mutex*) ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote |
︙ | ︙ |
Modified runs-inc.scm from [db274d97cc] to [8c50307d5e].
︙ | ︙ | |||
364 365 366 367 368 369 370 | (debug:print-info 0 *default-log-port* "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests) ));; tests will be ANDed with this list ;; register this run in monitor.db (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params) (rmt:tasks-set-state-given-param-key task-key "running") | | | 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 | (debug:print-info 0 *default-log-port* "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests) ));; tests will be ANDed with this list ;; register this run in monitor.db (rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params) (rmt:tasks-set-state-given-param-key task-key "running") #;(common:telemetry-log "run-tests" payload: `( (target . ,target) (run-name . ,runname) (test-patts . ,test-patts) ) ) ;; Now generate all the tests lists |
︙ | ︙ |