Overview
Context
Changes
Modified api-inc.scm
from [7dfa5ca729]
to [d2c2cccd89].
︙ | | |
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
|
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
|
-
+
|
cmd-in
(string->symbol cmd-in)))
(params (vector-ref dat 1))
(start-t (current-milliseconds))
(readonly-mode (dbr:dbstruct-read-only dbstruct))
(readonly-command (member cmd api:read-only-queries))
(writecmd-in-readonly-mode (and readonly-mode (not readonly-command)))
(foo (begin
#;(foo (begin
(common:telemetry-log (conc "api-in:"(->string cmd))
payload: `((params . ,params)))
#t))
(res
(if writecmd-in-readonly-mode
(conc "attempt to run write command "cmd" on a read-only database")
|
︙ | | |
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
|
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
|
-
+
-
+
|
;; save all stats
(let ((delta-t (- (current-milliseconds)
start-t)))
(hash-table-set! *db-api-call-time* cmd
(cons delta-t (hash-table-ref/default *db-api-call-time* cmd '()))))
(if writecmd-in-readonly-mode
(begin
(common:telemetry-log (conc "api-out:"(->string cmd))
#;(common:telemetry-log (conc "api-out:"(->string cmd))
payload: `((params . ,params)
(ok-res . #t)))
(vector #f res))
(begin
(common:telemetry-log (conc "api-out:"(->string cmd))
#;(common:telemetry-log (conc "api-out:"(->string cmd))
payload: `((params . ,params)
(ok-res . #f)))
(vector #t res))))))))
;; http-server send-response
;; api:process-request
;; db:*
|
︙ | | |
Modified common-inc.scm
from [b0ac284799]
to [9909c0819b].
︙ | | |
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
|
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
|
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
+
+
+
+
+
|
(handle-exceptions
exn
#t ;; just ignore it, it might have died in the meantime so joining it will throw an exception
(thread-join! thread))
)))
(hash-table-keys *common:thread-punchlist*)))
;; Lifecycle state of the telemetry logger. Values assigned by the code in this file:
;;   'startup              - initial value; no open attempt has been made yet
;;   'open                 - a UDP socket was connected and stored
;;   'not-needed           - the telemetry host or port lookup returned false
;;   'broken               - opening/connecting the socket raised an exception
;;   'broken-or-no-server  - sending a message raised an exception
;;   'closed, 'closed-fail - results of common:telemetry-log-close
(define *common:telemetry-log-state* 'startup)
;; The connected UDP socket used for telemetry, or #f when there is none.
(define *common:telemetry-log-socket* #f)
;; Open the telemetry UDP socket, once.
;;
;; Does nothing unless *common:telemetry-log-state* is 'startup. Reads the
;; "telemetry" section of *configdat* for "host" and "port" and then sets
;; *common:telemetry-log-state* to:
;;   'open        - socket created, connected and saved in *common:telemetry-log-socket*
;;   'not-needed  - host or port was not configured
;;   'broken      - creating or connecting the socket raised an exception
;;
;; Returns an unspecified value.
(define (common:telemetry-log-open)
  (when (eq? *common:telemetry-log-state* 'startup)
    (let ((serverhost (configf:lookup *configdat* "telemetry" "host"))
          (serverport (configf:lookup-number *configdat* "telemetry" "port"))
          (sock       #f)) ;; remembered outside the handler so a half-opened socket can be released
      (set! *common:telemetry-log-state*
            (handle-exceptions
             exn
             (begin
               (debug:print-info 0 *default-log-port* "common-telemetry-log open udp port failure")
               ;; udp-open-socket may have succeeded before udp-connect! failed;
               ;; close it (best effort) so the descriptor is not leaked.
               (when sock
                 (handle-exceptions close-exn #f (udp-close-socket sock)))
               'broken)
             (if (and serverhost serverport)
                 (begin
                   (set! sock (udp-open-socket))
                   ;;(udp-bind! sock #f 0)
                   (udp-connect! sock serverhost serverport)
                   (set! *common:telemetry-log-socket* sock)
                   'open)
                 'not-needed))))))
;; Send one telemetry event as a single UDP message.
;;
;;   event   - event name; spliced into the message as "[megatest <event>]".
;;             Nothing is sent when event is #f.
;;   payload - keyword argument (default '()); pretty-printed, compressed with
;;             z3:encode-buffer and base64 encoded before sending.
;;
;; The first call triggers common:telemetry-log-open. Messages are only sent
;; while *common:telemetry-log-state* is 'open. If building or sending the
;; message raises an exception, telemetry is disabled by moving the state to
;; 'broken-or-no-server and dropping the socket.
;;
;; Message layout (colon separated):
;;   user:host:[megatest event]:pid:argv0:toppath:payload
(define (common:telemetry-log event #!key (payload '()))
  (if (eq? *common:telemetry-log-state* 'startup)
      (common:telemetry-log-open))

  (if (eq? 'open *common:telemetry-log-state*)
      (handle-exceptions
       exn
       (begin
         (debug:print-info 0 *default-log-port* "common-telemetry-log comms failure ; disabled (no server?)")
         ;; Must be set!, not define: the goal is to update the global state
         ;; variable. A define in expression context is not a portable way to
         ;; assign an existing variable.
         (set! *common:telemetry-log-state* 'broken-or-no-server)
         ;; Drop the global reference first, then release the socket (best
         ;; effort) so the descriptor is not leaked once nobody can reach it.
         (let ((sock *common:telemetry-log-socket*))
           (set! *common:telemetry-log-socket* #f)
           (when sock
             (handle-exceptions close-exn #f (udp-close-socket sock)))))
       (if (and *common:telemetry-log-socket* event) ;; TODO - filter on event against telemetry.want-events
           (let* ((user    (or (get-environment-variable "USER") "unknown"))
                  (host    (or (get-environment-variable "HOST") "unknown"))
                  (start   (conc "[megatest "event"]"))
                  (toppath (or *toppath* "/dev/null"))
                  (payload-serialized
                   (base64:base64-encode
                    (z3:encode-buffer
                     (with-output-to-string (lambda () (pp payload))))))
                  (msg     (conc user":"host":"start":"(current-process-id)":"(car (argv))":"
                                 toppath":"payload-serialized)))
             (udp-send *common:telemetry-log-socket* msg))))))
;; Close the telemetry UDP socket.
;;
;; Acts when the state is 'open or 'broken-or-no-server-preclose, or whenever
;; a socket is still stored in *common:telemetry-log-socket*. On success the
;; state becomes 'closed and the stored socket is cleared to #f. If closing
;; raises an exception the state becomes 'closed-fail and the failure is logged.
(define (common:telemetry-log-close)
  (when (or (member *common:telemetry-log-state* '(broken-or-no-server-preclose open))
            *common:telemetry-log-socket*)
    (handle-exceptions
     exn
     (begin
       ;; set!, not define: this must update the global state variable.
       (set! *common:telemetry-log-state* 'closed-fail)
       (debug:print-info 0 *default-log-port* "common-telemetry-log closure failure"))
     (begin
       ;; State is marked 'closed before the close call; if udp-close-socket
       ;; raises, the handler above overwrites it with 'closed-fail.
       (set! *common:telemetry-log-state* 'closed)
       (udp-close-socket *common:telemetry-log-socket*)
       (set! *common:telemetry-log-socket* #f)))))
;; )
;; (begin
;; (define *common:telemetry-log-state* 'closed)
;; (udp-close-socket *common:telemetry-log-socket*)
;; (set! *common:telemetry-log-socket* #f)))))
|
Modified launch-inc.scm
from [476c5d50a8]
to [1f775f156c].
︙ | | |
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
|
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
|
-
+
|
;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
(tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
(let loop ((minutes (calc-minutes))
(cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
(disk-free (get-df (current-directory)))
(last-sync (current-seconds)))
(common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
#;(common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
(let* ((over-time (> (current-seconds) (+ last-sync update-period)))
(new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
(delta (abs (- load cpu-load))))
(if (> delta 0.1) ;; don't bother updating with small changes
load
#f)))
(new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds
|
︙ | | |
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
|
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
|
-
+
-
+
-
+
|
(do-sync (or new-cpu-load new-disk-free over-time))
(test-info (rmt:get-test-info-by-id run-id test-id))
(state (db:test-get-state test-info))
(status (db:test-get-status test-info))
(kill-reason "no kill reason specified")
(kill-job? #f))
(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
#;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
(cond
((test-get-kill-request run-id test-id)
(set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
(set! kill-job? #t))
((and runtlim (> (- (current-seconds) start-seconds) runtlim))
(set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
(set! kill-job? #t))
((equal? status "DEAD")
(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
(rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
(set! kill-job? #f)))
(debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
(launch:handle-zombie-tests run-id)
(when do-sync
;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds)))
#;(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds)))
(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
#;(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
(if kill-job?
(begin
(debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
(mutex-lock! m)
;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
;; section and the runit section? Or add a loop that tries three times with a 1/4 second
|
︙ | | |
Modified rmt-inc.scm
from [4a6d831f46]
to [15a54ab90a].
︙ | | |
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
|
-
+
|
(client:setup areapath)
#f))))
(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected
(common:telemetry-log (conc "rmt:"(->string cmd))
#;(common:telemetry-log (conc "rmt:"(->string cmd))
payload: `((rid . ,rid)
(params . ,params)))
;; do all the prep locked under the rmt-mutex
(mutex-lock! *rmt-mutex*)
;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
|
︙ | | |
Modified runs-inc.scm
from [cb209f098e]
to [96438abf0b].
︙ | | |
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
|
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
|
-
+
|
(debug:print-info 0 *default-log-port* "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests)
));; tests will be ANDed with this list
;; register this run in monitor.db
(rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params)
(rmt:tasks-set-state-given-param-key task-key "running")
(common:telemetry-log "run-tests"
#;(common:telemetry-log "run-tests"
payload:
`( (target . ,target)
(run-name . ,runname)
(test-patts . ,test-patts) ) )
;; Now generate all the tests lists
|
︙ | | |