︙ | | | ︙ | |
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
|
(round
(-
(current-seconds)
start-seconds)))))
(kill-tries 0))
;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
(tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
(let loop ((minutes (calc-minutes))
(cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
(disk-free (get-df (current-directory)))
(last-sync (current-seconds)))
(common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
(let* ((over-time (> (current-seconds) (+ last-sync update-period)))
(new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
(delta (abs (- load cpu-load))))
(if (> delta 0.1) ;; don't bother updating with small changes
load
#f)))
(new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds
|
|
|
>
>
|
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
|
(round
(-
(current-seconds)
start-seconds)))))
(kill-tries 0))
;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
(tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t)
(let loop ((minutes (calc-minutes))
(cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
(disk-free (get-df (current-directory)))
(last-sync (current-seconds)))
;; (common:telemetry-log "zombie" (conc "launch:monitor-job -
;; top of loop encountered at "(current-seconds)" with
;; last-sync="last-sync))
(let* ((over-time (> (current-seconds) (+ last-sync update-period)))
(new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
(delta (abs (- load cpu-load))))
(if (> delta 0.1) ;; don't bother updating with small changes
load
#f)))
(new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds
|
︙ | | | ︙ | |
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
|
(do-sync (or new-cpu-load new-disk-free over-time))
(test-info (rmt:get-test-info-by-id run-id test-id))
(state (db:test-get-state test-info))
(status (db:test-get-status test-info))
(kill-reason "no kill reason specified")
(kill-job? #f))
(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
(cond
((test-get-kill-request run-id test-id)
(set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
(set! kill-job? #t))
((and runtlim (> (- (current-seconds) start-seconds) runtlim))
(set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
(set! kill-job? #t))
((equal? status "DEAD")
(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
(rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
(set! kill-job? #f)))
(debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
(launch:handle-zombie-tests run-id)
(when do-sync
;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds)))
(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
(if kill-job?
(begin
(debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
(mutex-lock! m)
;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
;; section and the runit section? Or add a loop that tries three times with a 1/4 second
;; between tries?
|
|
|
|
<
<
<
|
<
<
|
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
|
(do-sync (or new-cpu-load new-disk-free over-time))
(test-info (rmt:get-test-info-by-id run-id test-id))
(state (db:test-get-state test-info))
(status (db:test-get-status test-info))
(kill-reason "no kill reason specified")
(kill-job? #f))
#;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
(cond
((test-get-kill-request run-id test-id)
(set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
(set! kill-job? #t))
((and runtlim (> (- (current-seconds) start-seconds) runtlim))
(set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
(set! kill-job? #t))
((equal? status "DEAD")
(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t)
(rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
(set! kill-job? #f)))
(debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
(launch:handle-zombie-tests run-id)
(if do-sync ;; save meta data about the running of this test
(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
(if kill-job?
(begin
(debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
(mutex-lock! m)
;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
;; section and the runit section? Or add a loop that tries three times with a 1/4 second
;; between tries?
|
︙ | | | ︙ | |
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
|
(begin
(thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
(if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
(loop (calc-minutes)
(or new-cpu-load cpu-load)
(or new-disk-free disk-free)
(if do-sync (current-seconds) last-sync)))))))
(tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
(define (launch:execute encoded-cmd)
(let* ((cmdinfo (common:read-encoded-string encoded-cmd))
(tconfigreg #f))
(setenv "MT_CMDINFO" encoded-cmd)
;;(bb-check-path msg: "launch:execute incoming")
|
|
|
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
|
(begin
(thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
(if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
(loop (calc-minutes)
(or new-cpu-load cpu-load)
(or new-disk-free disk-free)
(if do-sync (current-seconds) last-sync)))))))
(tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional
(define (launch:execute encoded-cmd)
(let* ((cmdinfo (common:read-encoded-string encoded-cmd))
(tconfigreg #f))
(setenv "MT_CMDINFO" encoded-cmd)
;;(bb-check-path msg: "launch:execute incoming")
|
︙ | | | ︙ | |
463
464
465
466
467
468
469
470
471
472
473
474
475
476
|
(db:test-get-host test-info)
(begin
(debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.")
(exit))))
(test-pid (db:test-get-process_id test-info)))
(cond
;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag.
((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
(debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request")
;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
(rmt:general-call 'set-test-start-time #f test-id)
(rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
) ;; prime it for running
|
>
>
>
|
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
|
(db:test-get-host test-info)
(begin
(debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.")
(exit))))
(test-pid (db:test-get-process_id test-info)))
(cond
;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag.
;;((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
;; (and (equal? (db:test-get-state test-info) "COMPLETED") ;; completed/abort => rerun if asked
;; (member (db:test-get-status test-info) '("ABORT"))))
((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
(debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request")
;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
(rmt:general-call 'set-test-start-time #f test-id)
(rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
) ;; prime it for running
|
︙ | | | ︙ | |
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
|
;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup
;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na
;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
;; 0 RUNNING ==> this is actually the first condition, should not get here
(define (launch:end-of-run-check run-id )
(let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))
(running-cnt (rmt:get-count-tests-running-for-run-id run-id #f)) ;; fastmode=no
(all-test-launched (rmt:get-var (conc "lunch-complete-" run-id)))
(current-state (rmt:get-run-state run-id))
(current-status (rmt:get-run-status run-id)))
;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing
(debug:print 0 *default-log-port* "Running test cnt :" running-cnt)
(rmt:set-state-status-and-roll-up-run run-id current-state current-status)
(runs:update-junit-test-reporter-xml run-id)
|
|
|
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
|
;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup
;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na
;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
;; 0 RUNNING ==> this is actually the first condition, should not get here
(define (launch:end-of-run-check run-id )
(let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))
(running-cnt (rmt:get-count-tests-running-for-run-id run-id))
(all-test-launched (rmt:get-var (conc "lunch-complete-" run-id)))
(current-state (rmt:get-run-state run-id))
(current-status (rmt:get-run-status run-id)))
;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing
(debug:print 0 *default-log-port* "Running test cnt :" running-cnt)
(rmt:set-state-status-and-roll-up-run run-id current-state current-status)
(runs:update-junit-test-reporter-xml run-id)
|
︙ | | | ︙ | |
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
|
(item-path (vector-ref running-test 11)))
(debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed")
(if (not (null? tal))
(loop (car tal) (cdr tal)))))))))))
(define (launch:is-test-alive host pid)
(if (and host pid (not (equal? host "n/a")))
(let* ((cmd (conc "ssh " host " pstree -A " pid))
(output (with-input-from-pipe cmd read-lines)))
(debug:print 2 *default-log-port* "Running " cmd " received " output)
(if (eq? (length output) 0)
#f
#t))
#t))
|
>
>
|
|
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
|
(item-path (vector-ref running-test 11)))
(debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed")
(if (not (null? tal))
(loop (car tal) (cdr tal)))))))))))
(define (launch:is-test-alive host pid)
(if (and host pid (not (equal? host "n/a")))
(let* ((is-local (equal? host (get-host-name)))
(ssh-cmd (if is-local " " (conc "ssh " host " ")))
(cmd (conc ssh-cmd "pstree -A " pid))
(output (with-input-from-pipe cmd read-lines)))
(debug:print 2 *default-log-port* "Running " cmd " received " output)
(if (eq? (length output) 0)
#f
#t))
#t))
|
︙ | | | ︙ | |