︙ | | | ︙ | |
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
|
(import (prefix base64 base64:))
(import (prefix sqlite3 sqlite3:))
(declare (unit launch))
(declare (uses subrun))
(declare (uses common))
(declare (uses configf))
(declare (uses db))
(declare (uses ezsteps))
(include "common_records.scm")
(include "key_records.scm")
(include "db_records.scm")
(include "megatest-fossil-hash.scm")
;;======================================================================
;; ezsteps
;;======================================================================
;; ezsteps were going to be coded as
;; stepname[,predstep1,predstep2 ...] [{VAR1=first,second,third}] command to execute
|
>
>
>
>
>
|
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
|
(import (prefix base64 base64:))
(import (prefix sqlite3 sqlite3:))
(declare (unit launch))
(declare (uses subrun))
(declare (uses common))
(declare (uses commonmod))
(declare (uses configf))
(declare (uses db))
(declare (uses ezsteps))
(declare (uses dbfile))
(include "common_records.scm")
(include "key_records.scm")
(include "db_records.scm")
(include "megatest-fossil-hash.scm")
(import commonmod
dbfile)
;;======================================================================
;; ezsteps
;;======================================================================
;; ezsteps were going to be coded as
;; stepname[,predstep1,predstep2 ...] [{VAR1=first,second,third}] command to execute
|
︙ | | | ︙ | |
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
|
(debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep)))
(file-close status-file)
)
))))))
(define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)
(let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30")))
(start-seconds (current-seconds))
(calc-minutes (lambda ()
(inexact->exact
(round
(-
(current-seconds)
start-seconds)))))
|
|
|
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
|
(debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep)))
(file-close status-file)
)
))))))
(define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)
(let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "60")))
(start-seconds (current-seconds))
(calc-minutes (lambda ()
(inexact->exact
(round
(-
(current-seconds)
start-seconds)))))
|
︙ | | | ︙ | |
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
|
(delta (abs (- df disk-free))))
(if (and (> df 0)
(> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
df
#f)))
(do-sync (or new-cpu-load new-disk-free over-time))
(test-info (rmt:get-test-info-by-id run-id test-id))
(state (db:test-get-state test-info))
(status (db:test-get-status test-info))
(kill-reason "no kill reason specified")
(kill-job? #f))
;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
(cond
((test-get-kill-request run-id test-id)
(set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
(set! kill-job? #t))
((and runtlim (> (- (current-seconds) start-seconds) runtlim))
(set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
(set! kill-job? #t))
((equal? status "DEAD")
(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
(rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
(set! kill-job? #f)))
(debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
(launch:handle-zombie-tests run-id)
(when do-sync
;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds)))
(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))
)
|
|
|
|
>
|
|
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
|
(delta (abs (- df disk-free))))
(if (and (> df 0)
(> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
df
#f)))
(do-sync (or new-cpu-load new-disk-free over-time))
(test-info (rmt:get-test-state-status-by-id run-id test-id))
(state (car test-info));; (db:test-get-state test-info))
(status (cdr test-info));; (db:test-get-status test-info))
(kill-reason "no kill reason specified")
(kill-job? #f))
;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
(cond
((test-get-kill-request run-id test-id)
(set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
(set! kill-job? #t))
((and runtlim (> (- (current-seconds) start-seconds) runtlim))
(set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
(set! kill-job? #t))
((equal? status "DEAD")
(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
(rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
(set! kill-job? #f)))
(debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
(if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty
(launch:handle-zombie-tests run-id))
(when do-sync
;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds)))
(tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))
)
|
︙ | | | ︙ | |
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
|
(tests:test-set-status! run-id test-id "KILLED" "KILLED" (conc (args:get-arg "-m")" "kill-reason) #f)) ;; BB ADDED kill-reason -- confirm OK with Matt
(begin
(debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2)
(tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ;; BB ADDED kill-reason -- confirm OK with Matt
)))
(mutex-unlock! m)
;; no point in sticking around. Exit now. But run end of run before exiting?
(launch:end-of-run-check run-id)
(exit)))
(if (hash-table-ref/default misc-flags 'keep-going #f)
(begin
(thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
(if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
(loop (calc-minutes)
(or new-cpu-load cpu-load)
|
|
|
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
|
(tests:test-set-status! run-id test-id "KILLED" "KILLED" (conc (args:get-arg "-m")" "kill-reason) #f)) ;; BB ADDED kill-reason -- confirm OK with Matt
(begin
(debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2)
(tests:test-set-status! run-id test-id "KILLED" "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ;; BB ADDED kill-reason -- confirm OK with Matt
)))
(mutex-unlock! m)
;; no point in sticking around. Exit now. But run end of run before exiting?
(launch:end-of-run-check run-id)
(exit)))
(if (hash-table-ref/default misc-flags 'keep-going #f)
(begin
(thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
(if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
(loop (calc-minutes)
(or new-cpu-load cpu-load)
|
︙ | | | ︙ | |
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
|
;; if dead safe to mark the test as killed in the db
;; State/status table
;; new
;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup
;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na
;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
;; 0 RUNNING ==> this is actually the first condition, should not get here
(define (launch:end-of-run-check run-id )
(let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))
(running-cnt (rmt:get-count-tests-running-for-run-id run-id))
(all-test-launched (rmt:get-var (conc "lunch-complete-" run-id)))
(current-state (rmt:get-run-state run-id))
(current-status (rmt:get-run-status run-id)))
;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing
(debug:print 0 *default-log-port* "Running test cnt :" running-cnt)
(rmt:set-state-status-and-roll-up-run run-id current-state current-status)
(runs:update-junit-test-reporter-xml run-id)
(cond
((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" ))
(if (and (equal? (rmt:get-var (conc "end-of-run-" run-id)) "no") (common:simple-lock (conc "endOfRun" run-id)))
(begin
(debug:print 4 *default-log-port* "look for post hook. currseconds: " (current-seconds) " EOR " (rmt:get-var (conc "end-of-run-" run-id)))
(debug:print 0 *default-log-port* "End of Run Detected.")
|
|
|
>
|
|
|
|
>
>
>
>
>
>
|
>
|
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
|
;; if dead safe to mark the test as killed in the db
;; State/status table
;; new
;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup
;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na
;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
;; 0 RUNNING ==> this is actually the first condition, should not get here
(define *last-rollup* 0)
(define (launch:end-of-run-check run-id )
(let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))
(running-cnt (rmt:get-count-tests-running-for-run-id run-id))
(all-test-launched (rmt:get-var (conc "lunch-complete-" run-id)))
(current-state-status (rmt:get-run-state-status run-id))
(current-state (car current-state-status)) ;; (rmt:get-run-state run-id))
(current-status (cdr current-state-status))) ;; (rmt:get-run-status run-id)))
;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing
(debug:print 0 *default-log-port* "Running test cnt :" running-cnt)
;;
;; TODO: add a final rollup when run is done (if there isn't one already)
;;
(if (or (< running-cnt 3) ;; have only few running
(> (- (current-seconds) *last-rollup*) 10)) ;; or haven't rolled up in past ten seconds
(begin
(rmt:set-state-status-and-roll-up-run run-id current-state current-status)
(set! *last-rollup* (current-seconds))))
(runs:update-junit-test-reporter-xml run-id)
(cond
((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" ))
(if (and (equal? (rmt:get-var (conc "end-of-run-" run-id)) "no") (common:simple-lock (conc "endOfRun" run-id)))
(begin
(debug:print 4 *default-log-port* "look for post hook. currseconds: " (current-seconds) " EOR " (rmt:get-var (conc "end-of-run-" run-id)))
(debug:print 0 *default-log-port* "End of Run Detected.")
|
︙ | | | ︙ | |
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
|
(begin
(setenv "MT_RUN_AREA_HOME" *toppath*)
(setenv "MT_TESTSUITENAME" (common:get-testsuite-name)))
(begin
(debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area.")
(set! *toppath* #f) ;; force it to be false so we return #f
#f))
;; one more attempt to cache the configs for future reading
(let* ((cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig))
(mtcachef (car cachefiles))
(rccachef (cdr cachefiles)))
;; trap exception due to stale NFS handle -- Error: (open-output-file) cannot open file - Stale NFS file handle: "...somepath.../.runconfigs.cfg-1.6427-7d1e789cb3f62f9cde719a4865bb51b3c17ea853" - ticket 220546342
;; TODO - consider 1) using simple-lock to bracket cache write
|
|
>
>
>
|
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
|
(begin
(setenv "MT_RUN_AREA_HOME" *toppath*)
(setenv "MT_TESTSUITENAME" (common:get-testsuite-name)))
(begin
(debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area.")
(set! *toppath* #f) ;; force it to be false so we return #f
#f))
;; needed by various transport and db modules
(dbfile:testsuite-name (common:get-testsuite-name)) ;; (get-testsuite-name *toppath* *configdat*))
;; one more attempt to cache the configs for future reading
(let* ((cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig))
(mtcachef (car cachefiles))
(rccachef (cdr cachefiles)))
;; trap exception due to stale NFS handle -- Error: (open-output-file) cannot open file - Stale NFS file handle: "...somepath.../.runconfigs.cfg-1.6427-7d1e789cb3f62f9cde719a4865bb51b3c17ea853" - ticket 220546342
;; TODO - consider 1) using simple-lock to bracket cache write
|
︙ | | | ︙ | |
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
|
(write (list (list 'testpath test-path)
;; (list 'transport (conc *transport-type*))
;; (list 'serverinf *server-info*)
#;(list 'homehost (let* ((hhdat (server:get-homehost)))
(if hhdat
(car hhdat)
#f)))
(list 'serverurl (if *runremote*
(remote-server-url *runremote*)
#f)) ;;
(list 'areaname (common:get-testsuite-name))
(list 'toppath *toppath*)
(list 'work-area work-area)
(list 'test-name test-name)
(list 'runscript runscript)
|
|
|
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
|
(write (list (list 'testpath test-path)
;; (list 'transport (conc *transport-type*))
;; (list 'serverinf *server-info*)
#;(list 'homehost (let* ((hhdat (server:get-homehost)))
(if hhdat
(car hhdat)
#f)))
#;(list 'serverurl (if *runremote* ;; would like to add this back ... WORK NEEDED
(remote-server-url *runremote*)
#f)) ;;
(list 'areaname (common:get-testsuite-name))
(list 'toppath *toppath*)
(list 'work-area work-area)
(list 'test-name test-name)
(list 'runscript runscript)
|
︙ | | | ︙ | |