538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
|
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
|
-
+
|
;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ...
(let ((run-ids (rmt:get-all-run-ids)))
(for-each (lambda (run-id)
(if keep-going
(handle-exceptions
exn
(debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id)
(rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime)))
(rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
run-ids)))
"runs: mark-incompletes")))
(thread-start! th1)
(thread-start! th2)
(thread-join! th1)
(set! keep-going #f)
(thread-join! th2)
|
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
|
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
|
-
+
-
+
-
+
|
#f
(list (runs:queue-next-hed trimmed-tal trimmed-reg reglen regfull)
(runs:queue-next-tal trimmed-tal trimmed-reg reglen regfull)
(runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull)
reruns)))
(list (car newtal)(append (cdr newtal) reg) '() reruns))))
((and (null? fails)
((and (null? fails) ;; have not-started tests, but unable to run them. everything looks completed with no prospect of unsticking something that is stuck. we should mark hed as moribund and exit or continue if there are more tests to consider
(null? prereq-fails)
(null? non-completed))
(if (runs:can-keep-running? hed 20)
(begin
(runs:inc-cant-run-tests hed)
(debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0))
(debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;;
;; getting here likely means the system is way overloaded, kill a full minute before continuing
(thread-sleep! 60)
(thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
;; num-retries code was here
;; we use this opportunity to move contents of reg to tal
(list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
(begin
(debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
(let ((test-id (rmt:get-test-id run-id hed "")))
(if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))
|
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
|
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
|
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
|
(debug:print 5 *default-log-port* "test-records: " test-records ", flags: " (hash-table->alist flags))
;; Do mark-and-find clean up of db before starting runing of quue
;;
;; (rmt:find-and-mark-incomplete)
(let* ((run-info (rmt:get-run-info run-id))
(tests-info (mt:get-tests-for-run run-id #f '() '())) ;; qryvals: "id,testname,item_path"))
(sorted-test-names (tests:sort-by-priority-and-waiton test-records))
(test-registry (make-hash-table))
(registry-mutex (make-mutex))
(num-retries 0)
(max-retries (config-lookup *configdat* "setup" "maxretries"))
(max-concurrent-jobs (configf:lookup-number *configdat* "setup" "max_concurrent_jobs" default: 50))
(reglen (if (number? reglen-in) reglen-in 1))
(last-time-incomplete (- (current-seconds) 900)) ;; force at least one clean up cycle
(last-time-some-running (current-seconds))
;; (tdbdat (tasks:open-db))
(runsdat (make-runs:dat
;; hed: hed
;; tal: tal
;; reg: reg
;; reruns: reruns
reglen: reglen
regfull: #f ;; regfull
;; test-record: test-record
runname: runname
;; test-name: test-name
;; item-path: item-path
;; jobgroup: jobgroup
max-concurrent-jobs: max-concurrent-jobs
run-id: run-id
;; waitons: waitons
;; testmode: testmode
test-patts: test-patts
required-tests: required-tests
test-registry: test-registry
registry-mutex: registry-mutex
flags: flags
keyvals: keyvals
run-info: run-info
;; newtal: newtal
all-tests-registry: all-tests-registry
;; itemmaps: itemmaps
;; prereqs-not-met: (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)
;; can-run-more-tests: (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) ;; look at the test jobgroup and tot jobs running
)))
(tests-info (mt:get-tests-for-run run-id #f '() '())) ;; qryvals: "id,testname,item_path"))
(sorted-test-names (tests:sort-by-priority-and-waiton test-records))
(test-registry (make-hash-table))
(registry-mutex (make-mutex))
(num-retries 0)
(max-retries (config-lookup *configdat* "setup" "maxretries"))
(max-concurrent-jobs (configf:lookup-number *configdat* "setup" "max_concurrent_jobs" default: 50))
(reglen (if (number? reglen-in) reglen-in 1))
(last-time-incomplete (- (current-seconds) 900)) ;; force at least one clean up cycle
(last-time-some-running (current-seconds))
;; (tdbdat (tasks:open-db))
(runsdat (make-runs:dat
;; hed: hed
;; tal: tal
;; reg: reg
;; reruns: reruns
reglen: reglen
regfull: #f ;; regfull
;; test-record: test-record
runname: runname
;; test-name: test-name
;; item-path: item-path
;; jobgroup: jobgroup
max-concurrent-jobs: max-concurrent-jobs
run-id: run-id
;; waitons: waitons
;; testmode: testmode
test-patts: test-patts
required-tests: required-tests
test-registry: test-registry
registry-mutex: registry-mutex
flags: flags
keyvals: keyvals
run-info: run-info
;; newtal: newtal
all-tests-registry: all-tests-registry
;; itemmaps: itemmaps
;; prereqs-not-met: (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)
;; can-run-more-tests: (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) ;; look at the test jobgroup and tot jobs running
)))
;; Initialize the test-registery hash with tests that already have a record
;; convert state to symbol and use that as the hash value
(for-each (lambda (trec)
(let ((id (db:test-get-id trec))
(tn (db:test-get-testname trec))
(ip (db:test-get-item-path trec))
|
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
|
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
|
-
-
+
+
|
;; (server:ping (remote-server-url *runremote*)))
;; (server:check-if-running *toppath*))))
;; (server:kind-run *toppath*))
(if (> num-running 0)
(set! last-time-some-running (current-seconds)))
(if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))
(hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
(if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))
(hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
;; (debug:print 0 *default-log-port* "max-tries-hash: " (hash-table->alist *max-tries-hash*))
;; Ensure all top level tests get registered. This way they show up as "NOT_STARTED" on the dashboard
;; and it is clear they *should* have run but did not.
(if (not (hash-table-ref/default test-registry (db:test-make-full-name test-name "") #f))
(begin
(rmt:register-test run-id test-name "")
|