Overview
Comment: | CI/CD: Automated commit after successful test, build, and deploy for v1.81-preq-fail-details |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.81 |
Files: | files | file ages | folders |
SHA1: |
b7cfdb3706e7bef51bf474c5b72ee0ad |
User & Date: | fdiskadm on 2024-07-11 16:12:58 |
Other Links: | branch diff | manifest | tags |
Context
2024-07-15
| ||
09:48 | Added buttontest.scm check-in: bd1e7c8fc9 user: mrwellan tags: v1.81 | |
2024-07-14
| ||
19:39 | changed to "ready" in cicd wiki page, bump allowed load to 10 for server start. Closed-Leaf check-in: 528e8bc857 user: matt tags: v1.81-bump-server-load | |
2024-07-11
| ||
16:12 | CI/CD: Automated commit after successful test, build, and deploy for v1.81-preq-fail-details check-in: b7cfdb3706 user: fdiskadm tags: v1.81 | |
15:42 | CI/CD: Automated commit after successful test, build, and deploy for v1.81-journal-based-throttling check-in: 04d06badd2 user: fdiskadm tags: v1.81 | |
13:34 | cherry pick e8d7732e preq-fail tests in messages Leaf check-in: 2b4f2ebcce user: mmgraham tags: v1.81-preq-fail-details | |
Changes
Modified runs.scm from [adfae1025a] to [2d4118afd2].
︙ | ︙ | |||
984 985 986 987 988 989 990 | (if (list? items-list) (begin (if (null? items-list) (let ((test-id (rmt:get-test-id run-id test-name "")) (num-items (rmt:test-toplevel-num-items run-id test-name))) (if (and test-id (not (> num-items 0))) | | | 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 | (if (list? items-list) (begin (if (null? items-list) (let ((test-id (rmt:get-test-id run-id test-name "")) (num-items (rmt:test-toplevel-num-items run-id test-name))) (if (and test-id (not (> num-items 0))) (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "ZERO_ITEMS" (conc "Failed to run due to failed prerequisites: " (runs:pretty-string fails)))))) (tests:testqueue-set-items! test-record items-list) (list hed tal reg reruns)) (begin (debug:print-error 0 *default-log-port* "The proc from reading the items table did not yield a list - please report this") (exit 1)))))) ((and (null? fails) |
︙ | ︙ | |||
1020 1021 1022 1023 1024 1025 1026 | (set! give-up #t))) prereqstrs)) (if (and give-up (not (and (null? tal)(null? reg)))) (let ((trimmed-tal (mt:discard-blocked-tests run-id hed tal test-records)) (trimmed-reg (mt:discard-blocked-tests run-id hed reg test-records))) | | | | | | | | | | 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 | (set! give-up #t))) prereqstrs)) (if (and give-up (not (and (null? tal)(null? reg)))) (let ((trimmed-tal (mt:discard-blocked-tests run-id hed tal test-records)) (trimmed-reg (mt:discard-blocked-tests run-id hed reg test-records))) (debug:print 1 *default-log-port* "WARNING: test " hed " has discarded prerequisites: " prereqstrs ", removing it from the queue") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "COMPLETED" "PREQ_DISCARDED" (conc "Failed to run due to discarded prerequisites: " prereqstrs)))) (if (and (null? trimmed-tal) (null? trimmed-reg)) #f (runs:loop-values trimmed-tal trimmed-reg reglen regfull reruns) )) (list (car newtal)(append (cdr newtal) reg) '() reruns)))) ((and (null? fails) ;; have not-started tests, but unable to run them. everything looks completed with no prospect of unsticking something that is stuck. we should mark hed as moribund and exit or continue if there are more tests to consider (null? prereq-fails) (null? non-completed)) (debug:print-info 4 *default-log-port* "cond branch - " "ei-4") (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) (debug:print-info 0 *default-log-port* "no fails in prerequisites (" (runs:pretty-string prereqs-not-met) ") for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0) ", going to wait 60 sec.") ;; ;; getting here likely means the system is way overloaded, kill a full minute before continuing ;; (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) CHECKTHIS!!! ;; No runsdat, can't do this yet ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) ;; (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 *default-log-port* "no fails in prerequisites (" (runs:pretty-string prereqs-not-met) ") for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" (conc "Prerequisites (" (runs:pretty-string prereqs-not-met) ") not seen running in a while.")))) (runs:loop-values tal reg reglen regfull reruns) ))) ((and (or (not (null? fails)) (not (null? prereq-fails))) (member 'normal testmode)) (debug:print-info 4 *default-log-port* "cond branch - " "ei-5") (debug:print-info 1 *default-log-port* "test " hed " (mode=" testmode ") has failed prerequisite(s): " (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ") ", removing it from to-do list") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (if (not (null? prereq-fails)) (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_DISCARDED" (conc "Failed to run due to prior failed prerequisites: "(runs:pretty-string prereq-fails))) (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" (conc "Failed to run due to failed prerequisites: " (runs:pretty-string fails )))))) ;; (debug:print 4 *default-log-port*"BB> set PREQ_FAIL on "hed) ;; (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))) ;; BB: this works, btu equivalent for itemwait mode does not work. (if (or (not (null? reg))(not (null? tal))) (begin (hash-table-set! test-registry hed 'CANNOTRUN) (runs:loop-values tal reg reglen regfull (cons hed reruns)) ) |
︙ | ︙ | |||
1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 | (car run-limits-info))) (num-running (list-ref run-limits-info 1)) (num-running-in-jobgroup(list-ref run-limits-info 2)) (max-concurrent-jobs (list-ref run-limits-info 3)) (job-group-limit (list-ref run-limits-info 4)) ;; (prereqs-not-met (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; (prereqs-not-met (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap)) (fails (if (list? prereqs-not-met) ;; TODO: rename fails to failed-prereqs (runs:calc-fails prereqs-not-met) (begin (debug:print-error 0 *default-log-port* "prereqs-not-met is not a list! " prereqs-not-met) '()))) (non-completed (filter (lambda (x) ;; remove hed from not completed list, duh, of course it is not completed! (not (equal? x hed))) | > | 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 | (car run-limits-info))) (num-running (list-ref run-limits-info 1)) (num-running-in-jobgroup(list-ref run-limits-info 2)) (max-concurrent-jobs (list-ref run-limits-info 3)) (job-group-limit (list-ref run-limits-info 4)) ;; (prereqs-not-met (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; (prereqs-not-met (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap)) (prereqs-running (runs:calc-prereqs-running prereqs-not-met)) (fails (if (list? prereqs-not-met) ;; TODO: rename fails to failed-prereqs (runs:calc-fails prereqs-not-met) (begin (debug:print-error 0 *default-log-port* "prereqs-not-met is not a list! " prereqs-not-met) '()))) (non-completed (filter (lambda (x) ;; remove hed from not completed list, duh, of course it is not completed! (not (equal? x hed))) |
︙ | ︙ | |||
1342 1343 1344 1345 1346 1347 1348 | ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) (list (car newtal)(cdr newtal) reg reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (begin (let ((my-test-id (rmt:get-test-id run-id test-name item-path))) | | | | < | < | > | | | | | | 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 | ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 5) (list (car newtal)(cdr newtal) reg reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (begin (let ((my-test-id (rmt:get-test-id run-id test-name item-path))) (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" (conc "Failed to run due to failed prerequisites: " (runs:pretty-string fails)))) (if (or (not (null? reg))(not (null? tal))) (if (vector? hed) (begin (debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path " from the launch list as it has prerequistes that are FAIL: " (runs:pretty-string fails)) (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" (conc "Failed to run due to failed prerequisites: " (runs:pretty-string fails)) ))) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) ;; This next is for the items (if (not (null? fails)) (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "PREQ_FAIL" (runs:pretty-string fails)) (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) ) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'removed) (runs:loop-values tal reg reglen regfull reruns)) (let ((nth-try (hash-table-ref/default test-registry hed 0))) ;; hed not a vector... (debug:print 2 *default-log-port* "nth-try("hed")="nth-try) (cond ;;((member "RUNNING" (map db:test-get-state prereqs-not-met)) ((> 0 (length prereqs-running)) (if (runs:lownoise (conc "possible RUNNING prerequisites " hed) 60) (debug:print 0 *default-log-port* "WARNING: test " hed " has possible RUNNING prerequisites: " prereqs-running ", don't give up on it yet.")) (thread-sleep! 0.1) (runs:loop-values tal reg reglen regfull reruns)) ((or (not nth-try) ;; BB: condition on subsequent tries, condition below fires on first try (and (number? nth-try) (< nth-try 2))) (hash-table-set! test-registry hed (if (number? nth-try) (+ nth-try 1) 0)) (if (runs:lownoise (conc "not removing test " hed) 60) (debug:print 1 *default-log-port* "WARNING: not removing test " hed " from queue although it may not be runnable due to FAILED prerequisites: "(runs:pretty-string fails))) ;; may not have processed correctly. Could be a race condition in your test implementation? Dropping test " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) (runs:loop-values newtal reg reglen regfull reruns)) ((symbol? nth-try) ;; BB: 'done matches here in one case where prereq itemwait failed. This is first "try" (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW (if (null? tal) #f ;; yes, really (list (car tal)(cdr tal) reg reruns)) (begin (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60) (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites (" (runs:pretty-string fails)") or other issue. Internal state >" nth-try "< will be overridden and we'll retry.")) (let* ((test-id (rmt:get-test-id run-id hed item-path)) (test-info (rmt:get-testinfo-state-status run-id test-id)) ;; we need *current* info (status (db:test-status test-info))) (if (equal? status "KEEP_TRYING") (mt:test-set-state-status-by-testname-unless-completed run-id test-name item-path "COMPLETED" "PREQ_FAIL" #f) (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f))) (hash-table-set! test-registry hed 'removed) ;; was 0 (if (not (and (null? reg) (null? tal))) (runs:loop-values tal reg reglen regfull reruns) #f)))) (else (if (runs:lownoise (conc "FAILED prerequitests and we tried" hed) 60) (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequitests: " (runs:pretty-string fails) " and we've tried at least 10 times to run it. Giving up now.")) ;; (debug:print 0 *default-log-port* " prereqs: " prereqs-not-met) (hash-table-set! test-registry hed 'removed) (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "TEN_STRIKES" #f) ;; I'm unclear on if this roll up is needed - it may be the root cause of the "all set to FAIL" bug. (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "FAIL" #f) ;; treat as FAIL (list (if (null? tal)(car newtal)(car tal)) tal |
︙ | ︙ | |||
2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 | (lambda (t) (or (not (vector? t)) (and (equal? "NOT_STARTED" (db:test-get-state t)) (member (db:test-get-status t) '("n/a" "KEEP_TRYING"))) (and (equal? "RUNNING" (db:test-get-state t))))) ;; account for a test that is running prereqs-not-met)) (define (runs:pretty-string lst) (map (lambda (t) (if (not (vector? t)) (conc t) (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t)))) lst)) | > > > > > > > > > > > > | 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 | (lambda (t) (or (not (vector? t)) (and (equal? "NOT_STARTED" (db:test-get-state t)) (member (db:test-get-status t) '("n/a" "KEEP_TRYING"))) (and (equal? "RUNNING" (db:test-get-state t))))) ;; account for a test that is running prereqs-not-met)) (define (runs:calc-prereqs-running prereqs-not-met) (if (list? prereqs-not-met) (filter (lambda (t) (or (not (vector? t)) (member (db:test-get-state t) '("RUNNING" "LAUNCHED" "REMOTE_HOST_START")) )) prereqs-not-met) '() ) ) (define (runs:pretty-string lst) (map (lambda (t) (if (not (vector? t)) (conc t) (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t)))) lst)) |
︙ | ︙ |