Changes In Branch v1.64-runs-deepdive Excluding Merge-Ins
This is equivalent to a diff from 536b980f6a to b6d97c539d
2017-09-14
| ||
17:10 | fixed issue where item gets stuck in keep_waiting status when prerequisite item failed check-in: 23745b4302 user: bjbarcla tags: v1.6431, v1.64-keep-running-fix | |
15:31 | keep-running issue addressed Leaf check-in: b6d97c539d user: bjbarcla tags: v1.64-runs-deepdive | |
2017-09-13
| ||
22:00 | reverted use of dbstruct to db as it is inside a transaction, must access the db directly and not invoke all the magically machinery that will dynamically open up new database handles. check-in: fbc6ca10e6 user: matt tags: v1.64-runs-deepdive | |
2017-08-22
| ||
23:38 | Added re-copy of test spec on test start to resist issues with NFS. check-in: 92e6ddd6a2 user: matt tags: v1.64 | |
17:23 | made some comments during code review session check-in: 3b63acda45 user: bjbarcla tags: v1.64-runs-deepdive | |
2017-08-16
| ||
15:23 | factored out common code in show-uncalled-procedures.scm and trackback.scm check-in: 536b980f6a user: bjbarcla tags: v1.64 | |
11:49 | fixed case in trackback.scm where cycle prevented seeing call path check-in: 733a3a4bd6 user: bjbarcla tags: v1.64 | |
Modified codescanlib.scm from [85429a3289] to [bf0bf2c4eb].
︙ | ︙ | |||
44 45 46 47 48 49 50 | (if (atom? defname) ;; filter out things we dont understand (procname is a list, what??) (list defname filename args body) #f)] [else #f] ) scm-tree)))) procs)) | | | 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 | (if (atom? defname) ;; filter out things we dont understand (procname is a list, what??) (list defname filename args body) #f)] [else #f] ) scm-tree)))) procs)) ;; given a sexp, return a flat list of atoms in that sexp (define (get-atoms-in-body body) (cond ((null? body) '()) ((atom? body) (list body)) (else (apply append (map get-atoms-in-body body))))) |
︙ | ︙ |
Modified common.scm from [071499421b] to [93c8a7d160].
︙ | ︙ | |||
529 530 531 532 533 534 535 | (3 "SKIP") (4 "WARN") (5 "WAIVED") (6 "CHECK") (7 "STUCK/DEAD") (8 "DEAD") (9 "FAIL") | > | | 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 | (3 "SKIP") (4 "WARN") (5 "WAIVED") (6 "CHECK") (7 "STUCK/DEAD") (8 "DEAD") (9 "FAIL") (10 "PREQ_FAIL") (11 "ABORT"))) (define *common:ended-states* ;; states which indicate the test is stopped and will not proceed '("COMPLETED" "ARCHIVED" "KILLED" "KILLREQ" "STUCK" "INCOMPLETE")) (define *common:badly-ended-states* ;; these roll up as CHECK, i.e. results need to be checked '("KILLED" "KILLREQ" "STUCK" "INCOMPLETE" "DEAD")) |
︙ | ︙ |
Modified db.scm from [144a083df6] to [879c965cf5].
|
| | | 1 2 3 4 5 6 7 8 | ;====================================================================== ;; Copyright 2006-2016, Matthew Welland. ;; ;; This program is made available under the GNU GPL version 2.0 or ;; greater. See the accompanying file COPYING for details. ;; ;; This program is distributed WITHOUT ANY WARRANTY; without even the ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR |
︙ | ︙ | |||
196 197 198 199 200 201 202 | (let ((syncprag (configf:lookup *configdat* "setup" "sychronous"))) (sqlite3:execute db (conc "PRAGMA synchronous = " (or syncprag 0) ";")))) ;; open an sql database inside a file lock ;; returns: db existed-prior-to-opening ;; RA => Returns a db handler; sets the lock if opened in writable mode ;; | | | | 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 | (let ((syncprag (configf:lookup *configdat* "setup" "sychronous"))) (sqlite3:execute db (conc "PRAGMA synchronous = " (or syncprag 0) ";")))) ;; open an sql database inside a file lock ;; returns: db existed-prior-to-opening ;; RA => Returns a db handler; sets the lock if opened in writable mode ;; ;; (define *db-open-mutex* (make-mutex)) (define (db:lock-create-open fname initproc) (let* ((parent-dir (or (pathname-directory fname)(current-directory))) ;; no parent? go local (raw-fname (pathname-file fname)) (dir-writable (file-write-access? parent-dir)) (file-exists (common:file-exists? fname)) (file-write (if file-exists (file-write-access? fname) dir-writable ))) ;; (mutex-lock! *db-open-mutex*) ;; tried this mutex, not clear it helped. (if file-write ;; dir-writable (condition-case (let* ((lockfname (conc fname ".lock")) (readyfname (conc parent-dir "/.ready-" raw-fname)) (readyexists (common:file-exists? readyfname))) (if (not readyexists) (common:simple-file-lock-and-wait lockfname)) |
︙ | ︙ | |||
244 245 246 247 248 249 250 | (exn (permission)(debug:print 0 *default-log-port* "ERROR: database " fname " has some permissions problem.")) (exn () (debug:print 0 *default-log-port* "ERROR: Unknown error with database " fname " message: " ((condition-property-accessor 'exn 'message) exn)))) (condition-case (begin (debug:print 2 *default-log-port* "WARNING: opening db in non-writable dir " fname) (let ((db (sqlite3:open-database fname))) | | | 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 | (exn (permission)(debug:print 0 *default-log-port* "ERROR: database " fname " has some permissions problem.")) (exn () (debug:print 0 *default-log-port* "ERROR: Unknown error with database " fname " message: " ((condition-property-accessor 'exn 'message) exn)))) (condition-case (begin (debug:print 2 *default-log-port* "WARNING: opening db in non-writable dir " fname) (let ((db (sqlite3:open-database fname))) ;; (mutex-unlock! *db-open-mutex*) db)) (exn (io-error) (debug:print 0 *default-log-port* "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again.")) (exn (corrupt) (debug:print 0 *default-log-port* "ERROR: database " fname " is corrupt. Repair it to proceed.")) (exn (busy) (debug:print 0 *default-log-port* "ERROR: database " fname " is locked. Try copying to another location, remove original and copy back.")) (exn (permission)(debug:print 0 *default-log-port* "ERROR: database " fname " has some permissions problem.")) (exn () (debug:print 0 *default-log-port* "ERROR: Unknown error with database " fname " message: " ((condition-property-accessor 'exn 'message) exn)))) ))) |
︙ | ︙ | |||
1585 1586 1587 1588 1589 1590 1591 | ;; end_time,strftime('%s','now') as now from tests where state in ;; ('RUNNING','REMOTEHOSTSTART','LAUNCED')); (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime) (let* ((incompleted '()) (oldlaunched '()) (toplevels '()) | | | 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 | ;; end_time,strftime('%s','now') as now from tests where state in ;; ('RUNNING','REMOTEHOSTSTART','LAUNCED')); (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime) (let* ((incompleted '()) (oldlaunched '()) (toplevels '()) (deadtime-str (configf:lookup *configdat* "setup" "deadtime")) ;; FIXME suspect test run time & deadtime are not well matched; resulting in COMPLETED/DEAD status of an a-ok running test (deadtime (if (and deadtime-str (string->number deadtime-str)) (string->number deadtime-str) 7200))) ;; two hours (db:with-db dbstruct #f #f (lambda (db) |
︙ | ︙ | |||
1648 1649 1650 1651 1652 1653 1654 | (min-incompleted-ids (map car incompleted)) ;; do 'em all (all-ids (append min-incompleted-ids (map car oldlaunched)))) (if (> (length all-ids) 0) (begin (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as INCOMPLETE") (for-each (lambda (test-id) | | > | 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 | (min-incompleted-ids (map car incompleted)) ;; do 'em all (all-ids (append min-incompleted-ids (map car oldlaunched)))) (if (> (length all-ids) 0) (begin (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as INCOMPLETE") (for-each (lambda (test-id) (db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test failed to complete")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS. ref ticket 220546828 all-ids)))))))) ;; ALL REPLACED BY THE BLOCK ABOVE ;; ;; (sqlite3:execute ;; db ;; (conc "UPDATE tests SET state='INCOMPLETE' WHERE run_id=? AND id IN (" |
︙ | ︙ | |||
3490 3491 3492 3493 3494 3495 3496 | (if (not (equal? state "DELETED")) (cons status (map dbr:counts-status state-status-counts)) (map dbr:counts-status state-status-counts))) *common:std-statuses* >)) (non-completes (filter (lambda (x) (not (equal? x "COMPLETED"))) all-curr-states)) | > > > | < < | > > | < < | < < | | > > > | | | > | > > > > > > > > > | > | 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 | (if (not (equal? state "DELETED")) (cons status (map dbr:counts-status state-status-counts)) (map dbr:counts-status state-status-counts))) *common:std-statuses* >)) (non-completes (filter (lambda (x) (not (equal? x "COMPLETED"))) all-curr-states)) (preq-fails (filter (lambda (x) (equal? x "PREQ_FAIL")) all-curr-statuses)) (num-non-completes (length non-completes)) (newstate (cond ((> running 0) "RUNNING") ;; anything running, call the situation running ((> (length preq-fails) 0) "NOT_STARTED") ((> bad-not-started 0) "COMPLETED") ;; we have an ugly situation, it is completed in the sense we cannot do more. ((> num-non-completes 0) (car non-completes)) ;; (remove (lambda (x)(equal? "COMPLETED" x)) all-curr-states))) ;; only rollup DELETED if all DELETED (else (car all-curr-states)))) ;; (if (> running 0) ;; "RUNNING" ;; (if (> bad-not-started 0) ;; "COMPLETED" ;; (car all-curr-states)))) (newstatus (cond ((> (length preq-fails) 0) "PREQ_FAIL") ((or (> bad-not-started 0) (and (equal? newstate "NOT_STARTED") (> num-non-completes 0))) "STARTED") (else (car all-curr-statuses))))) ;; (print "bad-not-supported: " bad-not-support " all-curr-states: " all-curr-states " all-curr-statuses: " all-curr-states) ;; " newstate: " newstate " newstatus: " newstatus) ;; NB// Pass the db so it is part of the transaction (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path"> bad-not-started="bad-not-started" newstate="newstate" newstatus="newstatus" num-non-completes="num-non-completes" non-completes="non-completes "len(sscs)="(length state-status-counts) " state-status-counts: " (apply conc (map (lambda (x) (conc (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | ")) state-status-counts)) ); end debug:print (if tl-test-id (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct )))))) (mutex-unlock! *db-transaction-mutex*) (if (and test-id state status (equal? status "AUTO")) (db:test-data-rollup dbstruct run-id test-id status)) tr-res))))) ;; BBnote: db:get-all-state-status-counts-for-test returns dbr:counts object aggregating state and status of items of a given test, *not including rollup state/status* (define (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path) (db:with-db |
︙ | ︙ |
Modified runs.scm from [df9cc9bbed] to [1e0df37501].
︙ | ︙ | |||
424 425 426 427 428 429 430 | ;;====================================================================== ;; refactoring this block into tests:get-full-data ;; ;; What happended, this code is now duplicated in tests!? ;; ;;====================================================================== | | | | | | | | | 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 | ;;====================================================================== ;; refactoring this block into tests:get-full-data ;; ;; What happended, this code is now duplicated in tests!? ;; ;;====================================================================== (if (not (null? test-names)) ;; BEGIN test-names loop (let loop ((hed (car test-names)) ;; NOTE: This is the main loop that iterates over the test-names (tal (cdr test-names))) ;; 'return-procs tells the config reader to prep running system but return a proc (change-directory *toppath*) ;; PLEASE OPTIMIZE ME!!! I think this should be a no-op but there are several places where change-directories could be happening. (setenv "MT_TEST_NAME" hed) ;; (let*-values (((waitons waitors config)(tests:get-waitons hed all-tests-registry))) (debug:print-info 8 *default-log-port* "waitons: " waitons) ;; check for hed in waitons => this would be circular, remove it and issue an ;; error (if (or (member hed waitons) (member hed waitors)) (begin (debug:print-error 0 *default-log-port* "test " hed " has listed itself as a waiton or waitor, please correct this!") (set! waitons (filter (lambda (x)(not (equal? x hed))) waitons)) (set! waitors (filter (lambda (x)(not (equal? x hed))) waitors)))) ;; (items (items:get-items-from-config config))) (if (not (hash-table-ref/default test-records hed #f)) (hash-table-set! test-records ;; BB: we are doing a manual make-tests:testqueue hed (vector hed ;; 0 ;; testname config ;; 1 waitons ;; 2 (config-lookup config "requirements" "priority") ;; priority 3 (tests:get-items config) ;; 4 ;; expand the [items] and or [itemstable] into explict items #f ;; itemsdat 5 #f ;; spare - used for item-path waitors ;; ))) (for-each (lambda (waiton) (if (and waiton (not (member waiton test-names))) (let* ((waiton-record (hash-table-ref/default test-records waiton #f)) (waiton-tconfig (if waiton-record (vector-ref waiton-record 1) #f)) (waiton-itemized (and waiton-tconfig (or (hash-table-ref/default waiton-tconfig "items" #f) (hash-table-ref/default waiton-tconfig "itemstable" #f)))) (itemmaps (tests:get-itemmaps config)) ;; (configf:lookup config "requirements" "itemmap")) (new-test-patts (tests:extend-test-patts test-patts hed waiton itemmaps))) ;; BB: items expanded here. (debug:print-info 0 *default-log-port* "Test " waiton " has " (if waiton-record "a" "no") " waiton-record and" (if waiton-itemized " " " no ") "items") ;; need to account for test-patt here, if I am test "a", selected with a test-patt of "hed/b%" ;; and we are waiting on "waiton" we need to add "waiton/,waiton/b%" to test-patt ;; is this satisfied by merely appending "/" to the waiton name added to the list? ;; ;; This approach causes all of the items in an upstream test to be run |
︙ | ︙ | |||
501 502 503 504 505 506 507 | ;; (set! test-names (cons waiton test-names))))) ;; was an append, now a cons ))) (delete-duplicates (append waitons waitors))) (let ((remtests (delete-duplicates (append waitons tal)))) (if (not (null? remtests)) (begin ;; (debug:print-info 0 *default-log-port* "Preprocessing continues for " (string-intersperse remtests ", ")) | | | 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 | ;; (set! test-names (cons waiton test-names))))) ;; was an append, now a cons ))) (delete-duplicates (append waitons waitors))) (let ((remtests (delete-duplicates (append waitons tal)))) (if (not (null? remtests)) (begin ;; (debug:print-info 0 *default-log-port* "Preprocessing continues for " (string-intersperse remtests ", ")) (loop (car remtests)(cdr remtests)))))))) ;; END test-names loop (if (not (null? required-tests)) (debug:print-info 1 *default-log-port* "Adding \"" (string-intersperse required-tests " ") "\" to the run queue")) ;; NOTE: these are all parent tests, items are not expanded yet. (debug:print-info 4 *default-log-port* "test-records=" (hash-table->alist test-records)) (let ((reglen (configf:lookup *configdat* "setup" "runqueue"))) (if (> (length (hash-table-keys test-records)) 0) |
︙ | ︙ | |||
530 531 532 533 534 535 536 | ;; (debug:print-error 0 *default-log-port* "failure in runs:run-tests-queue thread, error: " ((condition-property-accessor 'exn 'message) exn)) ;; (if (> run-queue-retries 0) ;; (begin ;; (set! run-queue-retries (- run-queue-retries 1)) ;; (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry)))) ;; (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry))) "runs:run-tests-queue")) | | | | 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 | ;; (debug:print-error 0 *default-log-port* "failure in runs:run-tests-queue thread, error: " ((condition-property-accessor 'exn 'message) exn)) ;; (if (> run-queue-retries 0) ;; (begin ;; (set! run-queue-retries (- run-queue-retries 1)) ;; (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry)))) ;; (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry))) "runs:run-tests-queue")) (th2 (make-thread (lambda () ;; BBQ: why are we visiting ALL runs here? ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going (handle-exceptions exn (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id) (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) run-ids))) "runs: mark-incompletes"))) (thread-start! th1) (thread-start! th2) (thread-join! th1) (set! keep-going #f) (thread-join! th2) |
︙ | ︙ | |||
716 717 718 719 720 721 722 | #f (list (runs:queue-next-hed trimmed-tal trimmed-reg reglen regfull) (runs:queue-next-tal trimmed-tal trimmed-reg reglen regfull) (runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull) reruns))) (list (car newtal)(append (cdr newtal) reg) '() reruns)))) | | | | | 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 | #f (list (runs:queue-next-hed trimmed-tal trimmed-reg reglen regfull) (runs:queue-next-tal trimmed-tal trimmed-reg reglen regfull) (runs:queue-next-reg trimmed-tal trimmed-reg reglen regfull) reruns))) (list (car newtal)(append (cdr newtal) reg) '() reruns)))) ((and (null? fails) ;; have not-started tests, but unable to run them. everything looks completed with no prospect of unsticking something that is stuck. we should mark hed as moribund and exit or continue if there are more tests to consider (null? prereq-fails) (null? non-completed)) (if (runs:can-keep-running? hed 20) (begin (runs:inc-cant-run-tests hed) (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;; ;; getting here likely means the system is way overloaded, kill a full minute before continuing (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) ;; num-retries code was here ;; we use this opportunity to move contents of reg to tal (list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met? (begin (debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while."))) |
︙ | ︙ | |||
748 749 750 751 752 753 754 | (debug:print-info 1 *default-log-port* "test " hed " (mode=" testmode ") has failed prerequisite(s); " (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ") ", removing it from to-do list") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (if (not (null? prereq-fails)) (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_DISCARDED" "Failed to run due to prior failed prerequisites") | > > | | 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 | (debug:print-info 1 *default-log-port* "test " hed " (mode=" testmode ") has failed prerequisite(s); " (string-intersperse (map (lambda (t)(conc (db:test-get-testname t) ":" (db:test-get-state t)"/"(db:test-get-status t))) fails) ", ") ", removing it from to-do list") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (if (not (null? prereq-fails)) (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_DISCARDED" "Failed to run due to prior failed prerequisites") (begin (debug:print 4 *default-log-port*"BB> set PREQ_FAIL on "hed) (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites"))))) ;; BB: this works, btu equivalent for itemwait mode does not work. (if (or (not (null? reg))(not (null? tal))) (begin (hash-table-set! test-registry hed 'CANNOTRUN) (list (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) (cons hed reruns))) |
︙ | ︙ | |||
792 793 794 795 796 797 798 799 800 801 802 803 804 805 | (conc t)))) inlst))) ;; hed tal reg reruns reglen regfull test-record runname test-name item-path jobgroup max-concurrent-jobs run-id waitons item-path testmode test-patts required-tests test-registry registry-mutex flags keyvals run-info newtal all-tests-registry itemmaps) (define (runs:process-expanded-tests runsdat testdat) ;; unroll the contents of runsdat and testdat (due to ongoing refactoring). (let* ((hed (runs:testdat-hed testdat)) (tal (runs:testdat-tal testdat)) (reg (runs:testdat-reg testdat)) (reruns (runs:testdat-reruns testdat)) (test-name (runs:testdat-test-name testdat)) (item-path (runs:testdat-item-path testdat)) (jobgroup (runs:testdat-jobgroup testdat)) | > > > | 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 | (conc t)))) inlst))) ;; hed tal reg reruns reglen regfull test-record runname test-name item-path jobgroup max-concurrent-jobs run-id waitons item-path testmode test-patts required-tests test-registry registry-mutex flags keyvals run-info newtal all-tests-registry itemmaps) (define (runs:process-expanded-tests runsdat testdat) ;; unroll the contents of runsdat and testdat (due to ongoing refactoring). (debug:print 2 *default-log-port* "runs:process-expanded-tests; testdat:" ) (debug:print 2 *default-log-port* (with-output-to-string (lambda () (pp (runs:testdat->alist testdat) )))) (let* ((hed (runs:testdat-hed testdat)) (tal (runs:testdat-tal testdat)) (reg (runs:testdat-reg testdat)) (reruns (runs:testdat-reruns testdat)) (test-name (runs:testdat-test-name testdat)) (item-path (runs:testdat-item-path testdat)) (jobgroup (runs:testdat-jobgroup testdat)) |
︙ | ︙ | |||
863 864 865 866 867 868 869 | (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) (debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) ;; Don't know at this time if the test have been launched at some time in the past ;; i.e. is this a re-launch? (debug:print-info 4 *default-log-port* "run-limits-info = " run-limits-info) | | | 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 | (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) (debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) ;; Don't know at this time if the test have been launched at some time in the past ;; i.e. is this a re-launch? (debug:print-info 4 *default-log-port* "run-limits-info = " run-limits-info) (cond ; cond 894- 1067 ;; Check item path against item-patts, ;; ((not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) ;; This test/itempath is not to be run ;; else the run is stuck, temporarily or permanently ;; but should check if it is due to lack of resources vs. prerequisites (debug:print-info 1 *default-log-port* "Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " test-patts) |
︙ | ︙ | |||
985 986 987 988 989 990 991 | ;; couldn't run, take a breather (if (runs:lownoise "Waiting for more work to do..." 60) (debug:print-info 0 *default-log-port* "Waiting for more work to do...")) (thread-sleep! 1) (list (car newtal)(cdr newtal) reg reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (if (or (not (null? reg))(not (null? tal))) | | > > > > | > | > | | | | 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 | ;; couldn't run, take a breather (if (runs:lownoise "Waiting for more work to do..." 60) (debug:print-info 0 *default-log-port* "Waiting for more work to do...")) (thread-sleep! 1) (list (car newtal)(cdr newtal) reg reruns)) ;; the waiton is FAIL so no point in trying to run hed ever again (if (or (not (null? reg))(not (null? tal))) (if (or (vector? hed) (not (null? fails))) ;; BB: why do we need a vector? in my case, fails is populated (prereq failed), reg is not nul, and we really want to drop this one (begin (debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path " from the launch list as it has prerequistes that are FAIL") (let ((test-id (rmt:get-test-id run-id hed ""))) (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites"))) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) ;; This next is for the items (if (not (null? fails)) ;;(mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "PREQ_FAIL" #f) (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "PREQ_FAIL" #f) ;;(mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) (rmt:set-state-status-and-roll-up-items run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) ) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'removed) (list (runs:queue-next-hed tal reg reglen regfull) (runs:queue-next-tal tal reg reglen regfull) (runs:queue-next-reg tal reg reglen regfull) reruns ;; WAS: (cons hed reruns) ;; but that makes no sense? )) (let ((nth-try (hash-table-ref/default test-registry hed 0))) ;; hed not a vector... (debug:print 2 *default-log-port* "nth-try("hed")="nth-try) (cond ((member "RUNNING" (map db:test-get-state prereqs-not-met)) (if (runs:lownoise (conc "possible RUNNING prerequistes " hed) 60) (debug:print 0 *default-log-port* "WARNING: test " hed " has possible RUNNING prerequisites, don't give up on it yet.")) (thread-sleep! 4) (list (runs:queue-next-hed newtal reg reglen regfull) (runs:queue-next-tal newtal reg reglen regfull) (runs:queue-next-reg newtal reg reglen regfull) reruns)) ((or (not nth-try) ;; BB: condition on subsequent tries, condition below fires on first try (and (number? nth-try) (< nth-try 10))) (hash-table-set! test-registry hed (if (number? nth-try) (+ nth-try 1) 0)) (if (runs:lownoise (conc "not removing test " hed) 60) (debug:print 1 *default-log-port* "WARNING: not removing test " hed " from queue although it may not be runnable due to FAILED prerequisites")) ;; may not have processed correctly. Could be a race condition in your test implementation? Dropping test " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (list hed tal reg reruns) ;; (list (car newtal)(cdr newtal) reg reruns) ;; (hash-table-set! test-registry hed 'removed) (list (runs:queue-next-hed newtal reg reglen regfull) (runs:queue-next-tal newtal reg reglen regfull) (runs:queue-next-reg newtal reg reglen regfull) reruns)) ((symbol? nth-try) ;; BB: 'done matches here in one case where prereq itemwait failed. This is first "try" (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW (if (null? tal) #f ;; yes, really (list (car tal)(cdr tal) reg reruns)) (begin (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60) (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state >" nth-try "< will be overridden and we'll retry.")) (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f) (hash-table-set! test-registry hed 0) (list (runs:queue-next-hed newtal reg reglen regfull) (runs:queue-next-tal newtal reg reglen regfull) (runs:queue-next-reg newtal reg reglen regfull) reruns)))) (else |
︙ | ︙ | |||
1165 1166 1167 1168 1169 1170 1171 | (debug:print 5 *default-log-port* "test-records: " test-records ", flags: " (hash-table->alist flags)) ;; Do mark-and-find clean up of db before starting runing of quue ;; ;; (rmt:find-and-mark-incomplete) (let* ((run-info (rmt:get-run-info run-id)) | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 | (debug:print 5 *default-log-port* "test-records: " test-records ", flags: " (hash-table->alist flags)) ;; Do mark-and-find clean up of db before starting runing of quue ;; ;; (rmt:find-and-mark-incomplete) (let* ((run-info (rmt:get-run-info run-id)) (tests-info (mt:get-tests-for-run run-id #f '() '())) ;; qryvals: "id,testname,item_path")) (sorted-test-names (tests:sort-by-priority-and-waiton test-records)) (test-registry (make-hash-table)) (registry-mutex (make-mutex)) (num-retries 0) (max-retries (config-lookup *configdat* "setup" "maxretries")) (max-concurrent-jobs (configf:lookup-number *configdat* "setup" "max_concurrent_jobs" default: 50)) (reglen (if (number? reglen-in) reglen-in 1)) (last-time-incomplete (- (current-seconds) 900)) ;; force at least one clean up cycle (last-time-some-running (current-seconds)) ;; (tdbdat (tasks:open-db)) (runsdat (make-runs:dat ;; hed: hed ;; tal: tal ;; reg: reg ;; reruns: reruns reglen: reglen regfull: #f ;; regfull ;; test-record: test-record runname: runname ;; test-name: test-name ;; item-path: item-path ;; jobgroup: jobgroup max-concurrent-jobs: max-concurrent-jobs run-id: run-id ;; waitons: waitons ;; testmode: testmode test-patts: test-patts required-tests: required-tests test-registry: test-registry registry-mutex: registry-mutex flags: flags keyvals: keyvals run-info: run-info ;; newtal: newtal all-tests-registry: all-tests-registry ;; itemmaps: itemmaps ;; prereqs-not-met: (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps) ;; can-run-more-tests: (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) ;; look at the test jobgroup and tot jobs running ))) ;; Initialize the test-registery hash with tests that already have a record ;; convert state to symbol and use that as the hash value (for-each (lambda (trec) (let ((id (db:test-get-id trec)) (tn (db:test-get-testname trec)) (ip (db:test-get-item-path trec)) |
︙ | ︙ | |||
1282 1283 1284 1285 1286 1287 1288 | ;; (server:ping (remote-server-url *runremote*))) ;; (server:check-if-running *toppath*)))) ;; (server:kind-run *toppath*)) (if (> num-running 0) (set! last-time-some-running (current-seconds))) | | | | 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 | ;; (server:ping (remote-server-url *runremote*))) ;; (server:check-if-running *toppath*)))) ;; (server:kind-run *toppath*)) (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000))) (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1))) ;; (debug:print 0 *default-log-port* "max-tries-hash: " (hash-table->alist *max-tries-hash*)) ;; Ensure all top level tests get registered. This way they show up as "NOT_STARTED" on the dashboard ;; and it is clear they *should* have run but did not. (if (not (hash-table-ref/default test-registry (db:test-make-full-name test-name "") #f)) (begin (rmt:register-test run-id test-name "") |
︙ | ︙ | |||
1441 1442 1443 1444 1445 1446 1447 | ((not (null? tal)) (debug:print-info 4 *default-log-port* "I'm pretty sure I shouldn't get here.")) ((not (null? reg)) ;; could we get here with leftovers? (debug:print-info 0 *default-log-port* "Have leftovers!") (loop (car reg)(cdr reg) '() reruns)) (else (debug:print-info 4 *default-log-port* "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns)) | > | | 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 | ((not (null? tal)) (debug:print-info 4 *default-log-port* "I'm pretty sure I shouldn't get here.")) ((not (null? reg)) ;; could we get here with leftovers? (debug:print-info 0 *default-log-port* "Have leftovers!") (loop (car reg)(cdr reg) '() reruns)) (else (debug:print-info 4 *default-log-port* "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns)) ))) ;; end loop on sorted test names ;; now *if* -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) (thread-sleep! 5) ;; I think there is a race condition here. Let states/statuses settle (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id)) (prev-num-running 0)) ;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running) (if (and (or (args:get-arg "-run-wait") |
︙ | ︙ |
Modified server.scm from [45bfab59ba] to [40df68dca1].
︙ | ︙ | |||
456 457 458 459 460 461 462 | (thread-sleep! 0.05) ;; delay for startup (let ((legacy-sync (common:run-sync?)) (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300)) (debug-mode (debug:debug-mode 1)) (last-time (current-seconds)) (no-sync-db (db:open-no-sync-db)) (sync-duration 0) ;; run time of the sync in milliseconds | | > | | 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 | (thread-sleep! 0.05) ;; delay for startup (let ((legacy-sync (common:run-sync?)) (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300)) (debug-mode (debug:debug-mode 1)) (last-time (current-seconds)) (no-sync-db (db:open-no-sync-db)) (sync-duration 0) ;; run time of the sync in milliseconds ;;(this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x))) ) (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls (debug:print-info 2 *default-log-port* "Periodic sync thread started.") (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id) );; " this-wd-num="this-wd-num) (if (and legacy-sync (not *time-to-exit*)) (let* (;;(dbstruct (db:setup)) (mtdb (dbr:dbstruct-mtdb dbstruct)) (mtpath (db:dbdat-get-path mtdb)) (tmp-area (common:get-db-tmp-area)) (start-file (conc tmp-area "/.start-sync")) (end-file (conc tmp-area "/.end-sync"))) |
︙ | ︙ | |||
577 578 579 580 581 582 583 | (begin (thread-sleep! 1) (delay-loop (+ count 1)))) (if (not *time-to-exit*) (loop)))) ;; time to exit, close the no-sync db here (db:no-sync-close-db no-sync-db) (if (common:low-noise-print 30) | | | 578 579 580 581 582 583 584 585 586 | (begin (thread-sleep! 1) (delay-loop (+ count 1)))) (if (not *time-to-exit*) (loop)))) ;; time to exit, close the no-sync db here (db:no-sync-close-db no-sync-db) (if (common:low-noise-print 30) (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) ))))))) ;;" this-wd-num="this-wd-num))))))) |