︙ | | |
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
|
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
|
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
|
(last-jobs-check-time 0)
)
(defstruct runs:testdat
hed tal reg reruns test-record
test-name item-path jobgroup
waitons testmode newtal itemmaps prereqs-not-met)
(module runsmod
(
runs:wait-if-seen-recently
)
(import scheme chicken data-structures extras files)
(import posix typed-records srfi-18 srfi-69
md5 message-digest
regex srfi-1)
(define *last-seen-ht* (make-hash-table))
(define (runs:wait-if-seen-recently wait-until . keys)
(let* ((full-key (string-intersperse keys "-"))
(last-seen (hash-table-ref/default *last-seen-ht* full-key 0))
(now (current-seconds))
(delta (- now last-seen))
(needed (if (< delta wait-until)
0
(- wait-until delta))))
(if (> needed 0)(thread-sleep! needed))
(hash-table-set! *last-seen-ht* full-key (current-seconds))
needed))
)
(import runsmod)
;; look in the $MT_RUN_AREA_HOME/.softlocks directory for key-host-pid.softlock files
;; - remove any that are over 3600 seconds old
;; - if there are any that are younger than 10 seconds
;; * sleep 10 seconds
;; * touch my key-host-pid.softlock file
;; * return
;; - if there are no files younger than 10 seconds
|
︙ | | |
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
|
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
|
-
+
|
;; Take advantage of a good place to exit if running the one-pass methodology
(if (and (> (runs:dat-can-run-more-tests-count runsdat) 20)
(args:get-arg "-one-pass"))
(exit 0))
(if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
(let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no
(let* ((num-running (rmt:get-count-tests-running run-id))
(num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
(job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup)))
(if (string? jobg-count)
(string->number jobg-count)
jobg-count))))
(if (> (+ num-running num-running-in-jobgroup) 0)
(runs:inc-can-run-more-tests-count runsdat)) ;; (set! *runs:can-run-more-tests-count* (+ *runs:can-run-more-tests-count* 1)))
|
︙ | | |
433
434
435
436
437
438
439
440
441
442
443
444
445
446
|
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
|
+
+
|
(debug:print-info 0 *default-log-port* "running run-post-hook: \"" run-post-hook "\", log is " actual-logf)
(system (conc run-post-hook " >> " actual-logf " 2>&1"))
(debug:print-info 0 *default-log-port* "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run."))))))
;; return #t when all items in waitors-upon list are represented in test-patt, #f otherwise.
(define (runs:testpatts-mention-waitors-upon? test-patt waitors-upon)
(null? (tests:filter-test-names-not-matched waitors-upon test-patt)))
(define *find-and-mark-incomplete-last-run* (make-hash-table))
;;======================================================================
;; runs:run-tests is called from megatest.scm and itself
;;======================================================================
;;
;; test-names: Comma separated patterns same as test-patts but used in selection
;; of tests to run. The item portions are not respected.
|
︙ | | |
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
|
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
|
-
-
-
-
-
-
-
+
+
+
+
+
+
+
|
;; run the run prehook if there are no tests yet run for this run:
;;
(runs:run-pre-hook run-id)
;; mark all test launced flag as false in the meta table
(rmt:set-var (conc "lunch-complete-" run-id) "no")
(debug:print-info 1 *default-log-port* "Setting end-of-run to no")
(let* ((config-reruns (let ((x (configf:lookup *configdat* "setup" "reruns")))
(if x (string->number x) #f)))
(config-rerun-cnt (if config-reruns
config-reruns
1)))
(if (eq? config-rerun-cnt run-count)
(rmt:set-var (conc "end-of-run-" run-id) "no")))
(if x (string->number x) #f)))
(config-rerun-cnt (if config-reruns
config-reruns
1)))
(if (eq? config-rerun-cnt run-count)
(rmt:set-var (conc "end-of-run-" run-id) "no")))
(rmt:set-run-state-status run-id "new" "n/a")
;; now add non-directly referenced dependencies (i.e. waiton)
;;======================================================================
;; refactoring this block into tests:get-full-data
;;
;; What happended, this code is now duplicated in tests!?
;;
|
︙ | | |
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
|
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
|
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
-
-
-
-
+
+
|
(debug:print-info 1 *default-log-port* "Adding \"" (string-intersperse required-tests " ") "\" to the run queue"))
;; NOTE: these are all parent tests, items are not expanded yet.
(debug:print-info 4 *default-log-port* "test-records=" (hash-table->alist test-records))
(let ((reglen (configf:lookup *configdat* "setup" "runqueue")))
(if (> (length (hash-table-keys test-records)) 0)
(let* ((keep-going #t)
(run-queue-retries 5)
#;(th1 (make-thread (lambda ()
(handle-exceptions
exn
(begin
(print-call-chain)
(print " message: " ((condition-property-accessor 'exn 'message) exn)))
(runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
(any->number reglen) all-tests-registry)))
"runs:run-tests-queue"))
(th2 (make-thread (lambda () ;; BBQ: why are we visiting ALL runs here?
;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ...
(let ((run-ids (rmt:get-all-run-ids)))
(for-each (lambda (run-id)
(if keep-going
(handle-exceptions
exn
(debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)
(rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
exn
(debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)
;; lets run this only if a run has been NOT seen for more than 900 seconds
(if (> (- (current-seconds)(hash-table-ref/default *find-and-mark-incomplete-last-run* run-id 0)) 900)
(begin
(rmt:find-and-mark-incomplete run-id #f)
(hash-table-set! *find-and-mark-incomplete-last-run* run-id (current-seconds)))
)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
run-ids)))
"runs: mark-incompletes")))
;; (thread-start! th1)
(thread-start! th2)
;; (thread-join! th1)
;; just do the main stuff in the main thread
(runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
(any->number reglen) all-tests-registry)
(set! keep-going #f)
(thread-join! th2)
;; if run-count > 0 call, set -preclean and -rerun STUCK/DEAD
(if (> run-count 0) ;; handle reruns
(begin
(if (not (hash-table-ref/default flags "-preclean" #f))
(hash-table-set! flags "-preclean" #t))
(if (not (hash-table-ref/default flags "-rerun" #f))
(hash-table-set! flags "-rerun" "STUCK/DEAD,n/a,ZERO_ITEMS"))
;; recursive call to self
(runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))
(launch:end-of-run-check run-id)))
(runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))
(launch:end-of-run-check run-id)))
(debug:print-info 0 *default-log-port* "No tests to run")))
(debug:print-info 4 *default-log-port* "All done by here")
;; TODO: try putting post hook call here
; (debug:print-info 2 *default-log-port* " run-count " run-count)
; (runs:run-post-hook run-id))
; (debug:print-info 2 *default-log-port* "Not calling post hook runcount = " run-count ))
|
︙ | | |
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
|
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
|
-
+
|
;; If no resources are available just kill time and loop again
;;
((not have-resources) ;; simply try again after waiting a second
(if (runs:lownoise "no resources" 60)
(debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ..."))
;; Have gone back and forth on this but db starvation is an issue.
;; wait one second before looking again to run jobs.
(thread-sleep! 0.25)
(thread-sleep! 1) ;; changed back to 1 from 0.25
;; could have done hed tal here but doing car/cdr of newtal to rotate tests
(list (car newtal)(cdr newtal) reg reruns))
;; This is the final stage, everything is in place so launch the test
;;
((and have-resources
(or (null? prereqs-not-met)
|
︙ | | |
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
|
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
|
-
+
|
))
extras)
extras)
'())))
(waitons (delete-duplicates (append (tests:testqueue-get-waitons test-record) extra-waits) equal?))
(newtal (append tal (list hed)))
(regfull (>= (length reg) reglen))
(num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes
(num-running (rmt:get-count-tests-running-for-run-id run-id))
(testdat (make-runs:testdat
hed: hed
tal: tal
reg: reg
reruns: reruns
test-record: test-record
test-name: test-name
|
︙ | | |
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
|
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
|
+
+
+
|
;; wait for load here
(if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
(loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)
(- remtries 1)))))))
)))))
;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed
(let ((waited (runs:wait-if-seen-recently 5 "prereqs-not-met" hed item-path))) ;; if we've been down this path in the past 5 seconds - wait out the difference
(if (> waited 0)(debug:print 0 *default-log-port* "Waited for prereqs-not-met-"hed"-"item-path" for " waited "seconds.")))
(runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed
(runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))
(let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running
(if loop-list (apply loop loop-list))))
|
︙ | | |
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
|
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
|
-
+
+
-
-
+
+
-
+
+
-
+
|
(rmt:set-var (conc "lunch-complete-" run-id) "yes")
;; now *if* -run-wait we wait for all tests to be done
;; Now wait for any RUNNING tests to complete (if in run-wait mode)
;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
(thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle
(let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes
(let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id))
(prev-num-running 0))
;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running)
(if (and (or (args:get-arg "-run-wait")
(equal? (configf:lookup *configdat* "setup" "run-wait") "yes"))
(> num-running 0))
(begin
;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0))
(if (> (- (current-seconds)(hash-table-ref/default *find-and-mark-incomplete-last-run* run-id 0)) 900)
(if (> (current-seconds)(+ last-time-incomplete 900))
(let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no
;; (begin(if (> (current-seconds)(+ last-time-incomplete 900))
(let ((actual-num-running num-running)) ;; (rmt:get-count-tests-running-for-run-id run-id))) ;; why call it again?
(debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id
". Running as pid " (current-process-id) " on " (get-host-name))
(set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!
;; (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!
(rmt:find-and-mark-incomplete run-id #f)
(hash-table-set! *find-and-mark-incomplete-last-run* run-id (current-seconds))
(debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running
" tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at "
(time->string (seconds->local-time (current-seconds))))))
;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
(thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1))
(wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes
(wait-loop (rmt:get-count-tests-running-for-run-id run-id)
num-running))))
;; LET* ((test-record
;; we get here on "drop through". All done!
;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed.
;; (debug:print-info 0 *default-log-port* "Calling Post Hook")
;; (runs:run-post-hook run-id)
(debug:print-info 1 *default-log-port* "All tests launched")))
|
︙ | | |