︙ | | |
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
|
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
|
-
+
+
-
+
+
|
(let fatal-loop ((count 0))
(handle-exceptions
exn
(let ((call-chain (get-call-chain))
(msg ((condition-property-accessor 'exn 'message) exn)))
(if (< count 5)
(begin ;; this call is colliding, do some crude stuff to fix it.
(debug:print 0 *default-log-port* "ERROR: *configdat* was inaccessible! This should never happen. Retry #" count)
(debug:print 0 *default-log-port* "ERROR: *configdat* was inaccessible! This should never happen. Retry #" count
", exn=" exn)
(launch:setup force-reread: #t)
(fatal-loop (+ count 1)))
(begin
(debug:print 0 *default-log-port* "FATAL: *configdat* was inaccessible! This should never happen. Retried " count " times. Message: " msg)
(debug:print 0 *default-log-port* "FATAL: *configdat* was inaccessible! This should never happen. Retried " count
" times. Message: " msg)
(debug:print 0 *default-log-port* "Call chain:")
(with-output-to-port *default-log-port*
(lambda ()
(print "*configdat* is >>"*configdat*"<<")
(pp *configdat*)
(pp call-chain)))
|
︙ | | |
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
|
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
|
-
+
+
+
|
(args:get-arg "-one-pass"))
(exit 0))
(thread-sleep! (cond ;; BB: check with Matt. Should this sleep move
;; to cond clauses below where we determine we
;; have too many jobs running rather than each
;; time the and condition above is true (which
;; seems like always)?
;; seems like always)?
((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way
0)
((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time
(if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ..."))
10) ;; obviously haven't had any work to do for a while
(else 0)))
;; ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero?
;; (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)
;; )))
|
︙ | | |
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
|
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
|
-
+
-
+
|
(full-log-fname (conc log-dir "/" log-file)))
(if run-pre-hook
(if (null? existing-tests)
(let* ((use-log-dir (if (not (directory-exists? log-dir))
(handle-exceptions
exn
(begin
(debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir)
(debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir ", exn=" exn)
#f)
(create-directory log-dir #t)
#t)
#t))
(start-time (current-seconds))
(actual-logf (if use-log-dir full-log-fname log-file)))
(handle-exceptions
exn
(begin
(print-call-chain *default-log-port*)
(debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn))
(debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)
(debug:print 0 *default-log-port* "ERROR: failed to run pre-hook " run-pre-hook ", check the log " log-file))
(debug:print-info 0 *default-log-port* "running run-pre-hook: \"" run-pre-hook "\", log is " actual-logf)
(system (conc run-pre-hook " >> " actual-logf " 2>&1"))
(debug:print-info 0 *default-log-port* "pre-hook \"" run-pre-hook "\" took " (- (current-seconds) start-time) " seconds to run.")))
(debug:print 0 *default-log-port* "Skipping pre-hook call \"" run-pre-hook "\" as there are existing tests for this run.")))))
(define (runs:run-post-hook run-id)
|
︙ | | |
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
|
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
|
-
+
-
+
|
(if run-post-hook
;; (if (null? existing-tests)
;; (debug:print 0 *default-log-port* "Skipping post-hook call \"" run-post-hook "\" as there are existing tests for this run.")))))
(let* ((use-log-dir (if (not (directory-exists? log-dir))
(handle-exceptions
exn
(begin
(debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir)
(debug:print 0 *default-log-port* "WARNING: Failed to create " log-dir ", exn=" exn)
#f)
(create-directory log-dir #t)
#t)
#t))
(start-time (current-seconds))
(actual-logf (if use-log-dir full-log-fname log-file)))
(handle-exceptions
exn
(begin
(print-call-chain *default-log-port*)
(debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn))
(debug:print 0 *default-log-port* "Message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)
(debug:print 0 *default-log-port* "ERROR: failed to run post-hook " run-post-hook ", check the log " log-file))
(debug:print-info 0 *default-log-port* "running run-post-hook: \"" run-post-hook "\", log is " actual-logf)
(system (conc run-post-hook " >> " actual-logf " 2>&1"))
(debug:print-info 0 *default-log-port* "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run."))))))
;; return #t when all items in waitors-upon list are represented in test-patt, #f otherwise.
(define (runs:testpatts-mention-waitors-upon? test-patt waitors-upon)
|
︙ | | |
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
|
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
|
-
+
|
(th2 (make-thread (lambda () ;; BBQ: why are we visiting ALL runs here?
;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ...
(let ((run-ids (rmt:get-all-run-ids)))
(for-each (lambda (run-id)
(if keep-going
(handle-exceptions
exn
(debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id)
(debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)
(rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
run-ids)))
"runs: mark-incompletes")))
;; (thread-start! th1)
(thread-start! th2)
;; (thread-join! th1)
;; just do the main stuff in the main thread
|
︙ | | |
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
|
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
|
-
+
|
;; If no resources are available just kill time and loop again
;;
((not have-resources) ;; simply try again after waiting a second
(if (runs:lownoise "no resources" 60)
(debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ..."))
;; Have gone back and forth on this but db starvation is an issue.
;; wait one second before looking again to run jobs.
(thread-sleep! 1)
(thread-sleep! 0.25)
;; could have done hed tal here but doing car/cdr of newtal to rotate tests
(list (car newtal)(cdr newtal) reg reruns))
;; This is the final stage, everything is in place so launch the test
;;
((and have-resources
(or (null? prereqs-not-met)
|
︙ | | |
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
|
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
|
+
-
+
|
;; jobtools maxload is useful for where the full Megatest run is done on one machine
(if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
(common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
(if maxhomehostload
(common:wait-for-homehost-load maxhomehostload
(common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))
(conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))
(run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry)
(runs:incremental-print-results run-id)
(hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running)
(runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?)
;; (thread-sleep! *global-delta*)
(if (or (not (null? tal))(not (null? reg)))
|
︙ | | |
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
|
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
|
+
+
+
|
))
;; every time through the loop, increment the test/itempatt val.
;; when the min is > max-allowed and none running then force exit
;;
(define *max-tries-hash* (make-hash-table))
;; Abbreviate a list for log output.
;;
;; lst : a proper list
;;
;; Returns lst itself when it has eight or fewer elements; otherwise returns
;; a new list made of the first three elements followed by the string "...".
(define (runs:pretty-long-list lst)
  (cond
   ((<= (length lst) 8) lst)
   (else
    (let ((leading (take lst 3))
          (ellipsis (list "...")))
      (append leading ellipsis)))))
;;======================================================================
;; runs:run-tests-queue is called by runs:run-tests
;;======================================================================
;;
;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... >
(define (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests reglen-in all-tests-registry)
;; At this point the list of parent tests is expanded
|
︙ | | |
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
|
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
|
-
+
|
reruns))))
;; (loop (car tal)(cdr tal) reg reruns))))
(runs:incremental-print-results run-id)
(debug:print 4 *default-log-port* "TOP OF LOOP => "
"test-name: " test-name
"\n hed: " hed
"\n tal: " tal
"\n tal: " (runs:pretty-long-list tal)
"\n reg: " reg
"\n test-record " test-record
"\n itemdat: " itemdat
"\n items: " items
"\n item-path: " item-path
"\n waitons: " waitons
"\n num-retries: " num-retries
|
︙ | | |
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
|
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
|
+
+
+
+
+
+
|
((not items)
(debug:print-info 4 *default-log-port* "cond branch - " "rtq-2")
(debug:print-info 4 *default-log-port* "OUTER COND: (not items)")
(if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests))
(not (null? tal)))
(loop (car tal)(cdr tal) reg reruns))
(runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
;; This would be a good place to block on homehost load
(runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
(let ((loop-list (runs:process-expanded-tests runsdat testdat)))
(if loop-list (apply loop loop-list))))
;; items processed into a list but not came in as a list been processed
;;
((and (list? items) ;; thus we know our items are already calculated
|
︙ | | |
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
|
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
|
-
+
|
(begin
(mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test)
(debug:print-info 1 *default-log-port* "SKIPPING Test " full-test-name " due to " skip-test))
;;
;; Here the test is handed off to launch.scm for launch-test to complete the launch process
;;
(if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags))
(begin
(begin
(print "ERROR: Failed to launch the test. Exiting as soon as possible")
(set! *globalexitstatus* 1) ;;
(process-signal (current-process-id) signal/kill))))))))
((KILLED)
(debug:print 1 *default-log-port* "NOTE: " full-test-name " is already running or was explictly killed, use -force to launch it.")
(hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)) ;; KILLED))
((LAUNCHED REMOTEHOSTSTART RUNNING)
|
︙ | | |
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
|
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
|
-
+
|
;;
;; Build and return a hash table keyed by target, where each value is the
;; list of run records collected for that target.
;;
;; target-patts : string of target patterns separated by ","
;; runpatt      : run pattern handed unchanged to rmt:simple-get-runs
;;
;; `keys` is fetched via rmt:get-keys but is not referenced in this body.
;;
;; NOTE(review): two consecutive `let` openers below both bind `runs`, one
;; calling rmt:simple-get-runs with four arguments and one with five. This
;; looks like the before/after pair of a single change - confirm which call
;; is current and that the parentheses balance once only one remains.
(define (runs:get-hash-by-target target-patts runpatt)
(let* ((targets (string-split target-patts ","))
(keys (rmt:get-keys))
(res-ht (make-hash-table))) ;; target -> ( runrecord1 runrecord2 ... )
(for-each
(lambda (target-patt)
(let ((runs (rmt:simple-get-runs runpatt #f #f target-patt)))
(let ((runs (rmt:simple-get-runs runpatt #f #f target-patt #f)))
(for-each
(lambda (run)
(let ((target (simple-run-target run)))
;; prepend this run to whatever list is already stored for its target
(hash-table-set! res-ht target (cons run (hash-table-ref/default res-ht target '())))))
runs)))
targets)
res-ht))
|
︙ | | |
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
|
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
|
-
+
-
+
|
(debug:print 0 *default-log-port* "WARNING: directory " real-dir " does not exist")
(debug:print 0 *default-log-port* "WARNING: no real directory corrosponding to link " run-dir ", nothing done")))
(if (symbolic-link? run-dir)
(begin
(debug:print-info 1 *default-log-port* "Removing symlink " run-dir)
(handle-exceptions
exn
(debug:print-error 0 *default-log-port* " Failed to remove symlink " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue")
(debug:print-error 0 *default-log-port* " Failed to remove symlink " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue, exn=" exn)
(delete-file run-dir)))
(if (directory? run-dir)
(if (> (directory-fold (lambda (f x)(+ 1 x)) 0 run-dir) 0)
(debug:print 0 *default-log-port* "WARNING: refusing to remove " run-dir " as it is not empty")
(handle-exceptions
exn
(debug:print-error 0 *default-log-port* " Failed to remove directory " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue")
(debug:print-error 0 *default-log-port* " Failed to remove directory " run-dir ((condition-property-accessor 'exn 'message) exn) ", attempting to continue, exn=" exn)
(delete-directory run-dir)))
(if (and run-dir
(not (member run-dir (list "n/a" "/tmp/badname"))))
(debug:print 0 *default-log-port* "WARNING: not removing " run-dir " as it either doesn't exist or is not a symlink")
(debug:print 0 *default-log-port* "NOTE: the run dir for this test is undefined. Test may have already been deleted."))
))
;; Only delete the records *after* removing the directory. If things fail we have a record
|
︙ | | |
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
|
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
|
-
+
|
fail-cnt)))
(if (null? tail)
(let* ((final-doc ((sxml-modify `("testsuite" insert-into (@ (name ,xml-ts-name) (tests ,tests-count) (errors ,error-cnt) (failures ,fail-cnt)))) new-doc)))
(debug:print 0 *default-log-port* "modify attrib error=" error-cnt " fail= " fail-cnt)
(handle-exceptions
exn
(let* ((msg ((condition-property-accessor 'exn 'message) exn)))
(debug:print 0 *default-log-port* (conc "WARNING: Failed to update file" xml-path". Message:" msg)))
(debug:print 0 *default-log-port* (conc "WARNING: Failed to update file" xml-path". Message:" msg ", exn=" exn)))
(if (not (file-exists? xml-dir))
(create-directory xml-dir #t))
(if (not (rmt:no-sync-get/default keyname #f))
(begin
(rmt:no-sync-set keyname "on")
(debug:print 0 *default-log-port* "creating xml at " xml-path)
|
︙ | | |
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
|
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
|
-
+
|
(debug:print-info 0 *default-log-port* "No cached megatest or runconfigs files found. None removed.")
(begin
(debug:print-info 0 *default-log-port* "Removing cached files:\n " (string-intersperse files "\n "))
(for-each
(lambda (f)
(handle-exceptions
exn
(debug:print 0 *default-log-port* "WARNING: Failed to remove file " f)
(debug:print 0 *default-log-port* "WARNING: Failed to remove file " f ", exn=" exn)
(delete-file f)))
files))))
(debug:print-error 0 *default-log-port* "-clean-cache requires -runname."))
(debug:print-error 0 *default-log-port* "-clean-cache requires -target or -reqtarg")))
|