Megatest

Diff
Login

Differences From Artifact [89f36cc1cb]:

To Artifact [d6ca8ef849]:


98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
		(if (runs:lownoise "runners-softlock-wait" 360)
		    (debug:print-info 0 *default-log-port* "Other runners in flight, giving up some time..."))
		(thread-sleep! 2))
	      (begin
		(if (runs:lownoise "runners-softlock-nowait" 360)
		    (debug:print-info 0 *default-log-port* "No runners in flight, updating softlock"))
		(let* ((ouf (open-output-file my-lock-file)))
		  (with-output-to-port ouf
		    (lambda ()(print (current-seconds))))
		  (close-output-port ouf))))
	  (runs:dat-last-fuel-check-set! rdat (current-seconds))))))
  
;; Fourth try, do accounting through time
;;
(define (runs:parallel-runners-mgmt rdat)
  (let ((time-to-check (configf:lookup-number *configdat* "runners" "time-to-check" default: 10)) ;; 28
	(time-to-wait  (configf:lookup-number *configdat* "runners" "time-to-wait" default: 30))
	(now-time      (current-seconds)))
    (if (> (- now-time (runs:dat-last-fuel-check rdat)) time-to-check) ;; time to check
	(runs:wait-on-softlock rdat "runners"))))







|
<



|







98
99
100
101
102
103
104
105

106
107
108
109
110
111
112
113
114
115
116
		(if (runs:lownoise "runners-softlock-wait" 360)
		    (debug:print-info 0 *default-log-port* "Other runners in flight, giving up some time..."))
		(thread-sleep! 2))
	      (begin
		(if (runs:lownoise "runners-softlock-nowait" 360)
		    (debug:print-info 0 *default-log-port* "No runners in flight, updating softlock"))
		(let* ((ouf (open-output-file my-lock-file)))
		  (with-output-to-port ouf (lambda ()(print (current-seconds))))

		  (close-output-port ouf))))
	  (runs:dat-last-fuel-check-set! rdat (current-seconds))))))
  
;; Fourth try, do accounting through time....
;;
(define (runs:parallel-runners-mgmt rdat)
  (let ((time-to-check (configf:lookup-number *configdat* "runners" "time-to-check" default: 10)) ;; 28
	(time-to-wait  (configf:lookup-number *configdat* "runners" "time-to-wait" default: 30))
	(now-time      (current-seconds)))
    (if (> (- now-time (runs:dat-last-fuel-check rdat)) time-to-check) ;; time to check
	(runs:wait-on-softlock rdat "runners"))))
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
		  (launch:setup force-reread: #t)
		  (fatal-loop (+ count 1))) 
		(begin
		  (debug:print 0 *default-log-port* "FATAL: *configdat* was inaccessible! This should never happen. Retried " count
			       " times. Message: " msg)
		  (debug:print 0 *default-log-port* "Call chain:")
		  (with-output-to-port *default-log-port*

                    (lambda ()
                      (print "*configdat* is >>"*configdat*"<<")
                      (pp *configdat*)
                      (pp call-chain)))
                  
		  (exit 1))))
          ;;(bb-check-path msg: "runs:set-megatest-env-vars block 1.5")
          (when (or (not *configdat*) (not (hash-table? *configdat*)))
              (debug:print 0 *default-log-port* "WARNING: *configdat* was inaccessible! This should never happen.  Brute force reread.")







<
|
|







225
226
227
228
229
230
231

232
233
234
235
236
237
238
239
240
		  (launch:setup force-reread: #t)
		  (fatal-loop (+ count 1))) 
		(begin
		  (debug:print 0 *default-log-port* "FATAL: *configdat* was inaccessible! This should never happen. Retried " count
			       " times. Message: " msg)
		  (debug:print 0 *default-log-port* "Call chain:")
		  (with-output-to-port *default-log-port*

		    (lambda ()
		      (print "*configdat* is >>"*configdat*"<<")
                      (pp *configdat*)
                      (pp call-chain)))
                  
		  (exit 1))))
          ;;(bb-check-path msg: "runs:set-megatest-env-vars block 1.5")
          (when (or (not *configdat*) (not (hash-table? *configdat*)))
              (debug:print 0 *default-log-port* "WARNING: *configdat* was inaccessible! This should never happen.  Brute force reread.")
310
311
312
313
314
315
316















317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
	(currtime (current-seconds)))
    (if (> (- currtime lasttime) waitval)
	(begin
	  (hash-table-set! *runs:denoise* key currtime)
	  #t)
	#f)))
















(define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)

  ;; Take advantage of a good place to exit if running the one-pass methodology
  (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20)
	   (args:get-arg "-one-pass"))
      (exit 0))

  (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))

  (let* ((num-running             (rmt:get-count-tests-running run-id))
	 (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
	 (job-group-limit         (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup)))
				    (if (string? jobg-count)
					(string->number jobg-count)
					jobg-count))))
    (if (> (+ num-running num-running-in-jobgroup) 0)
	(runs:inc-can-run-more-tests-count runsdat)) ;; (set! *runs:can-run-more-tests-count* (+ *runs:can-run-more-tests-count* 1)))
    (if (not (eq? *last-num-running-tests* num-running))
	(begin







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>











|







308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
	(currtime (current-seconds)))
    (if (> (- currtime lasttime) waitval)
	(begin
	  (hash-table-set! *runs:denoise* key currtime)
	  #t)
	#f)))

(define *last-test-launch* 0)
(define *too-soon-delays* (make-hash-table))

;; to-soon delay, when matching event happened in less than dseconds delay wseconds
;;
(define (runs:too-soon-delay key dseconds wseconds)
  (let* ((last-time (hash-table-ref/default *too-soon-delays* key #f)))
    (if (and last-time
	     (< (- (current-seconds) last-time) dseconds))
	(begin
	  (if (runs:lownoise (conc "too-soon-delay"key) 60)
	      (debug:print-info 2 *default-log-port* "Polling throttle for "key))
	  (thread-sleep! wseconds)))
    (hash-table-set! *too-soon-delays* key (current-seconds))))

(define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)

  ;; Take advantage of a good place to exit if running the one-pass methodology
  (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20)
	   (args:get-arg "-one-pass"))
      (exit 0))

  (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))

  (let* ((num-running             (rmt:get-count-tests-running run-id))
	 (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
	 (job-group-limit         (let ((jobg-count (configf:lookup-number *configdat* "jobgroups" jobgroup)))
				    (if (string? jobg-count)
					(string->number jobg-count)
					jobg-count))))
    (if (> (+ num-running num-running-in-jobgroup) 0)
	(runs:inc-can-run-more-tests-count runsdat)) ;; (set! *runs:can-run-more-tests-count* (+ *runs:can-run-more-tests-count* 1)))
    (if (not (eq? *last-num-running-tests* num-running))
	(begin
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
;; run-count is passed from megatest.scm as configf:lookup *configdat* "setup" "reruns", or defaults to 1.
(define (runs:run-tests target runname test-patts user flags #!key (run-count 1)) ;; test-names
  (let* ((keys               (keys:config-get-fields *configdat*))
	 (keyvals            (keys:target->keyval keys target))
	 (run-id             (rmt:register-run keyvals runname "new" "n/a" user (args:get-arg "-contour")))  ;;  test-name)))
	 ;; (deferred          '()) ;; delay running these since they have a waiton clause
	 (runconfigf         (conc  *toppath* "/runconfigs.config"))
         (dbfile             (conc  *toppath* "/megatest.db"))
         (readonly-mode      (not (file-write-access? dbfile)))
	 (test-records       (make-hash-table))
	 ;; need to process runconfigs before generating these lists
	 (all-tests-registry #f)  ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names
	 (all-test-names     #f)  ;; (hash-table-keys all-tests-registry))
	 (test-names         #f)  ;; Generated by a call to (tests:filter-test-names all-test-names test-patts))
	 (required-tests     #f)  ;; Put fully qualified test/testpath names in this list to be done
         (waitors-upon       (make-hash-table)) ;; given a test, return list of tests waiting upon this test.
	 (task-key           (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id)))
	 ;; (tdbdat             (tasks:open-db))
	 (config-reruns      (let ((x (configf:lookup *configdat* "setup" "reruns")))
			       (if x (string->number x) #f)))
	 (allowed-tests      #f)
	 (runconf            #f))

    ;; check if readonly
    (when readonly-mode
      (debug:print-error 0 *default-log-port* "megatest.db is readonly.  Cannot proceed.")
      (exit 1))

    ;; per user request. If less than 100Meg space on dbdir partition, bail out with error
    ;; this will reduce issues in database corruption
    (common:check-db-dir-and-exit-if-insufficient)

    ;; override the number of reruns from the configs
    ;; this needs to be done at the place where is first runs:run-tests called 
    ;(if (and config-reruns
	  ;   (> run-count config-reruns))
 	  ;(set! run-count config-reruns))

    ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))

    (let ((sighand (lambda (signum)
		     ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting
		     (set! *time-to-exit* #t)
		     (print "Received signal " signum ", cleaning up before exit. Please wait...")
		     (let ((th1 (make-thread (lambda ()
					       ;; (let ((tdbdat (tasks:open-db)))
						 (rmt:tasks-set-state-given-param-key task-key "killed") ;; )
					       (print "Killed by signal " signum ". Exiting")
					       (thread-sleep! 3)
					       (exit))))
			   (th2 (make-thread (lambda ()
					       (thread-sleep! 5)
					       (debug:print 0 *default-log-port* "Done")
					       (exit 4)))))
		       (thread-start! th2)







|
|
















|

















|



|







519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
;; run-count is passed from megatest.scm as configf:lookup *configdat* "setup" "reruns", or defaults to 1.
(define (runs:run-tests target runname test-patts user flags #!key (run-count 1)) ;; test-names
  (let* ((keys               (keys:config-get-fields *configdat*))
	 (keyvals            (keys:target->keyval keys target))
	 (run-id             (rmt:register-run keyvals runname "new" "n/a" user (args:get-arg "-contour")))  ;;  test-name)))
	 ;; (deferred          '()) ;; delay running these since they have a waiton clause
	 (runconfigf         (conc  *toppath* "/runconfigs.config"))
         (mtconfig           (conc  *toppath* "/megatest.config"))
         (readonly-mode      (not (file-write-access? mtconfig)))
	 (test-records       (make-hash-table))
	 ;; need to process runconfigs before generating these lists
	 (all-tests-registry #f)  ;; (tests:get-all)) ;; (tests:get-valid-tests (make-hash-table) test-search-path)) ;; all valid tests to check waiton names
	 (all-test-names     #f)  ;; (hash-table-keys all-tests-registry))
	 (test-names         #f)  ;; Generated by a call to (tests:filter-test-names all-test-names test-patts))
	 (required-tests     #f)  ;; Put fully qualified test/testpath names in this list to be done
         (waitors-upon       (make-hash-table)) ;; given a test, return list of tests waiting upon this test.
	 (task-key           (conc (hash-table->alist flags) " " (get-host-name) " " (current-process-id)))
	 ;; (tdbdat             (tasks:open-db))
	 (config-reruns      (let ((x (configf:lookup *configdat* "setup" "reruns")))
			       (if x (string->number x) #f)))
	 (allowed-tests      #f)
	 (runconf            #f))

    ;; check if readonly
    (when readonly-mode
      (debug:print-error 0 *default-log-port* "Megatest database is readonly.  Cannot proceed.")
      (exit 1))

    ;; per user request. If less than 100Meg space on dbdir partition, bail out with error
    ;; this will reduce issues in database corruption
    (common:check-db-dir-and-exit-if-insufficient)

    ;; override the number of reruns from the configs
    ;; this needs to be done at the place where is first runs:run-tests called 
    ;(if (and config-reruns
	  ;   (> run-count config-reruns))
 	  ;(set! run-count config-reruns))

    ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))

    (let ((sighand (lambda (signum)
		     ;; (signal-mask! signum) ;; to mask or not? seems to cause issues in exiting
		     (set! *time-to-exit* #t)
		     (debug:print 0 *default-log-port* "Received signal " signum ", cleaning up before exit. Please wait...")
		     (let ((th1 (make-thread (lambda ()
					       ;; (let ((tdbdat (tasks:open-db)))
						 (rmt:tasks-set-state-given-param-key task-key "killed") ;; )
					       (debug:print 0 *default-log-port* "Killed by signal " signum ". Exiting")
					       (thread-sleep! 3)
					       (exit))))
			   (th2 (make-thread (lambda ()
					       (thread-sleep! 5)
					       (debug:print 0 *default-log-port* "Done")
					       (exit 4)))))
		       (thread-start! th2)
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837

    (if (not (null? required-tests))
	(debug:print-info 1 *default-log-port* "Adding \"" (string-intersperse required-tests " ") "\" to the run queue"))
    ;; NOTE: these are all parent tests, items are not expanded yet.
    (debug:print-info 4 *default-log-port* "test-records=" (hash-table->alist test-records))
    (let ((reglen (configf:lookup *configdat* "setup" "runqueue")))
      (if (> (length (hash-table-keys test-records)) 0)
	  (let* ((keep-going        #t)
		 (run-queue-retries 5)
		 #;(th1        (make-thread (lambda ()
					    (handle-exceptions
						exn
						(begin
						  (print-call-chain)
						  (print " message: " ((condition-property-accessor 'exn 'message) exn)))
					      (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
								    (any->number reglen) all-tests-registry)))
					  "runs:run-tests-queue"))
		 (th2        (make-thread (lambda ()			 ;; BBQ: why are we visiting ALL runs here?	    
					    ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ...
					    (let ((run-ids (rmt:get-all-run-ids)))
					      (for-each (lambda (run-id)
							  (if keep-going
							      (handle-exceptions
							       exn
							       (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)
							       (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
							run-ids)))
					  "runs: mark-incompletes")))
	    ;; (thread-start! th1)
	    (thread-start! th2)
	    ;; (thread-join! th1)
	    ;; just do the main stuff in the main thread
	    (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
								    (any->number reglen) all-tests-registry)
	    (set! keep-going #f)
	    (thread-join! th2)
	    ;; if run-count > 0 call, set -preclean and -rerun STUCK/DEAD
	    (if (> run-count 0) ;; handle reruns
		(begin
		  (if (not (hash-table-ref/default flags "-preclean" #f))
		      (hash-table-set! flags "-preclean" #t))
		  (if (not (hash-table-ref/default flags "-rerun" #f))
		      (hash-table-set! flags "-rerun" "ABORT,STUCK/DEAD,n/a,ZERO_ITEMS"))
		  ;; recursive call to self
      (runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))
                  (launch:end-of-run-check run-id)))
	  (debug:print-info 0 *default-log-port* "No tests to run")))
    (debug:print-info 4 *default-log-port* "All done by here")
    ;; TODO: try putting post hook call here
      
    ;  (debug:print-info 2 *default-log-port* " run-count " run-count)
    ;  (runs:run-post-hook run-id))
    ;  (debug:print-info 2 *default-log-port* "Not calling post hook runcount = " run-count ))   







|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<



<
<








|
|







797
798
799
800
801
802
803
804
























805
806
807


808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824

    (if (not (null? required-tests))
	(debug:print-info 1 *default-log-port* "Adding \"" (string-intersperse required-tests " ") "\" to the run queue"))
    ;; NOTE: these are all parent tests, items are not expanded yet.
    (debug:print-info 4 *default-log-port* "test-records=" (hash-table->alist test-records))
    (let ((reglen (configf:lookup *configdat* "setup" "runqueue")))
      (if (> (length (hash-table-keys test-records)) 0)
	  (let* ()
























	    ;; just do the main stuff in the main thread
	    (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
								    (any->number reglen) all-tests-registry)


	    ;; if run-count > 0 call, set -preclean and -rerun STUCK/DEAD
	    (if (> run-count 0) ;; handle reruns
		(begin
		  (if (not (hash-table-ref/default flags "-preclean" #f))
		      (hash-table-set! flags "-preclean" #t))
		  (if (not (hash-table-ref/default flags "-rerun" #f))
		      (hash-table-set! flags "-rerun" "ABORT,STUCK/DEAD,n/a,ZERO_ITEMS"))
		  ;; recursive call to self
                  (runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))
                 (launch:end-of-run-check run-id)))
	  (debug:print-info 0 *default-log-port* "No tests to run")))
    (debug:print-info 4 *default-log-port* "All done by here")
    ;; TODO: try putting post hook call here
      
    ;  (debug:print-info 2 *default-log-port* " run-count " run-count)
    ;  (runs:run-post-hook run-id))
    ;  (debug:print-info 2 *default-log-port* "Not calling post hook runcount = " run-count ))   
1285
1286
1287
1288
1289
1290
1291

1292
1293
1294
1295
1296
1297
1298
		    (not (member 'exclusive testmode)))))
      ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path))
      ;; we are going to reset all the counters for test retries by setting a new hash table
      ;; this means they will increment only when nothing can be run
      (set! *max-tries-hash* (make-hash-table))
      
      (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat testdat)

      (runs:incremental-print-results run-id)
      (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running)
      (runs:shrink-can-run-more-tests-count runsdat)  ;; DELAY TWEAKER (still needed?)
      ;; (thread-sleep! *global-delta*)
      (if (or (not (null? tal))(not (null? reg)))
	  (runs:loop-values tal reg reglen regfull reruns) ;; hed should be dropped at this time
	  #f))







>







1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
		    (not (member 'exclusive testmode)))))
      ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path))
      ;; we are going to reset all the counters for test retries by setting a new hash table
      ;; this means they will increment only when nothing can be run
      (set! *max-tries-hash* (make-hash-table))
      
      (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat testdat)
      (set! *last-test-launch* (current-seconds))
      (runs:incremental-print-results run-id)
      (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running)
      (runs:shrink-can-run-more-tests-count runsdat)  ;; DELAY TWEAKER (still needed?)
      ;; (thread-sleep! *global-delta*)
      (if (or (not (null? tal))(not (null? reg)))
	  (runs:loop-values tal reg reglen regfull reruns) ;; hed should be dropped at this time
	  #f))
1500
1501
1502
1503
1504
1505
1506


1507
1508
1509
1510
1511
1512
1513
;; every time though the loop increment the test/itempatt val.
;; when the min is > max-allowed and none running then force exit
;;
(define *max-tries-hash* (make-hash-table))

(define (runs:pretty-long-list lst)
   (if (> (length lst) 8)(append (take lst 3)(list "...")) lst))



;;======================================================================
;; runs:run-tests-queue is called by runs:run-tests
;;======================================================================
;;
;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... >
(define (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests reglen-in all-tests-registry)







>
>







1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
;; every time though the loop increment the test/itempatt val.
;; when the min is > max-allowed and none running then force exit
;;
(define *max-tries-hash* (make-hash-table))

(define (runs:pretty-long-list lst)
   (if (> (length lst) 8)(append (take lst 3)(list "...")) lst))

(define *last-loop-time-ms* 0)

;;======================================================================
;; runs:run-tests-queue is called by runs:run-tests
;;======================================================================
;;
;; test-records is a hash table testname:item_path => vector < testname testconfig waitons priority items-info ... >
(define (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests reglen-in all-tests-registry)
1638
1639
1640
1641
1642
1643
1644












1645
1646





1647
1648
1649
1650
1651
1652
1653
			   jobgroup:    jobgroup
			   waitons:     waitons
			   testmode:    testmode
			   newtal:      newtal
			   itemmaps:    itemmaps
			   ;; prereqs-not-met: prereqs-not-met
			   )))












	(runs:dat-regfull-set! runsdat regfull)
    





	(if (> num-running 0)
            (set! last-time-some-running (current-seconds)))

        (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))
            (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
	;; (debug:print 0 *default-log-port* "max-tries-hash: " (hash-table->alist *max-tries-hash*))








>
>
>
>
>
>
>
>
>
>
>
>

|
>
>
>
>
>







1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
			   jobgroup:    jobgroup
			   waitons:     waitons
			   testmode:    testmode
			   newtal:      newtal
			   itemmaps:    itemmaps
			   ;; prereqs-not-met: prereqs-not-met
			   )))

	;; too-tight loop detection and delay, this might hide issues
	;; that occur in long run times. Consider commenting when debugging
	;; 
	(if (and (>= num-running max-concurrent-jobs)
		 (< (- (current-milliseconds) *last-loop-time-ms*) 500))
	    (begin
	      (if (runs:lownoise "too-tight-loop" 5)
		  (debug:print-info 2 *default-log-port* "Excessively fast loop, delaying 1/2 second"))
	      (thread-sleep! 0.5)))
	(set! *last-loop-time-ms* (current-milliseconds))
     
	(runs:dat-regfull-set! runsdat regfull)


	(if (> (- (current-seconds) *last-test-launch*) 5)        ;; be pretty aggressive for five seconds after
	    (runs:too-soon-delay (conc "loop delay " hed) 1 0.6)    ;; starting a test then apply more delay
	    (runs:too-soon-delay (conc "loop delay " hed) 1 0.1)) 
	
	(if (> num-running 0)
            (set! last-time-some-running (current-seconds)))

        (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))
            (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
	;; (debug:print 0 *default-log-port* "max-tries-hash: " (hash-table->alist *max-tries-hash*))

1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
	 ))) ;; end loop on sorted test names
    ;; this is the point where everything is launched and now you can mark the run in metadata table as all launched 
    (rmt:set-var (conc "lunch-complete-" run-id) "yes")  
        
    ;; now *if* -run-wait we wait for all tests to be done
    ;; Now wait for any RUNNING tests to complete (if in run-wait mode)
    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
    (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle
    
    (let wait-loop ((num-running      (rmt:get-count-tests-running-for-run-id run-id))
		    (prev-num-running 0))
      ;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running)
      (if (and (or (args:get-arg "-run-wait")
		   (equal? (configf:lookup *configdat* "setup" "run-wait") "yes"))
	       (> num-running 0))







|







1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
	 ))) ;; end loop on sorted test names
    ;; this is the point where everything is launched and now you can mark the run in metadata table as all launched 
    (rmt:set-var (conc "lunch-complete-" run-id) "yes")  
        
    ;; now *if* -run-wait we wait for all tests to be done
    ;; Now wait for any RUNNING tests to complete (if in run-wait mode)
    ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
    (thread-sleep! 0.1) ;; I think there is a race condition here. Let states/statuses settle
    
    (let wait-loop ((num-running      (rmt:get-count-tests-running-for-run-id run-id))
		    (prev-num-running 0))
      ;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running)
      (if (and (or (args:get-arg "-run-wait")
		   (equal? (configf:lookup *configdat* "setup" "run-wait") "yes"))
	       (> num-running 0))
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
		     (begin
		       ;; wait for less than max jobs here
		       (if (runs:dat-wait-for-jobs-function runsdat)
			   ((runs:dat-wait-for-jobs-function runsdat) testdat-rec))
		       
		       (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags))
			   (begin
			     (print "ERROR: Failed to launch the test. Exiting as soon as possible")
			     (set! *globalexitstatus* 1) ;; 
			     (process-signal (current-process-id) signal/kill))
			   )
		       ;; wait again here?
		       ))))))
	((KILLED) 
	 (debug:print 1 *default-log-port* "NOTE: " full-test-name " is already running or was explictly killed, use -force to launch it.")







|







2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
		     (begin
		       ;; wait for less than max jobs here
		       (if (runs:dat-wait-for-jobs-function runsdat)
			   ((runs:dat-wait-for-jobs-function runsdat) testdat-rec))
		       
		       (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags))
			   (begin
			     (debug:print 0 *default-log-port* "ERROR: Failed to launch the test. Exiting as soon as possible")
			     (set! *globalexitstatus* 1) ;; 
			     (process-signal (current-process-id) signal/kill))
			   )
		       ;; wait again here?
		       ))))))
	((KILLED) 
	 (debug:print 1 *default-log-port* "NOTE: " full-test-name " is already running or was explictly killed, use -force to launch it.")
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
	 (state-status (if (string? new-state-status) (string-split new-state-status ",") '(#f #f)))
	 (rp-mutex     (make-mutex))
	 (bup-mutex    (make-mutex))
         (keep-records (args:get-arg "-keep-records")) ;; used in conjunction with -remove-runs to keep the records, TODO: consolidate this with "mode".
	 (test-records '())) ;; for tasks that we wish to operate on all tests in one fell swoop

    (let* ((write-access-actions '(remove-runs set-state-status archive run-wait kill-runs))
           (dbfile             (conc  *toppath* "/megatest.db"))
           (readonly-mode      (not (file-write-access? dbfile))))
      (when (and readonly-mode
                 (member action write-access-actions))
        (debug:print-error 0 *default-log-port* "megatest.db is readonly.  Cannot proceed with action ["action"] in which write-access isrequired .")
        (exit 1)))
    
    (debug:print-info 4 *default-log-port* "runs:operate-on => Header: " header " action: " action " new-state-status: " new-state-status)
    (if (> 2 (length state-status))
	(begin
	  (debug:print-error 0 *default-log-port* "the parameter to -set-state-status is a comma delimited string. E.g. COMPLETED,FAIL")
	  (exit)))







|



|







2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
	 (state-status (if (string? new-state-status) (string-split new-state-status ",") '(#f #f)))
	 (rp-mutex     (make-mutex))
	 (bup-mutex    (make-mutex))
         (keep-records (args:get-arg "-keep-records")) ;; used in conjunction with -remove-runs to keep the records, TODO: consolidate this with "mode".
	 (test-records '())) ;; for tasks that we wish to operate on all tests in one fell swoop

    (let* ((write-access-actions '(remove-runs set-state-status archive run-wait kill-runs))
           (dbfile             (conc  *toppath* "/.megatest/main.db"))
           (readonly-mode      (not (file-write-access? dbfile))))
      (when (and readonly-mode
                 (member action write-access-actions))
        (debug:print-error 0 *default-log-port* dbfile " is readonly.  Cannot proceed with action ["action"] in which write-access isrequired .")
        (exit 1)))
    
    (debug:print-info 4 *default-log-port* "runs:operate-on => Header: " header " action: " action " new-state-status: " new-state-status)
    (if (> 2 (length state-status))
	(begin
	  (debug:print-error 0 *default-log-port* "the parameter to -set-state-status is a comma delimited string. E.g. COMPLETED,FAIL")
	  (exit)))
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
	 ))
     runs)
    ;; special case - archive get
    (if (equal? (args:get-arg "-archive") "get")
	(archive:bup-get-data "get" #f #f test-records rp-mutex bup-mutex))
    (if (or (equal? (args:get-arg "-archive") "save") (equal? (args:get-arg "-archive") "save-remove"))
        (begin 
             (print "db archive started")  
             (archive:megatest-db target runnamepatt)
             (print "db archived")))
    )
  #t
  )

(define (runs:remove-test-directory test mode) ;; remove-data-only)
  (let* ((run-dir       (db:test-get-rundir test))    ;; run dir is from the link tree
	 (real-dir      (if (common:file-exists? run-dir)







|

|







2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
	 ))
     runs)
    ;; special case - archive get
    (if (equal? (args:get-arg "-archive") "get")
	(archive:bup-get-data "get" #f #f test-records rp-mutex bup-mutex))
    (if (or (equal? (args:get-arg "-archive") "save") (equal? (args:get-arg "-archive") "save-remove"))
        (begin 
             (debug:print 0 *default-log-port* "db archive started")  
             (archive:megatest-db target runnamepatt)
             (debug:print 0 *default-log-port* "db archived")))
    )
  #t
  )

(define (runs:remove-test-directory test mode) ;; remove-data-only)
  (let* ((run-dir       (db:test-get-rundir test))    ;; run dir is from the link tree
	 (real-dir      (if (common:file-exists? run-dir)
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
     (lambda (key)
       (let* ((idx (cadr key))
	      (fld (car  key))
	      (val (configf:lookup test-conf "test_meta" fld)))
	 ;; (debug:print 5 *default-log-port* "idx: " idx " fld: " fld " val: " val)
	 (if (and val (not (equal? (vector-ref currrecord idx) val)))
	     (begin
	       (print "Updating " test-name " " fld " to " val)
	       (rmt:testmeta-update-field test-name fld val)))))
     '(("author" 2)("owner" 3)("description" 4)("reviewed" 5)("tags" 9)("jobgroup" 10)))))

;; find tests with matching tags, tagpatt is a string "tagpatt1,tagpatt2%, ..."
;;
(define (runs:get-tests-matching-tags tagpatt)
  (let* ((tagdata (rmt:get-tests-tags))







|







2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
     (lambda (key)
       (let* ((idx (cadr key))
	      (fld (car  key))
	      (val (configf:lookup test-conf "test_meta" fld)))
	 ;; (debug:print 5 *default-log-port* "idx: " idx " fld: " fld " val: " val)
	 (if (and val (not (equal? (vector-ref currrecord idx) val)))
	     (begin
	       (debug:print 0 *default-log-port* "Updating " test-name " " fld " to " val)
	       (rmt:testmeta-update-field test-name fld val)))))
     '(("author" 2)("owner" 3)("description" 4)("reviewed" 5)("tags" 9)("jobgroup" 10)))))

;; find tests with matching tags, tagpatt is a string "tagpatt1,tagpatt2%, ..."
;;
(define (runs:get-tests-matching-tags tagpatt)
  (let* ((tagdata (rmt:get-tests-tags))