Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -817,14 +817,14 @@
 
 (define *common:ended-states*       ;; states which indicate the test is stopped and will not proceed
   '("COMPLETED" "ARCHIVED" "KILLED" "KILLREQ" "STUCK" "INCOMPLETE" ))
 
 (define *common:badly-ended-states* ;; these roll up as CHECK, i.e. results need to be checked
-  '("KILLED" "KILLREQ" "STUCK" "INCOMPLETE" "DEAD"))
+  '("KILLED" "KILLREQ" "STUCK" "INCOMPLETE" "DEAD" "CHECK"))
 
 (define *common:well-ended-states* ;; an item's prereq in this state allows item to proceed
-  '("PASS" "WARN" "CHECK" "WAIVED" "SKIP"))
+  '("PASS" "WARN" "WAIVED" "SKIP"))
 
 ;; BBnote: *common:running-states* used from db:set-state-status-and-roll-up-items
 (define *common:running-states*     ;; test is either running or can be run
   '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED" "STARTED"))
 
@@ -2208,37 +2208,31 @@
 ;;
 (define (common:wait-for-cpuload maxnormload numcpus-in
 				 #!key (count 1000)
 				 (msg #f)(remote-host #f)(num-tries 5))
   (let* ((loadavg     (common:get-cpu-load remote-host))
-	  ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
-	 (numcpus     (if (<= 1 numcpus-in)
-			  (common:get-num-cpus remote-host)
-			  numcpus-in))
-	 (first       (car loadavg))
-	 (next        (cadr loadavg))
-	 (adjmaxload  (* maxnormload (max 1 numcpus))) ;; possible bug
-						   ;; where numcpus
-						   ;; (or could be
-						   ;; maxload) is
-						   ;; zero, crude
-						   ;; fallback is to
-						   ;; at least use 1
-	 ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit
-	 ;; etc.
-	 (effective-load    (common:get-intercept first next))
-	 (recommended-delay (common:get-delay effective-load numcpus))
-	 (effective-host    (or remote-host "localhost"))
-	 (normalized-effective-load (/ effective-load numcpus))
-	 (will-wait                 (> normalized-effective-load maxnormload)))
-    (if (> recommended-delay 1) 
-	(let* ((actual-delay (min recommended-delay 30)))
-	  (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load"))
-	      (debug:print-info 0 *default-log-port* "Load control, delaying "
+    ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
+    (numcpus     (if (<= 1 numcpus-in)
+    (common:get-num-cpus remote-host) numcpus-in))
+    (first       (car loadavg))
+    (next        (cadr loadavg))
+    (adjmaxload  (* maxnormload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude
+						  ;; fallback is to at least use 1
+    ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit
+    ;; etc.
+    (effective-load    (common:get-intercept first next))
+    (recommended-delay (common:get-delay effective-load numcpus))
+    (effective-host    (or remote-host "localhost"))
+    (normalized-effective-load (/ effective-load numcpus))
+    (will-wait                 (> normalized-effective-load maxnormload)))
+    (if (and will-wait (> recommended-delay 1)) 
+      (let* ((actual-delay (min recommended-delay 30)))
+        (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load"))
+          (debug:print-info 0 *default-log-port* "Load control, delaying "
 				actual-delay " seconds to maintain safe load. current normalized effective load is "
 				normalized-effective-load". maxnormload = " maxnormload " numcpus = " numcpus " loadavg = " loadavg " effective-load = " effective-load))
-	  (thread-sleep! actual-delay)))
+	(thread-sleep! actual-delay)))
     
     (cond
      ;; bad data, try again to get the data
      ((not will-wait)
       (if (common:low-noise-print 3600 (conc (round normalized-effective-load) "-load-acceptable-" effective-host))

Index: configf.scm
==================================================================
--- configf.scm
+++ configf.scm
@@ -52,10 +52,14 @@
   (let ((newalist (filter (lambda (x)(not (equal? key (car x)))) alist)))
     (append newalist (list (if metadata
 			       (list key val metadata)
 			       (list key val))))))
 
+;; this is used in megatestqa/ext.scm.
+;; remove it from here and there by 12/31/21
+(define config:assoc-safe-add configf:assoc-safe-add)
+
 (define (configf:section-var-set! cfgdat section-name var value #!key (metadata #f))
   (hash-table-set! cfgdat section-name
 		   (configf:assoc-safe-add
 		    (hash-table-ref/default cfgdat section-name '())
 		    var value metadata: metadata)))
@@ -144,11 +148,11 @@
 		     (set! result (conc "#{(" cmdtype ") "  cmd "}"))))
 		(case cmdsym
 		  ((system shell scheme)
 		   (let ((delta (- (current-seconds) start-time)))
 		     (if (> delta 2)
-			 (debug:print-info 0 *default-log-port* "for line \"" l "\"\n command:  " cmd " took " delta " seconds to run with output:\n   " result)
+			 (debug:print-info 2 *default-log-port* "for line \"" l "\"\n command:  " cmd " took " delta " seconds to run with output:\n   " result)
 			 (debug:print-info 9 *default-log-port* "for line \"" l "\"\n command:  " cmd " took " delta " seconds to run with output:\n   " result)))))
 		(loop (conc prestr result poststr)))
 	      res))
 	res)))
 
@@ -531,12 +535,12 @@
 (define configf:read-file read-config)
 
 ;; safely look up a value that is expected to be a number, return
 ;; a default (#f unless provided)
 ;;
-(define (configf:lookup-number cfdat section varname #!key (default #f))
-  (let* ((val (configf:lookup *configdat* section varname))
+(define (configf:lookup-number cfgdat section varname #!key (default #f))
+  (let* ((val (configf:lookup cfgdat section varname))
          (res (if val
                   (string->number (string-substitute "\\s+" "" val #t))
                   #f)))
     (cond
      (res  res)

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -4659,11 +4659,11 @@
 
 
 
 ;; the new prereqs calculation, looks also at itempath if specified
 ;; all prereqs must be met
-;;    if prereq test with itempath='' is COMPLETED and PASS, WARN, CHECK, or WAIVED then prereq is met
+;;    if prereq test with itempath='' is in common:well-ended-states, then prereq is met
 ;;    if prereq test with itempath=ref-item-path and COMPLETED with PASS, WARN, CHECK, or WAIVED then prereq is met
 ;;
 ;; Note: mode 'normal means that tests must be COMPLETED and ok (i.e. PASS, WARN, CHECK, SKIP or WAIVED)
 ;;       mode 'toplevel means that tests must be COMPLETED only
 ;;       mode 'itemmatch or 'itemwait means that tests items must be COMPLETED and (PASS|WARN|WAIVED|CHECK) [[ NB// NOT IMPLEMENTED YET ]]
@@ -4674,10 +4674,11 @@
 ;;    2. any NOT completed and good? if yes => return those as prereqs not met, if no => return null list
 ;; 
 ;; (define (db:get-prereqs-not-met dbstruct run-id waitons ref-item-path mode)
 (define (db:get-prereqs-not-met dbstruct run-id waitons ref-test-name ref-item-path mode itemmaps) ;; #!key (mode '(normal))(itemmap #f))
   ;; BBnote - rollup of an itemized test's overall state/status done in db:set-state-status-and-roll-up-items
+  (debug:print 4 *default-log-port* "db:get-prereqs-not-met: " waitons)
   (append
    (if (member 'exclusive mode)
        (let ((running-tests (db:get-tests-for-run dbstruct
 						  #f  ;; run-id of #f means for all runs. 
 						  (if (string=? ref-item-path "")   ;; testpatt
@@ -4700,10 +4701,12 @@
 	;;	    (conc (db:test-get-testname testdat)
 	;;		  "/"
 	;;		  (db:test-get-item-path testdat))))
 	 running-tests) ;; calling functions want the entire data
        '())
+
+
 
    ;; collection of: for each waiton -
    ;;   if this ref-test-name is an item in an itemized test and mode is itemwait/itemmatch:
    ;;     if waiton is not itemized - if waiton is not both completed and in ok status, add as unmet prerequisite
    ;;     if waiton is itemized:

Index: megatest-version.scm
==================================================================
--- megatest-version.scm
+++ megatest-version.scm
@@ -18,6 +18,6 @@
 ;; Always use two or four digit decimal
 ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..
 
 ;; (declare (unit megatest-version))
 
-(define megatest-version 1.6584)
+(define megatest-version 1.6586)

Index: mtut.scm
==================================================================
--- mtut.scm
+++ mtut.scm
@@ -154,10 +154,11 @@
    show [areas|contours... ] : show areas, contours or other section from megatest.config
    gendot                    : generate a graphviz dot file from pkts.
 
 Contour actions:
    process                   : runs import, rungen and dispatch 
+   go                        : runs import, rungen and dispatch every five minutes forever
 			     
 Trigger propagation actions:
    tsend a=b,c=d...          : send trigger info to all recpients in the [listeners] section
    tlisten -port N           : listen for trigger info on port N
 			     
@@ -773,11 +774,11 @@
 				(begin
 				  (print-call-chain)
 				  (print "FAILED TO RUN RUNNAME MAPPER " callname " FOR AREA " area)
 				  (print " message: " ((condition-property-accessor 'exn 'message) exn))
 				  runname)
-			      (print "(mapper " (string-intersperse (list runkey runname area area-path reason contour mode-patt) ", ") ")")
+			      (print "(mapper " (string-intersperse (map conc (list runkey runname area area-path reason contour mode-patt)) ", ") ")")
 			      (mapper runkey runname area area-path reason contour mode-patt))
 			    (case callname
 			      ((auto #f) runname)
 			      (else   runtrans)))))
 	 (new-target     target) ;; I believe we will want target manipulation here .. (map-targets xlatr-key runkey area contour))
@@ -1612,23 +1613,70 @@
 	 ;;  (hash-table-keys adjargs))
 	 (let-values (((uuid pkt)
 		       (command-line->pkt *action* adjargs #f area-path: area-path new-ss: new-ss)))
            (print "run log @ " (conc (current-directory) "/" uuid "-" *action* ".log"))
 	   (write-pkt pktsdir uuid pkt))))
-      ((dispatch import rungen process)
+      ((dispatch import rungen process go)
        (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir")))
 	      (mtconf    (car mtconfdat))
-	      (toppath   (configf:lookup mtconf "scratchdat" "toppath")))
+	      (toppath   (configf:lookup mtconf "scratchdat" "toppath"))
+	      (period    (configf:lookup-number mtconf "mtutil" "autorun-period" default: 300))
+	      (rest-time (configf:lookup-number mtconf "mtutil" "autorun-rest"   default: 30)))
+	 (print "Using period="period" and rest time="rest-time)
 	 (case (string->symbol *action*)
 	   ((process)  (begin
 			 (common:load-pkts-to-db mtconf)
 			 (generate-run-pkts mtconf toppath)
 			 (common:load-pkts-to-db mtconf)
 			 (dispatch-commands mtconf toppath)))
 	   ((import)   (common:load-pkts-to-db mtconf)) ;; import pkts
 	   ((rungen)   (generate-run-pkts mtconf toppath))
-	   ((dispatch) (dispatch-commands mtconf toppath)))))
+	   ((dispatch) (dispatch-commands mtconf toppath))
+	   ;; [mtutil]
+	   ;; # approximate interval between run processing in mtutil (seconds)
+	   ;; autorun-period 300
+	   ;; # minimal rest period between processing 
+	   ;; autorun-rest   30
+	   ((go)
+	    ;; determine if I'm the boss
+	    (if (file-exists? "mtutil-go.pid")
+		(begin
+		  (print "ERROR: mtutil go is already running under host and pid " (with-input-from-file "mtutil-go.pid" read-line)
+			 ". Please kill that process and remove the file \"mutil-go.pid\" and try again.")
+		  (exit)))
+	    (with-output-to-file "mtutil-go.pid" (lambda ()(print (get-host-name) " " (current-process-id))))
+	    (print "Starting long running import, rungen, and process loop")
+	    (if (file-exists? "do-not-run-mtutil-go")
+		(begin
+		  (print "NOTE: Removing flag file "(current-directory)"/do-not-run-mtutil-go")
+		  (delete-file* "do-not-run-mtutil-go")))
+	    (let loop ((last-run (- (current-seconds) (+ period 10))) ;; fake out first time in 
+		       (this-run (current-seconds)))
+	      (if (file-exists? "do-not-run-mtutil-go")
+		  (begin
+		    (print "File do-not-run-mtutil-go exists, exiting.")
+		    (delete-file* "mtutil-go.pid")
+		    (exit)))
+	      (let ((delta (- this-run last-run)))
+		(if (>= delta period)
+		    (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir")))
+			   (mtconf    (car mtconfdat)))
+		      (print "Running import at " (current-seconds))
+		      (common:load-pkts-to-db mtconf)
+		      (print "Running generate run pkts at " (current-seconds))
+		      (generate-run-pkts mtconf toppath)
+		      (print "Running run dispatch at " (current-seconds))
+		      (common:load-pkts-to-db mtconf)
+		      (dispatch-commands mtconf toppath)
+		      (print "Done running import, generate, and dispatch done in " (- (current-seconds) this-run))
+		      (print "NOTE: touch " (current-directory) "/do-not-run-mtutil-go to kill this runner.")
+		      (loop this-run (current-seconds)))
+		    (let ((now (current-seconds)))
+		      (print "Sleeping " rest-time " seconds, next run in aproximately " (- period (- now last-run)) " seconds")
+		      (thread-sleep! rest-time)
+		      (loop last-run (current-seconds))))))
+	    (delete-file* "mtutil-go.pid")))))
       ;; misc
       ((show)
        (if (> (length remargs) 0)
 	   (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir")))
 		  (mtconf    (car mtconfdat))
@@ -1807,46 +1855,52 @@
                                   )
 
                                 )))
                            (loop (nn-recv rep))))
 		       (print "ERROR: Port " portnum " already in use. Try another port")))))))
-      
-
-
-
-      ((tlisten)
-       (if (null? remargs)
-           (print "ERROR: useage for tlisten is \"mtutil tlisten portnum\"")
-           (let ((portnum (string->number (car remargs))))
-              
-             (if (not portnum)
-                 (print "ERROR: the portnumber parameter must be a number, you gave: " (car remargs))
-                 (begin
-                   (if (not (is-port-in-use portnum))  
-                       (let* ((rep       (start-nn-server portnum))
-                              (mtconfdat (simple-setup (args:get-arg "-start-dir")))
-                              (mtconf    (car mtconfdat))
-                              (contact   (configf:lookup mtconf "listener" "owner"))
-                              (script    (configf:lookup mtconf "listener" "script")))
-                         (print "Listening on port " portnum " for messages.")
-                         (set-signal-handler! signal/int  (lambda (signum) 
-															(set! *time-to-exit* #t)
-  														(debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting !!")
-  														(let ((email-body (mtut:stml->string (s:body
-																						(s:p (conc "Received signal " signum ". Lister has been terminated on host " (get-environment-variable "HOST") ". "))))))
-             					        (sendmail contact "Listner has been terminated." email-body  use_html: #t))
-                              (exit)))
-															(set-signal-handler! signal/term  (lambda (signum) 
-															(set! *time-to-exit* #t)
-  														(debug:print-error 0 *default-log-port* "Received signal " signum " sending email befor exiting !!")
-  														(let ((email-body (mtut:stml->string (s:body
-																						(s:p (conc "Received signal " signum ". Lister has been terminated on host " (get-environment-variable "HOST") ". "))))))
-             					        (sendmail contact "Listner has been terminated." email-body  use_html: #t))
-                              (exit)))
-
-                         ;(set-signal-handler! signal/term special-signal-handler)
-                         
+
+    ((tlisten)
+     (if (null? remargs)
+         (print "ERROR: useage for tlisten is \"mtutil tlisten portnum\"")
+         (let ((portnum (string->number (car remargs))))
+           
+           (if (not portnum)
+               (print "ERROR: the portnumber parameter must be a number, you gave: " (car remargs))
+               (begin
+                 (if (not (is-port-in-use portnum))  
+                     (let* ((rep       (start-nn-server portnum))
+                            (mtconfdat (simple-setup (args:get-arg "-start-dir")))
+                            (mtconf    (car mtconfdat))
+                            (contact   (configf:lookup mtconf "listener" "owner"))
+                            (script    (configf:lookup mtconf "listener" "script")))
+                       (print "Listening on port " portnum " for messages.")
+                       (set-signal-handler! signal/int
+					    (lambda (signum) 
+					      (set! *time-to-exit* #t)
+  					      (debug:print-error 0 *default-log-port* "Received signal " signum
+								 " sending email befor exiting !!")
+  					      (let ((email-body (mtut:stml->string
+								 (s:body
+								  (s:p (conc "Received signal " signum
+									     ". Lister has been terminated on host "
+									     (get-environment-variable "HOST") ". "))))))
+             					(sendmail contact "Listner has been terminated." email-body  use_html: #t))
+					      (exit)))
+		       (set-signal-handler! signal/term  (lambda (signum) 
+							   (set! *time-to-exit* #t)
+  							   (debug:print-error 0 *default-log-port* "Received signal "
+									      signum " sending email befor exiting !!")
+  							   (let ((email-body (mtut:stml->string
+									      (s:body
+									       (s:p (conc "Received signal " signum
+											  ". Lister has been terminated on host "
+											  (get-environment-variable "HOST") ". "))))))
+             					             (sendmail contact "Listner has been terminated." email-body  use_html: #t))
+							   (exit)))
+
+		       ;; (set-signal-handler! signal/term special-signal-handler)
+                       
                          (let loop ((instr (nn-recv rep)))
                              (nn-send rep "ok")
                              (let ((ctime (date->string (current-date)))) 
                              (if  (equal? instr "time-to-die")
                               (begin 

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -501,10 +501,11 @@
 ;;
 ;;  test-names: Comma separated patterns same as test-patts but used in selection 
 ;;              of tests to run. The item portions are not respected.
 ;;              FIXME: error out if /patt specified
 ;;            
+;; run-count is passed from megatest.scm as configf:lookup *configdat* "setup" "reruns", or defaults to 1.
 (define (runs:run-tests target runname test-patts user flags #!key (run-count 1)) ;; test-names
   (let* ((keys               (keys:config-get-fields *configdat*))
 	 (keyvals            (keys:target->keyval keys target))
 	 (run-id             (rmt:register-run keyvals runname "new" "n/a" user (args:get-arg "-contour")))  ;;  test-name)))
 	 ;; (deferred          '()) ;; delay running these since they have a waiton clause
@@ -624,11 +625,11 @@
     ;; look up all tests matching the comma separated list of globs in
     ;; test-patts (using % as wildcard)
 
     ;; (set! test-names (delete-duplicates (tests:get-valid-tests *toppath* test-patts)))
     (debug:print-info 0 *default-log-port* "tests search path: " (string-intersperse (tests:get-tests-search-path *configdat*) " "))
-    (debug:print-info 0 *default-log-port* "all tests:         " (string-intersperse (sort all-test-names string<) " "))
+    (debug:print-info 2 *default-log-port* "all tests:         " (string-intersperse (sort all-test-names string<) " "))
     (debug:print-info 0 *default-log-port* "test names:        " (string-intersperse (sort test-names string<) " "))
     (debug:print-info 0 *default-log-port* "required tests:    " (string-intersperse (sort required-tests string<) " "))
 
     ;; on the first pass or call to run-tests set FAILS to NOT_STARTED if
     ;; -keepgoing is specified
@@ -738,11 +739,11 @@
 			  (waiton-itemized (and waiton-tconfig
 						(or (hash-table-ref/default waiton-tconfig "items" #f)
 						    (hash-table-ref/default waiton-tconfig "itemstable" #f))))
 			  (itemmaps        (tests:get-itemmaps config))  ;; (configf:lookup config "requirements" "itemmap"))
 			  (new-test-patts  (tests:extend-test-patts test-patts hed waiton itemmaps hed-itemized-waiton))) 
-		     (debug:print-info 0 *default-log-port* "Test " waiton " has " (if waiton-record "a" "no") " waiton-record and" (if waiton-itemized " " " no ") "items")
+		     (debug:print-info 2 *default-log-port* "Test " waiton " has " (if waiton-record "a" "no") " waiton-record and" (if waiton-itemized " " " no ") "items")
 		     ;; need to account for test-patt here, if I am test "a", selected with a test-patt of "hed/b%"
 		     ;; and we are waiting on "waiton" we need to add "waiton/,waiton/b%" to test-patt
 		     ;; is this satisfied by merely appending "/" to the waiton name added to the list?
 		     ;;
 		     ;; This approach causes all of the items in an upstream test to be run 
@@ -760,11 +761,11 @@
                                    (set! test-patts new-test-patts))
                                  (begin
                                    (debug:print-info 0 *default-log-port* "Waitor(s) not yet on testpatt for " waiton ", setting up to re-process it")
                                    (set! tal (append (cons waiton tal)(list hed)))))
                              (begin
-                               (debug:print-info 0 *default-log-port* "Adding non-itemized test " waiton " to required-tests")
+                               (debug:print-info 2 *default-log-port* "Adding non-itemized test " waiton " to required-tests")
                                (set! required-tests (cons waiton required-tests))
                                (set! test-patts new-test-patts)))
 			 (begin
 			   (debug:print-info 0 *default-log-port* "No testconfig info yet for " waiton ", setting up to re-process it")
 			   (set! tal (append (cons waiton tal)(list hed))))) ;; (cons (conc waiton "/") required-tests))
@@ -897,10 +898,11 @@
 ;;	         (tal         (cdr sorted-test-names))
 ;;	         (reg         '()) ;; registered, put these at the head of tal 
 ;;	         (reruns      '()))
 (define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps)
   (let* ((loop-list       (list hed tal reg reruns))
+         (junk (debug:print-info 4 *default-log-port* "expand-items calling rmt:get-prereqs-not-met"))
 	 (prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)))
 			    (if (list? res)
 				res
 				(begin
 				  (debug:print 0 *default-log-port*
@@ -1090,11 +1092,11 @@
      ((null? runnables)
       (debug:print-info 4 *default-log-port* "cond branch - "  "ei-7")
       #f) ;; if we get here and non-completed is null then it is all over.
      (else
       (debug:print-info 4 *default-log-port* "cond branch - "  "ei-8")
-      (debug:print 0 *default-log-port* "WARNING: FAILS or incomplete tests maybe preventing completion of this run. Watch for issues with test " hed ", continuing for now")
+      (debug:print 2 *default-log-port* "WARNING: FAILS or incomplete tests maybe preventing completion of this run. Watch for issues with test " hed ", continuing for now")
       (list (car newtal)(cdr newtal) reg reruns)))))
 
 (define (runs:mixed-list-testname-and-testrec->list-of-strings inlst)
   (if (null? inlst)
       '()
@@ -1667,11 +1669,10 @@
 		  (loop (runs:queue-next-hed tal reg reglen regfull)
 			(runs:queue-next-tal tal reg reglen regfull)
 			(runs:queue-next-reg tal reg reglen regfull)
 			reruns))))
         ;; (loop (car tal)(cdr tal) reg reruns))))
-
 	(runs:incremental-print-results run-id)
 	(debug:print 4 *default-log-port* "TOP OF LOOP => "
 		     "test-name: " test-name
 		     "\n  hed:         " hed
 		     "\n  tal:         " (runs:pretty-long-list tal)
@@ -1851,16 +1852,18 @@
                       (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)
                       )
                   )
 		;; if can't run more just loop with next possible test
 		(loop (car newtal)(cdr newtal) reg reruns))))
-         
+        
+
 	 ;; this case should not happen, added to help catch any bugs
 	 ((and (list? items) itemdat)
           (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-5")
 	  (debug:print-error 0 *default-log-port* "Should not have a list of items in a test and the itemspath set - please report this")
 	  (exit 1))
+
 	 ((not (null? reruns))
           (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-6")
 	  (let* ((newlst (tests:filter-non-runnable run-id tal test-records)) ;; i.e. not FAIL, WAIVED, INCOMPLETE, PASS, KILLED,
 		 (junked (lset-difference equal? tal newlst)))
 	    (debug:print-info 4 *default-log-port* "full drop through, if reruns is less than 100 we will force retry them, reruns=" reruns ", tal=" tal)
@@ -1869,10 +1872,11 @@
 	    (set! num-retries (+ num-retries 1))
 	    ;; (thread-sleep! (+ 1 *global-delta*))
 	    (if (not (null? newlst))
 		;; since reruns have been tacked on to newlst create new reruns from junked
 		(loop (car newlst)(cdr newlst) reg (delete-duplicates junked)))))
+
 	 ((not (null? tal))
           (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-7")
 	  (debug:print-info 4 *default-log-port* "I'm pretty sure I shouldn't get here."))
 	 ((not (null? reg)) ;; could we get here with leftovers?
           (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-8")
@@ -2108,11 +2112,11 @@
 	    (else (set! runflag #f)))
 	   (debug:print 4 *default-log-port* "RUNNING => runflag: " runflag " STATE: " (test:get-state testdat) " STATUS: " (test:get-status testdat))
 	   (if (not runflag)
 	       (if (not parent-test)
 		   (if (runs:lownoise (conc "not starting test" full-test-name) 60)
-		       (debug:print 1 *default-log-port* "NOTE: Not starting test " full-test-name " as it is state \"" (test:get-state testdat) 
+		       (debug:print 3 *default-log-port* "NOTE: Not starting test " full-test-name " as it is state \"" (test:get-state testdat) 
 				    "\" and status \"" (test:get-status testdat) "\", use -rerun \"" (test:get-status testdat)
 				    "\" or -force to override")))
 	       ;; NOTE: No longer be checking prerequisites here! Will never get here unless prereqs are
 	       ;;       already met.
 	       ;; This would be a great place to do the process-fork
@@ -2393,11 +2397,11 @@
 		    (tasks:kill-runner target run-name testpatt)
 		    ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt))
 		    (debug:print 1 *default-log-port* "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname")))
 		   ((set-state-status)
 		    ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
-		    (debug:print 1 *default-log-port* "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
+		    (debug:print 2 *default-log-port* "Modifying state and status for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
 		   ((print-run)
 		    (debug:print 1 *default-log-port* "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header)
 		    action)
 		   ((run-wait)
 		    (debug:print 1 *default-log-port* "Waiting for run " runkey ", run=" runnamepatt " to complete"))
@@ -2534,11 +2538,11 @@
                                     ) ; end let
                                   ); end cond has-subrun
 
                                  (else
                                   ;; BB - TODO - consider backgrounding to threads to delete tests (work below) 
-                                  (debug:print-info 0 *default-log-port* "test: " test-name " itest-state: " test-state)
+                                  (debug:print-info 2 *default-log-port* "test: " test-name " itest-state: " test-state)
                                   (if (member test-state (list "RUNNING" "LAUNCHED" "REMOTEHOSTSTART" "KILLREQ"))
                                       (begin
                                         (if (not (hash-table-ref/default test-retry-time test-fulln #f))
                                             (begin
                                               ;; want to set to REMOVING BUT CANNOT do it here?
@@ -2729,11 +2733,11 @@
 	 )
     (case clean-mode
       ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f))
       ((remove-all)      (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f))
       ((archive-remove)  (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVE_REMOVING" #f #f)))
-    (debug:print-info 1 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir)
+    (debug:print-info 2 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir)
     (if (and real-dir 
 	     (> (string-length real-dir) 5)
 	     (common:file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc.
 	(let* ((realpath (resolve-pathname run-dir)))
 	  (debug:print-info 1 *default-log-port* "Recursively removing " realpath)
@@ -3057,17 +3061,17 @@
 		 (files    (if (common:file-exists? runtop)
 			       (append (glob (conc runtop "/.megatest*"))
 				       (glob (conc runtop "/.runconfig*")))
 			       '())))
 	    (if (null? files)
-		(debug:print-info 0 *default-log-port* "No cached megatest or runconfigs files found. None removed.")
+		(debug:print-info 2 *default-log-port* "No cached megatest or runconfigs files found. None removed.")
 		(begin
-		  (debug:print-info 0 *default-log-port* "Removing cached files:\n    " (string-intersperse files "\n    "))
+		  (debug:print-info 2 *default-log-port* "Removing cached files:\n    " (string-intersperse files "\n    "))
 		  (for-each 
 		   (lambda (f)
 		     (handle-exceptions
 			 exn
 			 (debug:print 0 *default-log-port* "WARNING: Failed to remove file " f ", exn=" exn)
 		       (delete-file f)))
 		   files))))
 	  (debug:print-error 0 *default-log-port* "-clean-cache requires -runname."))
       (debug:print-error 0 *default-log-port* "-clean-cache requires -target or -reqtarg")))

Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -111,12 +111,12 @@
     ((fs)   result)
     (else 
      (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
      result)))
 
-;; Given a run id start a server process    ### NOTE ### > file 2>&1 
-;; if the run-id is zero and the target-host is set 
+;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
+;; if the target-host is set 
 ;; try running on that host
 ;;   incidental: rotate logs in logs/ dir.
 ;;
 (define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
   (let* ((curr-host   (get-host-name))
@@ -154,21 +154,22 @@
 	(begin
 	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
 	  (setenv "TARGETHOST" target-host)))
       
     (setenv "TARGETHOST_LOGF" logfile)
-    (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
-    ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
-    #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit))
+    (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time
+    (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
     (system (conc "nbfake " cmdln))
     (unsetenv "TARGETHOST_LOGF")
     (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
     (thread-join! log-rotate)
     (pop-directory)))
 
-;; given a path to a server log return: host port startseconds
-;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let 
+;; given a path to a server log return: host port startseconds server-id
+;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let 
+;; example of what it's looking for in the log file:
+;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
 
 (define (server:logf-get-start-info logf)
   (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
         (dbprep-rx (regexp "^SERVER: dbprep"))
         (dbprep-found 0)) 
@@ -201,18 +202,18 @@
 			      (string->number (caddr dat))
                               (cadr (cddr dat))))))
                 (begin 
                    (if dbprep-found
                       (begin
-                         (debug:print-info 0 *default-log-port* "Server is in dbprep at " (current-seconds))
-                         (thread-sleep! 25)
+                         (debug:print-info 0 *default-log-port* "Server is in dbprep at " (common:human-time))
+                         (thread-sleep! 0.5) ;; was 25 sec but that blocked things from starting?
                       )
-                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (current-seconds))
+                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))
                    )
 		    (list #f #f #f #f)))))))))
 
-;; get a list of servers with all relevant data
+;; get a list of servers from the log files, with all relevant data
 ;; ( mod-time host port start-time pid )
 ;;
 (define (server:get-list areapath #!key (limit #f))
   (let ((fname-rx    (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
 	(day-seconds (* 24 60 60)))
@@ -234,11 +235,11 @@
 	(let* ((server-logs-cmd  (conc "grep -iL exiting " areapath "/logs/server-*-*.log"))
                (server-logs   (string-split (string-chomp (call-with-input-pipe server-logs-cmd read-all))))
 	       (num-serv-logs (length server-logs)))
 	  (if (or (null? server-logs) (= num-serv-logs 0))
               (let ()
-                 (debug:print 1  *default-log-port* "There are no servers running")
+                 (debug:print 1  *default-log-port* "There are no servers running at " (common:human-time))
 	         '()
               )
 	      (let loop ((hed  (string-chomp (car server-logs)))
 			 (tal  (cdr server-logs))
 			 (res '()))
@@ -365,66 +366,70 @@
   (if *my-client-signature* *my-client-signature*
       (let ((sig (server:mk-signature)))
         (set! *my-client-signature* sig)
         *my-client-signature*)))
 
-;; wait for server=start-last to be three seconds old
+
+;; if server-start-last exists, and wasn't old enough, wait <idle time> + 1, then call this function recursively until it is old enough.
+;; if it is old enough, overwrite it and wait 0.25 seconds.
+;; if it then has the wrong server key, wait <idle time> + 1 and call this function recursively.
 ;;
 (define (server:wait-for-server-start-last-flag areapath)
   (let* ((start-flag (conc areapath "/logs/server-start-last"))
 	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
-	 (reftime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
+	 (idletime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
 	 (server-key (conc (get-host-name) "-" (current-process-id))))
     (if (file-exists? start-flag)
 	(let* ((fmodtime (file-modification-time start-flag))
 	       (delta    (- (current-seconds) fmodtime))
-	       (all-go   (> delta reftime)))
-	  (if (and all-go
+	       (old-enough   (> delta idletime))
+               (new-server-key "")
+              )
+
+          ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than <idletime> seconds, and the new file still has the same server key as you just wrote, return #t.
+	  ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process.
+           (if (and old-enough
 		   (begin
                      (debug:print-info 0 *default-log-port* "Writing " start-flag)
-		     (with-output-to-file start-flag
-		       (lambda ()
-			 (print server-key)))
+		     (with-output-to-file start-flag (lambda () (print server-key)))
 		     (thread-sleep! 0.25)
-		     (let ((res (with-input-from-file start-flag
-				  (lambda ()
-				    (read-line)))))
-		       (equal? server-key res))))
-	      #t ;; (system (conc "touch " start-flag)) ;; lazy but safe
+		     (set! new-server-key (with-input-from-file start-flag (lambda () (read-line))))
+		     (equal? server-key new-server-key))
+                )
+	      #t
+
+           ;; If either of the above conditions is not true, print a "Gating server start" message, wait <idle-time> + 1, then call this function recursively. 
 	      (begin
 		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
-				  fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
-		(thread-sleep! reftime)
+				  (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server"))
+
+		(thread-sleep! ( + 1 idletime))
 		(server:wait-for-server-start-last-flag areapath)))))))
 
-;; kind start up of servers, wait 40 seconds before allowing another server for a given
-;; run-id to be launched
+
+        
+;; kind start up of server, wait before allowing another server for a given
+;; area to be launched
 ;;
 (define (server:kind-run areapath)
   ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
-  ;; and wait for it to be at least 3 seconds old
+  ;; and wait for it to be at least <server idletime> seconds old
   (server:wait-for-server-start-last-flag areapath)
   (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
-      (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
-	     (call-num     (car last-run-dat))
-	     (when-run     (cadr last-run-dat))
-	     (run-delay    (+ (case call-num
-				((0)    0)
-				((1)   20)
-				((2)  300)
-				(else 600))
-			      (random 5)))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
+      (let* (
 	     (lock-file    (conc areapath "/logs/server-start.lock")))
-	(if	(> (- (current-seconds) when-run) run-delay)
-		(let* ((start-flag (conc areapath "/logs/server-start-last")))
-		  (common:simple-file-lock-and-wait lock-file expire-time: 15)
-                  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
-		  (system (conc "touch " start-flag)) ;; lazy but safe
-		  (server:run areapath)
-		  (thread-sleep! 2) ;; don't release the lock for at least a few seconds
-		  (common:simple-file-release-lock lock-file)))
-	(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))
+	(let* ((start-flag (conc areapath "/logs/server-start-last")))
+	  (common:simple-file-lock-and-wait lock-file expire-time: 25)
+	  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
+	  (system (conc "touch " start-flag)) ;; lazy but safe
+	  (server:run areapath)
+	  (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
+	  (common:simple-file-release-lock lock-file)))
+
+      (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")
+   )
+)
 
 ;; this one seems to be the general entry point
 ;;
 (define (server:start-and-wait areapath #!key (timeout 60))
   (let ((give-up-time (+ (current-seconds) timeout)))

Index: utils/mk_wrapper
==================================================================
--- utils/mk_wrapper
+++ utils/mk_wrapper
@@ -44,10 +44,19 @@
 else
   export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$libdir
 fi
 
 export MT_SQLITE3_EXE=$sqlite3_exe
+
+http_vars="http_proxy https_proxy HTTP_PROXY HTTPS_PROXY"
+for i in \$http_vars
+do
+j=\${!i}
+if [ "\$j" != "" ]; then
+   unset \$i
+fi
+done
 __EOF
 ) > $cfgfile
   echo 
 else
   echo "INFO: LD_LIBRARY_PATH not set" >&2
@@ -93,11 +102,9 @@
 
 # echo "#!/bin/bash" > $target
 # echo "exec $prefix/bin/.\$(lsb_release -sr)/$cmd \"\$@\"" >> $target
 
 echo "lsbr=\$(lsb_release -sr)" >> $target
-if [ "$LD_LIBRARY_PATH" != "" ];then
-  echo "source $prefix/bin/.\$lsbr/cfg.sh" >> $target
-fi
+echo "source $prefix/bin/.\$lsbr/cfg.sh" >> $target
 
 # echo "if [[ -e \$lsbr ]];then source \$lsbr;fi" >> $target
 echo "exec $prefix/bin/.\$lsbr/$cmd \"\$@\"" >> $target