Index: configf.scm
==================================================================
--- configf.scm
+++ configf.scm
@@ -52,10 +52,14 @@
   (let ((newalist (filter (lambda (x)(not (equal? key (car x)))) alist)))
     (append newalist (list (if metadata
 			       (list key val metadata)
 			       (list key val))))))
 
+;; Backward-compatibility alias: config:assoc-safe-add is still used in megatestqa/ext.scm.
+;; Remove this alias, and its use in ext.scm, by 12/31/21.
+(define config:assoc-safe-add configf:assoc-safe-add)
+
 (define (configf:section-var-set! cfgdat section-name var value #!key (metadata #f))
   (hash-table-set! cfgdat section-name
 		   (configf:assoc-safe-add
 		    (hash-table-ref/default cfgdat section-name '())
 		    var value metadata: metadata)))
@@ -144,11 +148,11 @@
 		     (set! result (conc "#{(" cmdtype ") "  cmd "}"))))
 		(case cmdsym
 		  ((system shell scheme)
 		   (let ((delta (- (current-seconds) start-time)))
 		     (if (> delta 2)
-			 (debug:print-info 0 *default-log-port* "for line \"" l "\"\n command:  " cmd " took " delta " seconds to run with output:\n   " result)
+			 (debug:print-info 2 *default-log-port* "for line \"" l "\"\n command:  " cmd " took " delta " seconds to run with output:\n   " result)
 			 (debug:print-info 9 *default-log-port* "for line \"" l "\"\n command:  " cmd " took " delta " seconds to run with output:\n   " result)))))
 		(loop (conc prestr result poststr)))
 	      res))
 	res)))
 

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -501,10 +501,11 @@
 ;;
 ;;  test-names: Comma separated patterns same as test-patts but used in selection 
 ;;              of tests to run. The item portions are not respected.
 ;;              FIXME: error out if /patt specified
 ;;            
+;; run-count is passed from megatest.scm as configf:lookup *configdat* "setup" "reruns", or defaults to 1.
 (define (runs:run-tests target runname test-patts user flags #!key (run-count 1)) ;; test-names
   (let* ((keys               (keys:config-get-fields *configdat*))
 	 (keyvals            (keys:target->keyval keys target))
 	 (run-id             (rmt:register-run keyvals runname "new" "n/a" user (args:get-arg "-contour")))  ;;  test-name)))
 	 ;; (deferred          '()) ;; delay running these since they have a waiton clause
@@ -624,11 +625,11 @@
     ;; look up all tests matching the comma separated list of globs in
     ;; test-patts (using % as wildcard)
 
     ;; (set! test-names (delete-duplicates (tests:get-valid-tests *toppath* test-patts)))
     (debug:print-info 0 *default-log-port* "tests search path: " (string-intersperse (tests:get-tests-search-path *configdat*) " "))
-    (debug:print-info 0 *default-log-port* "all tests:         " (string-intersperse (sort all-test-names string<) " "))
+    (debug:print-info 2 *default-log-port* "all tests:         " (string-intersperse (sort all-test-names string<) " "))
     (debug:print-info 0 *default-log-port* "test names:        " (string-intersperse (sort test-names string<) " "))
     (debug:print-info 0 *default-log-port* "required tests:    " (string-intersperse (sort required-tests string<) " "))
 
     ;; on the first pass or call to run-tests set FAILS to NOT_STARTED if
     ;; -keepgoing is specified
@@ -2393,11 +2394,11 @@
 		    (tasks:kill-runner target run-name testpatt)
 		    ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt))
 		    (debug:print 1 *default-log-port* "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname")))
 		   ((set-state-status)
 		    ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
-		    (debug:print 1 *default-log-port* "Modifying state and staus for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
+		    (debug:print 1 *default-log-port* "Modifying state and status for tests for run: " runkey " " (db:get-value-by-header run header "runname")))
 		   ((print-run)
 		    (debug:print 1 *default-log-port* "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header)
 		    action)
 		   ((run-wait)
 		    (debug:print 1 *default-log-port* "Waiting for run " runkey ", run=" runnamepatt " to complete"))

Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -111,12 +111,12 @@
     ((fs)   result)
     (else 
      (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
      result)))
 
-;; Given a run id start a server process    ### NOTE ### > file 2>&1 
-;; if the run-id is zero and the target-host is set 
+;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
+;; if the target-host is set 
 ;; try running on that host
 ;;   incidental: rotate logs in logs/ dir.
 ;;
 (define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
   (let* ((curr-host   (get-host-name))
@@ -155,20 +155,21 @@
 	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
 	  (setenv "TARGETHOST" target-host)))
       
     (setenv "TARGETHOST_LOGF" logfile)
     (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
-    ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
-    #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit))
+    (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
     (system (conc "nbfake " cmdln))
     (unsetenv "TARGETHOST_LOGF")
     (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
     (thread-join! log-rotate)
     (pop-directory)))
 
-;; given a path to a server log return: host port startseconds
-;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let 
+;; given a path to a server log return: host port startseconds server-id
+;; any changes to number of elements returned by this function will directly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let 
+;; example of what it's looking for in the log file:
+;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
 
 (define (server:logf-get-start-info logf)
   (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
         (dbprep-rx (regexp "^SERVER: dbprep"))
         (dbprep-found 0)) 
@@ -201,18 +202,18 @@
 			      (string->number (caddr dat))
                               (cadr (cddr dat))))))
                 (begin 
                    (if dbprep-found
                       (begin
-                         (debug:print-info 0 *default-log-port* "Server is in dbprep at " (current-seconds))
-                         (thread-sleep! 25)
+                         (debug:print-info 0 *default-log-port* "Server is in dbprep at " (common:human-time))
+                         (thread-sleep! 0.5) ;; was 25 sec but that blocked things from starting?
                       )
-                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (current-seconds))
+                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))
                    )
 		    (list #f #f #f #f)))))))))
 
-;; get a list of servers with all relevant data
+;; get a list of servers from the log files, with all relevant data
 ;; ( mod-time host port start-time pid )
 ;;
 (define (server:get-list areapath #!key (limit #f))
   (let ((fname-rx    (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
 	(day-seconds (* 24 60 60)))
@@ -234,11 +235,11 @@
 	(let* ((server-logs-cmd  (conc "grep -iL exiting " areapath "/logs/server-*-*.log"))
                (server-logs   (string-split (string-chomp (call-with-input-pipe server-logs-cmd read-all))))
 	       (num-serv-logs (length server-logs)))
 	  (if (or (null? server-logs) (= num-serv-logs 0))
               (let ()
-                 (debug:print 1  *default-log-port* "There are no servers running")
+                 (debug:print 1  *default-log-port* "There are no servers running at " (common:human-time))
 	         '()
               )
 	      (let loop ((hed  (string-chomp (car server-logs)))
 			 (tal  (cdr server-logs))
 			 (res '()))
@@ -365,66 +366,69 @@
   (if *my-client-signature* *my-client-signature*
       (let ((sig (server:mk-signature)))
         (set! *my-client-signature* sig)
         *my-client-signature*)))
 
-;; wait for server=start-last to be three seconds old
+
+;; if server-start-last exists and is not yet more than <idletime> seconds old, wait <idletime>, then call this function again until it is old enough.
+;; once it is old enough, overwrite it with this process's server key (host-pid) and wait 0.25 seconds.
+;; if the file then holds a different server key, wait <idletime> and call this function again.
 ;;
 (define (server:wait-for-server-start-last-flag areapath)
   (let* ((start-flag (conc areapath "/logs/server-start-last"))
 	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
-	 (reftime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
+	 (idletime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
 	 (server-key (conc (get-host-name) "-" (current-process-id))))
     (if (file-exists? start-flag)
 	(let* ((fmodtime (file-modification-time start-flag))
 	       (delta    (- (current-seconds) fmodtime))
-	       (all-go   (> delta reftime)))
-	  (if (and all-go
+	       (old-enough   (> delta idletime))
+              )
+
+          ;; only if the start-flag file was more than <idletime> seconds old: write our server key to it, wait 0.25s, and return #t if the file still holds the key we just wrote.
+          ;; 
+	  (if (and old-enough
 		   (begin
                      (debug:print-info 0 *default-log-port* "Writing " start-flag)
-		     (with-output-to-file start-flag
-		       (lambda ()
-			 (print server-key)))
+		     (with-output-to-file start-flag (lambda () (print server-key)))
 		     (thread-sleep! 0.25)
-		     (let ((res (with-input-from-file start-flag
-				  (lambda ()
-				    (read-line)))))
-		       (equal? server-key res))))
-	      #t ;; (system (conc "touch " start-flag)) ;; lazy but safe
+		     (let ((res (with-input-from-file start-flag (lambda () (read-line)))))
+		       (equal? server-key res)))
+                )
+	      #t
+
+           ;; If either of the above conditions is not true, print a "Gating server start" message, wait <idletime>, then call this function recursively. 
 	      (begin
 		(debug:print-info 0 *default-log-port* "Gating server start, last start: "
-				  fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
-		(thread-sleep! reftime)
+				  (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "server key does not match" "too soon to start another server"))
+
+		(thread-sleep! idletime)
 		(server:wait-for-server-start-last-flag areapath)))))))
 
-;; kind start up of servers, wait 40 seconds before allowing another server for a given
-;; run-id to be launched
+
+        
+;; kind start up of server, wait before allowing another server for a given
+;; area to be launched
 ;;
 (define (server:kind-run areapath)
   ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
-  ;; and wait for it to be at least 3 seconds old
+  ;; and wait for it to be at least <server idletime> seconds old
   (server:wait-for-server-start-last-flag areapath)
   (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
-      (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
-	     (call-num     (car last-run-dat))
-	     (when-run     (cadr last-run-dat))
-	     (run-delay    (+ (case call-num
-				((0)    0)
-				((1)   20)
-				((2)  300)
-				(else 600))
-			      (random 5)))   ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
+      (let* (
 	     (lock-file    (conc areapath "/logs/server-start.lock")))
-	(if	(> (- (current-seconds) when-run) run-delay)
-		(let* ((start-flag (conc areapath "/logs/server-start-last")))
-		  (common:simple-file-lock-and-wait lock-file expire-time: 15)
-                  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
-		  (system (conc "touch " start-flag)) ;; lazy but safe
-		  (server:run areapath)
-		  (thread-sleep! 2) ;; don't release the lock for at least a few seconds
-		  (common:simple-file-release-lock lock-file)))
-	(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))
+	(let* ((start-flag (conc areapath "/logs/server-start-last")))
+	  (common:simple-file-lock-and-wait lock-file expire-time: 25)
+	  (debug:print-info  0 *default-log-port* "server:kind-run: touching " start-flag)
+	  (system (conc "touch " start-flag)) ;; lazy but safe
+	  (server:run areapath)
+	  (thread-sleep! 18) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
+	  (common:simple-file-release-lock lock-file)))
+
+      (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")
+   )
+)
 
 ;; this one seems to be the general entry point
 ;;
 (define (server:start-and-wait areapath #!key (timeout 60))
   (let ((give-up-time (+ (current-seconds) timeout)))

Index: utils/mk_wrapper
==================================================================
--- utils/mk_wrapper
+++ utils/mk_wrapper
@@ -93,11 +93,9 @@
 
 # echo "#!/bin/bash" > $target
 # echo "exec $prefix/bin/.\$(lsb_release -sr)/$cmd \"\$@\"" >> $target
 
 echo "lsbr=\$(lsb_release -sr)" >> $target
-if [ "$LD_LIBRARY_PATH" != "" ];then
-  echo "source $prefix/bin/.\$lsbr/cfg.sh" >> $target
-fi
+echo "source $prefix/bin/.\$lsbr/cfg.sh" >> $target
 
 # echo "if [[ -e \$lsbr ]];then source \$lsbr;fi" >> $target
 echo "exec $prefix/bin/.\$lsbr/$cmd \"\$@\"" >> $target