Index: TODO
==================================================================
--- TODO
+++ TODO
@@ -16,10 +16,49 @@
 #     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 TODO
 ====
 
+23WW48
+. Allow two or three servers to run for any given db
+. Update avg call count/sec every 30 sec in no-sync
+. get server uses no-sync process info to decide which server to suggest
+. Use process table to decide who will do sync back
+. Fix metadat being synced over and over
+
+23WW47
+. Finding server
+.. look at .servinfo for likely prime main
+.. ask the .servinfo prime main for real prime main
+.. save prime main (for how long, 10 seconds or 10 minutes?)
+
+. Starting prime main
+.. get servinfo files - START
+.. no files? create my servinfo file, goto START
+.. have files? am I the prime main according to servinfo files?
+.. no, I'm not the prime main, ping prime main
+.. ping is good, prime main exists, register self as server if on same host as prime main DONE
+.. no pirng response, remove the .servinfo file - goto START
+.. if I am prime main according to .servinfo files, register directly in no-sync
+
+. Starting non-main
+.. get servinfo files
+.. no files? launch server for main.db
+.. have files? pick out prime main
+.. register self as server with prime main
+
+23WW46 - v1.80 branch
+. Use file semaphore to kill tests, eliminate db load of the KILLREQ query
+. Merge this change to revolution branch
+23WW45 - the revolution branch
+. Add "fast" db start option (no handshaking over NFS)
+. Add server-ro to server types (just "server" is fine for read/write).
+. [DONE] Create pause-server and resume-server calls
+. Create rsync or cp sync to MTRAH function
+. Change rmt:send-receive to divert calls to read-only server when possible
+. [DONE] Change start server to call main.db server for 1..N.db servers, block until server is read for use.
+
 23WW21
 . Dashboard needs its own cache db in /tmp
 
 23WW07
 . Remove use of *dbstruct-dbs*

Index: api.scm
==================================================================
--- api.scm
+++ api.scm
@@ -35,11 +35,12 @@
 
 (use srfi-69
      srfi-18
      posix
      matchable
-     s11n)
+     s11n
+     typed-records)
 
 ;; allow these queries through without starting a server
 ;;
 (define api:read-only-queries
   '(get-key-val-pairs
@@ -152,78 +153,10 @@
     tasks-set-state-given-param-key
     ))
 
 (define *db-write-mutexes* (make-hash-table))
 (define *server-signature* #f)
-;; ;; These are called by the server on recipt of /api calls
-;; ;;    - keep it simple, only return the actual result of the call, i.e. no meta info here
-;; ;;
-;; ;;    - returns #( flag result )
-;; ;;
-;; (define (api:execute-requests dbstruct dat)
-;;   (if (> *api-process-request-count* 50)
-;;       (begin
-;; 	(if (common:low-noise-print 30 "too many threads")
-;; 	    (debug:print 0 *default-log-port* "WARNING: "*api-process-request-count*" threads, potential overload, adding 0.5 sec delay."))
-;; 	;; (thread-sleep! 0.5) ;; take a nap - no, the napping is moved to the clients via tt:backoff-incr
-;; 	))
-;;   (cond
-;;    ((not (vector? dat))                    ;; it is an error to not receive a vector
-;;     (vector #f (vector #f "remote must be called with a vector")))
-;;    (else  
-;;     (let* ((cmd-in            (vector-ref dat 0))
-;;            (cmd               (if (symbol? cmd-in)
-;; 				  cmd-in
-;; 				  (string->symbol cmd-in)))
-;;            (params            (vector-ref dat 1))
-;; 	   (run-id            (if (null? params)
-;; 				  0
-;; 				  (car params)))
-;; 	   (write-mutex       (if (hash-table-exists? *db-write-mutexes* run-id)
-;; 				  (hash-table-ref *db-write-mutexes* run-id)
-;; 				  (let* ((newmutex (make-mutex)))
-;; 				    (hash-table-set! *db-write-mutexes* run-id newmutex)
-;; 				    newmutex)))
-;;            (start-t           (current-milliseconds))
-;;            (readonly-mode     (dbr:dbstruct-read-only dbstruct))
-;;            (readonly-command  (member cmd api:read-only-queries))
-;;            (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))))
-;;       (if (not readonly-command)
-;; 	  (mutex-lock! write-mutex))
-;;       (let* ((tmppath    (dbr:dbstruct-tmppath  dbstruct))
-;; 	     (clean-run-id (cond
-;; 			    ((number? run-id)   run-id)
-;; 			    ((equal? run-id #f) "main")
-;; 			    (else               "other")))
-;; 	     (crumbfile  (dbfile:wait-for-qif tmppath clean-run-id (cons cmd params)))
-;; 	     (res    
-;;               (if writecmd-in-readonly-mode
-;;                   (conc "attempt to run write command "cmd" on a read-only database")
-;; 		  (api:dispatch-request dbstruct cmd run-id params))))
-;; 	(delete-file* crumbfile)
-;; 	(if (not readonly-command)
-;; 	    (mutex-unlock! write-mutex))
-;; 	
-;; 	;; save all stats
-;; 	(let ((delta-t (- (current-milliseconds)
-;; 			  start-t))
-;; 	      (modified-cmd (if (eq? cmd 'general-call)
-;; 				(string->symbol (conc "general-call-" (car params)))
-;; 				cmd)))
-;; 	  (hash-table-set! *db-api-call-time* modified-cmd
-;; 			   (cons delta-t (hash-table-ref/default *db-api-call-time* modified-cmd '()))))
-;; 	(if writecmd-in-readonly-mode
-;;             (begin
-;;               #;(common:telemetry-log (conc "api-out:"(->string cmd))
-;;               payload: `((params . ,params)
-;;               (ok-res . #t)))
-;; 	      (vector #f res))
-;;             (begin
-;;               #;(common:telemetry-log (conc "api-out:"(->string cmd))
-;;               payload: `((params . ,params)
-;;               (ok-res . #f)))
-;;               (vector #t res))))))))
 
 (define *api-threads* '())
 (define (api:register-thread th-in)
   (set! *api-threads* (cons (cons th-in (current-seconds)) *api-threads*)))
 
@@ -237,11 +170,25 @@
 				(not (member (thread-state (car thdat)) '(terminated dead))))
 			      *api-threads*)))
 
 (define (api:get-count-threads-alive)
   (length *api-threads*))
-  
+
+(define *api:last-stats-print* 0)
+(define *api-print-db-stats-mutex* (make-mutex))
+(define (api:print-db-stats)
+  (debug:print-info 0 *default-log-port* "Started periodic db stats printer")
+  (let loop ()
+    (mutex-lock! *api-print-db-stats-mutex*)
+    (if (> (- (current-seconds) *api:last-stats-print*) 15)
+	(begin
+	  (rmt:print-db-stats)
+	  (set! *api:last-stats-print* (current-seconds))))
+    (mutex-unlock! *api-print-db-stats-mutex*)
+    (thread-sleep! 5)
+    (loop)))
+
 
 ;; indat is (cmd run-id params meta)
 ;;
 ;; WARNING: Do not print anything in the lambda of this function as it
 ;;          reads/writes to current in/out port
@@ -250,85 +197,105 @@
   (assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.")
   (if (not *server-signature*)
       (set! *server-signature* (tt:mk-signature *toppath*)))
   (lambda (indat)
     (api:register-thread (current-thread))
-    (let* (;; (indat      (deserialize))
-	   (newcount   (+ *api-process-request-count* 1))
-	   (numthreads (api:get-count-threads-alive))
-	   (delay-wait (if (> newcount 10)
-			   (- newcount 10)
-			   0))
-	   (normal-proc (lambda (cmd run-id params)
-			  (case cmd
-			    ((ping) *server-signature*)
-			    (else
-			     (api:dispatch-request dbstruct cmd run-id params))))))
-      (set! *api-process-request-count* newcount)
-      (set! *db-last-access* (current-seconds))
-      (if (not (eq? newcount numthreads))
-	  (begin
-	    (api:remove-dead-or-terminated)
-	    (let ((threads-now (api:get-count-threads-alive)))
-	      (debug:print 0 *default-log-port* "WARNING: newcount="newcount", numthreads="numthreads", remaining="threads-now)
-	      (set! newcount threads-now))))
-      (match indat
-	((cmd run-id params meta)
-	 (let* ((db-ok  (let* ((dbfname (dbmod:run-id->dbfname run-id))
-			       (ok      (equal? dbfname (dbr:dbstruct-dbfname dbstruct))))
-			  (case cmd
-			    ((ping) #t) ;; we are fine
-			    (else
-			     (if (not ok)(debug:print 0 *default-log-port* "ERROR: "cmd", run-id "run-id", not correct for dbfname "(dbr:dbstruct-dbfname dbstruct)))
-			     (assert ok "FATAL: database file and run-id not aligned.")))))
-		(ttdat   *server-info*)
-		(server-state (tt-state ttdat))
-		(status  (cond
-			  ((> newcount 3) 'busy)
-			  ;; ((> newcount 5) 'loaded) ;; this gets transmitted to the client which calls tt:backoff-incr to slow stuff down.
-			  (else 'ok)))
-		(errmsg  (case status
-			   ((busy)   (conc "Server overloaded, "newcount" threads in flight"))
-			   ((loaded) (conc "Server loaded, "newcount" threads in flight"))
-			   (else     #f)))
-		(result  (case status
-			   ((busy)
-			    (if (eq? cmd 'ping)
-				(normal-proc cmd run-id params)
-				;; newcount must be greater than 5 for busy
-				(* 1 (- newcount 3)) ;; was 15
-				)) ;; (- newcount 29)) ;; call back in as many seconds
-			   ((loaded)
-;; 			    (if (eq? (rmt:transport-mode) 'tcp)
-;; 				(thread-sleep! 0.5))
-			    (normal-proc cmd run-id params))
-			   (else
-			    (normal-proc cmd run-id params))))
-		(meta   (case cmd
-			  ((ping) `((sstate . ,server-state)))
-			  (else   `((wait . ,delay-wait)))))
-		(payload (list status errmsg result meta)))
-	   (set! *api-process-request-count* (- *api-process-request-count* 1))
-	   ;; (serialize payload)
-	   (api:unregister-thread (current-thread))
-	   payload))
-	(else
-	 (assert #f "FATAL: failed to deserialize indat "indat))))))
-       
+    (let* ((result 
+	    (let* ((numthreads (api:get-count-threads-alive))
+		   (delay-wait (if (> numthreads 10)
+				   (- numthreads 10)
+				   0))
+		   (normal-proc (lambda (cmd run-id params)
+				  (case cmd
+				    ((ping) *server-signature*)
+				    (else
+				     (api:dispatch-request dbstruct cmd run-id params))))))
+	      (set! *api-process-request-count* numthreads)
+	      (set! *db-last-access* (current-seconds))
+;; 	      (if (not (eq? numthreads numthreads))
+;; 	      (begin
+;; 	      (api:remove-dead-or-terminated)
+;; 	      (let ((threads-now (api:get-count-threads-alive)))
+;; 	      (debug:print 0 *default-log-port* "WARNING: numthreads="numthreads", numthreads="numthreads", remaining="threads-now)
+;; 	      (set! numthreads threads-now))))
+	      (match indat
+		     ((cmd run-id params meta)
+		      (let* ((start-t (current-milliseconds))
+			     (db-ok  (let* ((dbfname (dbmod:run-id->dbfname run-id))
+					    (ok      (equal? dbfname (dbr:dbstruct-dbfname dbstruct))))
+				       (case cmd
+					 ((ping) #t) ;; we are fine
+					 (else
+					  (assert ok "FATAL: database file and run-id not aligned.")))))
+			     (ttdat   *server-info*)
+			     (server-state (tt-state ttdat))
+			     (maxthreads   20) ;; make this a parameter?
+			     (status  (cond
+				       ((and (> numthreads maxthreads)
+					     (> (random 100) 70)) ;; allow a 30% probability to go through so we can figure out what is going wrong in main.db server.
+					'busy)
+				       ;; ((> numthreads 5) 'loaded) ;; this gets transmitted to the client which calls tt:backoff-incr to slow stuff down.
+				       (else 'ok)))
+			     (errmsg  (case status
+					((busy)   (conc "Server overloaded, "numthreads" threads in flight"))
+					((loaded) (conc "Server loaded, "numthreads" threads in flight"))
+					(else     #f)))
+			     (result  (case status
+					((busy)
+					 (if (eq? cmd 'ping)
+					     (normal-proc cmd run-id params)
+					     ;; numthreads must be greater than 5 for busy
+					     (* 0.1 (- numthreads maxthreads)) ;; was 15
+					     )) ;; (- numthreads 29)) ;; call back in as many seconds
+					((loaded)
+					 ;; 			    (if (eq? (rmt:transport-mode) 'tcp)
+					 ;; 				(thread-sleep! 0.5))
+					 (normal-proc cmd run-id params))
+					(else
+					 (normal-proc cmd run-id params))))
+			     (meta   (case cmd
+				       ((ping) `((sstate . ,server-state)))
+				       (else   `((wait . ,delay-wait)))))
+			     (payload (list status errmsg result meta)))
+			;; (cmd run-id params meta)
+			(db:add-stats cmd run-id params (- (current-milliseconds) start-t))
+			payload))
+		     (else
+		      (assert #f "FATAL: failed to deserialize indat "indat))))))
+      ;; (set! *api-process-request-count* (- *api-process-request-count* 1))
+      ;; (serialize payload)
+     
+      (api:unregister-thread (current-thread))
+      result)))
+
+
+
+(define *api-halt-writes* #f)
 
 (define (api:dispatch-request dbstruct cmd run-id params)
   (if (not *no-sync-db*)
       (db:open-no-sync-db))
+  (let* ((start-time (current-milliseconds)))
+    (if (member cmd api:write-queries)
+	(let loop ()
+	  (if *api-halt-writes*
+	      (begin
+		(thread-sleep! 0.2)
+		(if (< (- (current-milliseconds) start-time)
+		       5000) ;; hope it don't take more than five seconds to sync
+		    (loop-time)
+		    #;(debug:print 0 *default-log-port* "ERROR: writes halted for more than 5 seconds, sync might be taking too long"))))))
+    (db:add-stats 'api-write-blocking-for-sync run-id params (- (current-milliseconds) start-time)))
   (case cmd
     ;;===============================================
     ;; READ/WRITE QUERIES
     ;;===============================================
 
     ((get-keys-write)                        (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl
     
     ;; SERVERS
-    ((start-server)                    (apply server:kind-run params))
+    ((start-server)                    (apply tt:server-process-run params))
     ((kill-server)                     (set! *server-run* #f))
 
     ;; TESTS
 
     ;;((test-set-state-status-by-id)     (apply mt:test-set-state-status-by-id dbstruct params))
@@ -513,43 +480,5 @@
     ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))
     (else
      (debug:print 0 *default-log-port* "ERROR: bad api call " cmd)
      (conc "ERROR: BAD api call " cmd))))
 
-;; http-server  send-response
-;;                 api:process-request
-;;                    db:*
-;;
-;; NB// Runs on the server as part of the server loop
-;;
-(define (api:process-request dbstruct $) ;; the $ is the request vars proc
-  (debug:print 4 *default-log-port* "server-id:"  *server-id*)
-  (let* ((cmd     ($ 'cmd))
-	 (paramsj ($ 'params))
-         (key     ($ 'key))   
-	 (params  (db:string->obj paramsj transport: 'http))) ;; incoming data from the POST (or is it a GET?)
-    (debug:print 4 *default-log-port* "cmd:" cmd " with params " params "key " key)
-    (if (equal? key *server-id*)
-      (begin
-        (set! *api-process-request-count* (+ *api-process-request-count* 1))
- 	(let* ((resdat  (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result )
-	       (success (vector-ref resdat 0))
-	       (res     (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?)
-	  (debug:print 4 *default-log-port* "res:" res)
-	  (if (not success)
-	      (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params))
-	  (if (> *api-process-request-count* *max-api-process-requests*)
-	      (set! *max-api-process-requests* *api-process-request-count*))
-	  (set! *api-process-request-count* (- *api-process-request-count* 1))
-	  ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds
-	  ;; (rmt:dat->json-str
-	  ;;  (if (or (string? res)
-	  ;;          (list?   res)
-	  ;;          (number? res)
-	  ;;          (boolean? res))
-	  ;;      res 
-	  ;;      (list "ERROR, not string, list, number or boolean" 1 cmd params res)))))
-	  (db:obj->string res transport: 'http)))
-	(begin
-	  (debug:print 0 *default-log-port*   "Server refused to process request. Server id mismatch. recived " key " expected:  " *server-id* ".\nOther arguments recived: cmd=" cmd " params = " params) 
-	  (db:obj->string (conc "Server refused to process request server-id mismatch: " key ", " *server-id*) transport: 'http)))))
-

Index: archive.scm
==================================================================
--- archive.scm
+++ archive.scm
@@ -359,11 +359,11 @@
 	(archive-dir  (if archive-info (cdr archive-info) #f))
 	(archive-id   (if archive-info (car archive-info) -1))
         (home-host    (server:choose-server *toppath* 'homehost))
         (archive-time (seconds->std-time-str (current-seconds)))
         (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time))
-        (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db"))
+        (tmp-db-path (conc (dbfile:make-tmpdir-name *toppath* "") "/megatest.db"))
         (dbfile             (conc  archive-staging-db "/megatest.db"))) 
         (create-directory archive-staging-db #t)
         (let-values (((pid-val exit-status exit-code) (run-n-wait rsync-exe params: (list "-v" (conc (car home-host) ":"tmp-db-path) archive-staging-db) print-cmd: print-prefix)))
             (if (eq? exit-code 0)   
               (case archiver

Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -21,10 +21,11 @@
 (declare (unit common))
 (declare (uses commonmod))
 (declare (uses rmtmod))
 (declare (uses debugprint))
 (declare (uses mtargs))
+        
 
 (use srfi-1 data-structures posix regex-case (prefix base64 base64:)
      format dot-locking csv-xml z3 udp ;; sql-de-lite
      hostinfo md5 message-digest typed-records directory-utils stack
      matchable regex posix (srfi 18) extras ;; tcp 
@@ -153,14 +154,10 @@
 (define *time-zero* (current-seconds)) ;; for the watchdog
 (define *on-exit-procs* '()) ;; add procs to this list to be executed on exit
 (define *default-area-tag* "local")
 
 ;; DATABASE
-;; (define *dbstruct-dbs*         #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this.
-;; db stats
-(define *db-stats*            (make-hash-table)) ;; hash of vectors < count duration-total >
-(define *db-stats-mutex*      (make-mutex))
 ;; db access
 (define *db-last-access*      (current-seconds)) ;; last db access, used in server
 ;; (define *db-write-access*     #t)
 ;; db sync
 ;; (define *db-last-sync*        0)                 ;; last time the sync to megatest.db happened
@@ -181,11 +178,10 @@
 ;; (define *max-cache-size*    0)
 (define *logged-in-clients* (make-hash-table))
 (define *server-id*         #f)
 (define *server-info*       #f)  ;; good candidate for easily convert to non-global
 (define *time-to-exit*      #f)
-(define *server-run*        #t)
 (define *run-id*            #f)
 (define *server-kind-run*   (make-hash-table))
 (define *home-host*         #f)
 ;; (define *total-non-write-delay* 0)
 (define *heartbeat-mutex*   (make-mutex))
@@ -247,11 +243,11 @@
 (define *common:this-exe-fullpath* (common:get-this-exe-fullpath))
 (define *common:this-exe-dir*      (pathname-directory *common:this-exe-fullpath*))
 (define *common:this-exe-name*     (pathname-strip-directory *common:this-exe-fullpath*))
 
 (define (common:get-sync-lock-filepath)
-  (let* ((tmp-area     (common:get-db-tmp-area))
+  (let* ((tmp-area     (common:make-tmpdir-name *toppath* ""))
          (lockfile     (conc tmp-area "/megatest.db.lock")))
     lockfile))
 
 (define *common:logpro-exit-code->status-sym-alist*
   '( ( 0 . pass )
@@ -1533,11 +1529,11 @@
 ;;
 (define (common:lazy-modification-time fpath)
   (handle-exceptions
       exn
     (begin
-      (debug:print 0 *default-log-port* "Failed to get modifcation time for " fpath ", treating it as zero. exn=" exn)
+      (debug:print 2 *default-log-port* "Failed to get modification time for " fpath ", treating it as zero. exn=" exn)
       0)
     (if (file-exists? fpath)
 	(file-modification-time fpath)
 	0)))
 
@@ -2280,11 +2276,11 @@
 (define (common:check-db-dir-space)
   (let* ((required (string->number 
                     ;; default is 1GB (or actually a billion bytes) This is the number of 1 kB blocks.
 		    (or (configf:lookup *configdat* "setup" "dbdir-space-required")
 			"1000000")))
-	 (dbdir    (common:get-db-tmp-area)) ;; (db:get-dbdir))
+	 (dbdir    (common:make-tmpdir-name *toppath* "")) ;; (db:get-dbdir))
 	 (tdbspace (common:check-space-in-dir dbdir required))
 	 (mdbspace (common:check-space-in-dir *toppath* required)))
     (sort (list tdbspace mdbspace) (lambda (a b)
 				     (< (cadr a)(cadr b))))))
 

Index: commonmod.scm
==================================================================
--- commonmod.scm
+++ commonmod.scm
@@ -160,10 +160,17 @@
 	'()))) ;; should it return empty list or #f to indicate not set?
 
 
 (define (get-section cfgdat section)
   (hash-table-ref/default cfgdat section '()))
+
+(define (common:make-tmpdir-name areapath tmpadj)
+  (let* ((area (pathname-file areapath))
+         (dname (conc "/tmp/"(current-user-name)"/megatest_localdb/" area "/" (string-translate areapath "/" ".") tmpadj "/.mtdb")))
+    (unless (directory-exists? dname)
+      (create-directory dname #t))
+    dname))
 
 ;; dot-locking egg seems not to work, using this for now
 ;; if lock is older than expire-time then remove it and try again
 ;; to get the lock
 ;;

Index: dashboard-tests.scm
==================================================================
--- dashboard-tests.scm
+++ dashboard-tests.scm
@@ -463,11 +463,11 @@
 
 ;;======================================================================
 ;;
 ;;======================================================================
 (define (dashboard-tests:examine-test run-id test-id) ;; run-id run-key origtest)
-  (let* ((db-path       (db:dbfile-path)) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/db/" run-id ".db"))
+  (let* ((db-path       (common:make-tmpdir-name *toppath* "")) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/db/" run-id ".db"))
 	 (dbstruct      #f) ;; NOT USED
 	 (testdat        (rmt:get-test-info-by-id run-id test-id)) ;; (db:get-test-info-by-id dbstruct run-id test-id))
 	 (db-mod-time   0) ;; (file-modification-time db-path))
 	 (last-update   0) ;; (current-seconds))
 	 (request-update #t))

Index: dashboard.scm
==================================================================
--- dashboard.scm
+++ dashboard.scm
@@ -402,12 +402,12 @@
     (dboard:setup-tabdat dat)
     (dboard:setup-num-rows dat)
     dat))
 
 (define (dboard:setup-tabdat tabdat)
-  (dboard:tabdat-dbdir-set! tabdat (db:dbfile-path)) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/.db"))
-  (dboard:tabdat-dbfpath-set! tabdat (db:dbfile-path))
+  (dboard:tabdat-dbdir-set! tabdat (common:make-tmpdir-name *toppath* "")) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/.db"))
+  (dboard:tabdat-dbfpath-set! tabdat (common:make-tmpdir-name *toppath* ""))
   (dboard:tabdat-monitor-db-path-set! tabdat (conc (dboard:tabdat-dbdir tabdat) "/monitor.db"))
 
 
   ;; HACK ALERT: this is a hack, please fix.
   (dboard:tabdat-ro-set! tabdat (not (file-read-access? (dboard:tabdat-dbfpath tabdat))))
@@ -928,16 +928,16 @@
 		  (hash-table-set!    (dboard:tabdat-allruns-by-id tabdat) run-id run-struct))
 	      (if (or (null? tal)
 		      (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update
 		  (begin
 		    (when (> elapsed-time 2)   
-                      (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")
+                      (debug:print 2 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")
                       (let* ((old-val (iup:attribute *tim* "TIME"))
                              (new-val (number->string (inexact->exact (floor (* 2  (string->number old-val)))))))
                         (if (< (string->number new-val) 5000)
                             (begin
-			      (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
+			      (debug:print 2 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
 			      (iup:attribute-set! *tim* "TIME" new-val)))))
 		    (dboard:tabdat-allruns-set! tabdat new-res)
 		    maxtests)
 		  (if (> (dboard:rundat-run-data-offset run-struct) 0)
 		      (loop run tal new-res newmaxtests) ;; not done getting data for this run
@@ -2404,20 +2404,21 @@
                               )
                             
                              )) "runs-summary-click-callback"))))
 	 (runs-summary-updater  
           (lambda ()
-	    (mutex-lock! update-mutex)
+	    ;; (mutex-lock! update-mutex)
             (if  (or (dashboard:database-changed? commondat tabdat context-key: 'runs-summary-updater)
                      (dboard:tabdat-view-changed tabdat))
                  (debug:catch-and-dump
                   (lambda () ;; check that run-matrix is initialized before calling the updater
 		    (if run-matrix 
 			(dashboard:runs-summary-updater commondat tabdat tb cell-lookup run-matrix)))
                   "dashboard:runs-summary-updater")
                  )
-	    (mutex-unlock! update-mutex)))
+	    #;(mutex-unlock! update-mutex)
+	    ))
          (runs-summary-control-panel (dashboard:runs-summary-control-panel tabdat))
          )
     (dboard:commondat-add-updater commondat runs-summary-updater tab-num: tab-num)
     (dboard:tabdat-runs-tree-set! tabdat tb)
     (iup:vbox
@@ -3112,11 +3113,11 @@
      (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: "
 		  ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir ", exn=" exn)
      (current-seconds)) ;; something went wrong - just print an error and return current-seconds
    (common:max (map (lambda (filen)
 		      (file-modification-time filen))
-		    (glob (conc dbdir "/*.db*"))))))
+		    (cons (conc dbdir "/main.db") (glob (conc dbdir "/?.db")))))))
 
 (define (dashboard:monitor-changed? commondat tabdat)
   (let* ((run-update-time (current-seconds))
 	 (monitor-db-path (dboard:tabdat-monitor-db-path tabdat))
 	 (monitor-modtime (if (and monitor-db-path (common:file-exists? monitor-db-path))
@@ -3344,14 +3345,14 @@
 	(vch (dboard:tabdat-view-changed tabdat)))
     (if (and cnv dwg vch)
 	(begin
 	  (vg:drawing-xoff-set! dwg (dboard:tabdat-xadj tabdat))
 	  (vg:drawing-yoff-set! dwg (dboard:tabdat-yadj tabdat))
-	  (mutex-lock! mtx)
+	  ;; (mutex-lock! mtx)
 	  (canvas-clear! cnv)
 	  (vg:draw dwg tabdat)
-	  (mutex-unlock! mtx)
+	  ;; (mutex-unlock! mtx)
 	  (dboard:tabdat-view-changed-set! tabdat #f)))))
   
 ;; doesn't work.
 ;;
 ;;(define (gotoescape tabdat escape)
@@ -3631,17 +3632,17 @@
 			       (graph-uly  (- (calc-y 0) canvas-margin))
 			       (sec-per-50pt (/ 50 timescale))
 			       )
 			  ;; (print "timeoffset: " timeoffset " timescale: " timescale " run-duration: " (seconds->hr-min-sec run-duration) " width: " width " sec-per-50pt: " sec-per-50pt)
 			  ;; (print "timescale: " timescale " timeoffset: " timeoffset " sizex: " sizex " originx: " originx)
-			  (mutex-lock! mtx)
+			  ;; (mutex-lock! mtx)
 			  (vg:add-comp-to-lib runslib run-full-name runcomp)
 			  ;; Have to keep moving the instantiated box as it is anchored at the lower left
 			  ;; this should have worked for x in next statement? (maptime run-start)
 			  ;; add 60 to make room for the graph
 			  (vg:instantiate drawing "runslib" run-full-name run-full-name 8 (- (calc-y curr-run-start-row) (+ 5 graph-height run-to-run-margin)))
-			  (mutex-unlock! mtx)
+			  ;; (mutex-unlock! mtx)
 			  ;; (set! run-start-row (+ max-row 2))
 			  ;; (dboard:tabdat-start-row-set! tabdat (+ new-run-start-row 1))
 			  ;; get tests in list sorted by event time ascending
 			  (let testsloop ((test-ids  (car hierdat))              ;; loop on tests (NOTE: not items!)
 					  (tests-tal (cdr hierdat))
@@ -3742,13 +3743,13 @@
 				 (outln     (vg:make-rect-obj -5 lly ulx uly 
 							      text: run-full-name
 							      line-color:  (vg:rgb->number  255 0 255 a: 128))))
 					;  (vg:components-get-extents d1 c1)))
 			    ;; this is the box around the run
-			    (mutex-lock! mtx)
+			    ;; (mutex-lock! mtx)
 			    (vg:add-obj-to-comp runcomp outln)
-			    (mutex-unlock! mtx)
+			    ;; (mutex-unlock! mtx)
 			    ;; this is where we have enough info to place the graph
 			    (dboard:graph commondat tabdat tab-num -5 (+ uly 10) ulx (+ uly graph-height 3) run-start run-end timescale maptime run-full-name canvas-margin)
 			    (dboard:tabdat-max-row-set! tabdat (+ (dboard:tabdat-max-row tabdat)(quotient (+ graph-height 40 3) row-height)))
 			    ;; (vg:instance-move drawing run-full-name 0 (dboard:tabdat-max-row tabdat))
 			    ))
@@ -3887,22 +3888,22 @@
 	;; tab-num: 2)
 	(iup:callback-set! *tim*
 			   "ACTION_CB"
 			   (lambda (time-obj)
 			     (let ((update-is-running #f))
-			     (mutex-lock! (dboard:commondat-update-mutex commondat))
-			     (set! update-is-running (dboard:commondat-updating commondat))
-			     (if (not update-is-running)
-			     (dboard:commondat-updating-set! commondat #t))
-			     (mutex-unlock! (dboard:commondat-update-mutex commondat))
-			     (if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
+			       ;; (mutex-lock! (dboard:commondat-update-mutex commondat))
+			       (set! update-is-running (dboard:commondat-updating commondat))
+			       (if (not update-is-running)
+				   (dboard:commondat-updating-set! commondat #t))
+			       ;; (mutex-unlock! (dboard:commondat-update-mutex commondat))
+			       (if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
 			     (begin
 			     (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)
-			     (mutex-lock! (dboard:commondat-update-mutex commondat))
+			     ;; (mutex-lock! (dboard:commondat-update-mutex commondat))
 			     (dboard:commondat-updating-set! commondat #f)
-			     (mutex-unlock! (dboard:commondat-update-mutex commondat)))
-				   ))
+			     ;; (mutex-unlock! (dboard:commondat-update-mutex commondat))
+			     )))
 			     1))))
       ;; (debug:print 0 *default-log-port* "Starting updaters")
       (let ((th1 (make-thread (lambda ()
 				(thread-sleep! 1)
 				(dboard:common-run-curr-updaters commondat 0) ;; force update of summary tab 

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -133,11 +133,11 @@
 	   default)))
    (apply sqlite3:first-result db stmt params)))
 
 (define (db:setup do-sync)
   (assert *toppath* "FATAL: db:setup called before launch:setup has been run.")
-  (let* ((tmpdir (common:get-db-tmp-area)))
+  (let* ((tmpdir (common:make-tmpdir-name *toppath* "")))
     (if (not *dbstruct-dbs*)
 	(dbfile:setup do-sync *toppath* tmpdir)
 	*dbstruct-dbs*)))
 
 ;; moved from dbfile
@@ -267,17 +267,10 @@
 		     ", arguments: " ((condition-property-accessor 'exn 'arguments) exn)
 		     ", location: "  ((condition-property-accessor 'exn 'location)  exn)
 		     ))
 
 
-;; NB// #f => return dbdir only
-;;      (was planned to be;  zeroth db with name=main.db)
-;; 
-;; If run-id is #f return to create and retrieve the path where the db will live.
-;;
-(define db:dbfile-path common:get-db-tmp-area)
-
 (define (db:set-sync db)
   (let ((syncprag (configf:lookup *configdat* "setup" "sychronous")))
     (sqlite3:execute db (conc "PRAGMA synchronous = " (or syncprag 0) ";")))) 
 
 
@@ -467,11 +460,11 @@
 	 (get-mtime shm-file))))
 	 
 ;; (define (db:all-db-sync dbstruct)
 ;;   (let* ((dbdat (db:open-db dbstruct #f db:initialize-main-db))
 ;; 	 (data-synced       0) ;; count of changed records
-;;     (tmp-area       (common:get-db-tmp-area))
+;;     (tmp-area       (common:make-tmpdir-name *toppath*))
 ;;     (dbfiles        (glob (conc tmp-area"/.mtdb/*.db")))
 ;;     (sync-durations (make-hash-table))
 ;;     (no-sync-db        (db:open-no-sync-db)))
 ;;     (for-each
 ;;      (lambda (file) ;; tmp db file
@@ -556,11 +549,11 @@
 ;;  run-ids: '(1 2 3 ...) or #f (for all)
 ;;
 (define (db:multi-db-sync dbstruct . options)
   (let* (;; (dbdat       (db:open-db dbstruct #f dbfile:db-init-proc))
 	 (data-synced 0) ;; count of changed records
-	 (tmp-area       (common:get-db-tmp-area))
+	 (tmp-area       (common:make-tmpdir-name *toppath* ""))
 	 (old2new (member 'old2new options))
 	 (dejunk (member 'dejunk options))
 	 (killservers (member 'killservers options))
 	 (src-area (if old2new *toppath* tmp-area))
 	 (dest-area (if old2new tmp-area *toppath*))
@@ -1254,11 +1247,11 @@
 ;; no-sync.db - small bits of data to be shared between servers
 ;;======================================================================
 
 (define (db:get-dbsync-path)
   (case (rmt:transport-mode)
-    ((http)(common:get-db-tmp-area))
+    ((http)(common:make-tmpdir-name *toppath* ""))
     ((tcp) (conc *toppath*"/.mtdb"))
     ((nfs) (conc *toppath*"/.mtdb"))
     (else "/tmp/dunno-this-gonna-exist")))
 
 ;; This is needed for api.scm
@@ -1580,11 +1573,11 @@
 ;; TODO: Switch this to use max(update_time) from each run db? Then if using a server there is no disk traffic (using cachedb db) ???
 ;;
 ;; NOTE: This DOESN'T (necessarily) get the real run ids, but the number of the <number>.db!!
 
 (define (db:get-changed-run-ids since-time)
-  (let* ((dbdir      (db:dbfile-path)) ;; (configf:lookup *configdat* "setup" "dbdir"))
+  (let* ((dbdir      (common:make-tmpdir-name *toppath* "")) ;; (configf:lookup *configdat* "setup" "dbdir"))
 	 (alldbs     (glob (conc *toppath* "/.mtdb/[0-9]*.db*")))
 	 (changed    (filter (lambda (dbfile)
 			       (> (file-modification-time dbfile) since-time))
 			     alldbs)))
     (delete-duplicates
@@ -2234,21 +2227,24 @@
 		   qry
 		   run-id
 		   (or last-update 0))))))
 
 (define (db:get-testinfo-state-status dbstruct run-id test-id)
-  (let ((res            #f))
-    (db:with-db dbstruct run-id #f
-		(lambda (dbdat db)
-		  (sqlite3:for-each-row
-		   (lambda (run-id testname item-path state status)
-		     ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment
-		     (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-")))
-		   db 
-		   "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;" 
-		   test-id run-id)))
-    res))
+  (db:with-db
+   dbstruct run-id #f
+   (lambda (dbdat db)
+     (let* ((res   #f)
+	    (stmth (db:get-cache-stmth dbdat db "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;")))
+       (sqlite3:for-each-row
+	(lambda (run-id testname item-path state status)
+	  ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment
+	  (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-")))
+	;; db 
+	;; "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;"
+	stmth
+	test-id run-id)
+       res))))
 
 ;; get a useful subset of the tests data (used in dashboard
 ;; use db:mintest-get-{id ,run_id,testname ...}
 ;;
 (define (db:get-tests-for-run-mindata dbstruct run-id testpatt states statuses not-in)
@@ -2637,18 +2633,18 @@
   (db:with-db
    dbstruct
    run-id
    #f
    (lambda (dbdat db)
-     (let ((res   (cons #f #f)))
-;;	   (stmth (db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=?;")))
-       (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test
-	(lambda (state status)
-	  (cons state status))
-	db
-	"SELECT state,status FROM tests WHERE id=? AND run_id=?;" ;; stmth try not compiling this one - yes, this fixed the bind issue
-	test-id run-id)
+     (let ((res   (cons #f #f))
+	   (stmth (db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=? AND run_id=?;")))
+	  (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test
+	   (lambda (state status)
+	     (cons state status))
+	   ;; db
+	   stmth ;;"SELECT state,status FROM tests WHERE id=? AND run_id=?;" ;; stmth try not compiling this one - yes, this fixed the bind issue
+	   test-id run-id)
        res))))
 
 ;; Use db:test-get* to access
 ;; Get test data using test_ids. NB// Only works within a single run!!
 ;;
@@ -3728,25 +3724,35 @@
 	     tags)))
 	db
 	"SELECT testname,tags FROM test_meta")
        (hash-table->alist res)))))
 
+;; testmeta doesn't change, we can cache it for up too an hour
+
+(define *db:testmeta-cache* (make-hash-table))
+(define *db:testmeta-last-update* 0)
+
 ;; read the record given a testname
 (define (db:testmeta-get-record dbstruct testname)
-  (let ((res   #f))
-    (db:with-db
-     dbstruct
-     #f
-     #f
-     (lambda (dbdat db)
-       (sqlite3:for-each-row
-	(lambda (id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup)
-	  (set! res (vector id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup)))
-	db
-	"SELECT id,testname,author,owner,description,reviewed,iterated,avg_runtime,avg_disk,tags,jobgroup FROM test_meta WHERE testname=?;"
-	testname)
-       res))))
+  (if (and (< (- (current-seconds) *db:testmeta-last-update*) 600)
+	   (hash-table-exists? *db:testmeta-cache* testname))
+      (hash-table-ref *db:testmeta-cache* testname)
+      (let ((res   #f))
+	(db:with-db
+	 dbstruct
+	 #f
+	 #f
+	 (lambda (dbdat db)
+	   (sqlite3:for-each-row
+	    (lambda (id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup)
+	      (set! res (vector id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup)))
+	    db
+	    "SELECT id,testname,author,owner,description,reviewed,iterated,avg_runtime,avg_disk,tags,jobgroup FROM test_meta WHERE testname=?;"
+	    testname)))
+	(hash-table-set! *db:testmeta-cache* testname res)
+	(set! *db:testmeta-last-update* (current-seconds))
+	res)))
 
 ;; create a new record for a given testname
 (define (db:testmeta-add-record dbstruct testname)
   (db:with-db dbstruct #f #t
 	      (lambda (dbdat db)
@@ -4314,11 +4320,11 @@
         ))))
 
 ;; sync for filesystem local db writes
 ;;
 (define (db:run-lock-and-sync no-sync-db)
-  (let* ((tmp-area       (common:get-db-tmp-area))
+  (let* ((tmp-area       (common:make-tmpdir-name *toppath* ""))
 	 (dbfiles        (glob (conc tmp-area"/.mtdb/*.db")))
 	 (sync-durations (make-hash-table)))
     ;; (debug:print-info 0 *default-log-port* "lock-and-sync, dbfiles: "dbfiles)
     (for-each
      (lambda (file)
@@ -4370,11 +4376,11 @@
 	(sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300))
  	(debug-mode         (debug:debug-mode 1))
  	(last-time          (current-seconds))     ;; last time through the sync loop
  	(no-sync-db         (db:open-no-sync-db))
  	(sync-duration      0)  ;; run time of the sync in milliseconds
-	(tmp-area           (common:get-db-tmp-area)))
+	(tmp-area           (common:make-tmpdir-name *toppath* "")))
     ;; Sync moved to http-transport keep-running loop
     (debug:print-info 2 *default-log-port* "Periodic copy-based sync thread started. syncer is copy-sync, tmp-area is " tmp-area)
     (debug:print-info 3 *default-log-port* "watchdog starting. syncer is copy-sync pid="(current-process-id));;  " this-wd-num="this-wd-num)
     
     (if (and legacy-sync (not *time-to-exit*))
@@ -4478,11 +4484,11 @@
  		       (for-each
  			(lambda (subdb)
  			  (let* (;;(dbstruct (db:setup))
  				 (mtdb       (dbr:subdb-mtdb subdb))
  				 (mtpath     (db:dbdat-get-path mtdb))
- 				 (tmp-area   (common:get-db-tmp-area))
+ 				 (tmp-area   (common:make-tmpdir-name *toppath* ""))
  				 (res        (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive
  			    (set! sync-duration (- (current-milliseconds) sync-start))
  			    (if (> res 0) ;; some records were transferred, keep the db alive
  				(begin
  				  (mutex-lock! *heartbeat-mutex*)
@@ -4525,11 +4531,10 @@
 ;; 	       ;; time to exit, close the no-sync db here
 ;; 	       (db:no-sync-close-db no-sync-db stmt-cache)
 	       (if (common:low-noise-print 30)
 		   (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) )))) 
 ))
-
 
 (define (std-exit-procedure)
   ;;(common:telemetry-log-close)
   (on-exit (lambda () 0)) ;; why is this here?
   ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)

Index: dbfile.scm
==================================================================
--- dbfile.scm
+++ dbfile.scm
@@ -242,11 +242,12 @@
           #f
   )
 )
 
 (define (dbfile:make-tmpdir-name areapath tmpadj)
-  (let* ((dname (conc "/tmp/"(current-user-name)"/" (string-translate areapath "/" ".") tmpadj)))
+  (let* ((area (pathname-file areapath))
+         (dname (conc "/tmp/"(current-user-name)"/megatest_localdb/" area "/" (string-translate areapath "/" ".") tmpadj "/.mtdb")))
     (unless (directory-exists? dname)
       (create-directory dname #t))
     dname))
 
 (define (dbfile:run-id->path apath run-id)
@@ -487,11 +488,11 @@
 ;; NOTE: this is already protected by mutex *no-sync-db-mutex*
 ;;
 (define (dbfile:raw-open-no-sync-db dbpath)
   (if (not (file-exists? dbpath))
       (create-directory dbpath #t))
-  (debug:print-info 0 *default-log-port* "Opening "dbpath"/no-sync.db")
+  (debug:print-info 2 *default-log-port* "(dbfile:raw-open-no-sync-db: Opening "dbpath"/no-sync.db")
   (let* ((dbname    (conc dbpath "/no-sync.db"))
 	 (db-exists (file-exists? dbname))
 	 (init-proc (lambda (db)
 		      (sqlite3:with-transaction
 		       db
@@ -525,18 +526,19 @@
                                  reason TEXT DEFAULT 'none',
                                    CONSTRAINT no_sync_processes UNIQUE (host,pid));"
 			    ))))))
 	 (on-tmp      (equal? (car (string-split dbpath "/")) "tmp"))
 	 (db        (if on-tmp
-			(dbfile:cautious-open-database dbname init-proc 0 "WAL" force-init: #t)
-			(dbfile:cautious-open-database dbname init-proc 0 #f    force-init: #t)
+			(dbfile:cautious-open-database dbname init-proc 1 "WAL" force-init: #t) ;; WAL MODE should use syncronous=1
+			;; (dbfile:cautious-open-database dbname init-proc 0 #f    force-init: #t)
+			(dbfile:cautious-open-database dbname init-proc 0 "MEMORY" force-init: #t) ;; Journal mode = memory is fastest?
 			;; (sqlite3:open-database dbname)
 			)))
-    (if on-tmp	      ;; done in cautious-open-database
-	(begin
-	  (sqlite3:execute db "PRAGMA synchronous = 0;")
-	  (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000))))
+    ;; (if on-tmp	      ;; done in cautious-open-database
+    ;;    (begin
+    ;; (sqlite3:execute db "PRAGMA synchronous = 0;") ;; why was this here when is is handled by cautious-open-database?
+    (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)) ;; ))
     db))
 
 ;; mtest processes registry calls
 
 (define (dbfile:insert-or-update-process nsdb dat)
@@ -580,18 +582,20 @@
 		   host port pid starttime endtime status purpose dbname mtversion))
 
 (define (dbfile:set-process-status nsdb host pid newstatus)
   (sqlite3:execute nsdb "UPDATE processes SET status=? WHERE host=? AND pid=?;" newstatus host pid))
 
+;; as sorted should be stable. can use to choose "winner"
+;;
 (define (dbfile:get-process-options nsdb purpose dbname)
   (sqlite3:fold-row
    ;; host port pid starttime status mtversion
    (lambda (res . row)
      (cons row res))
    '()
    nsdb
-   "SELECT host,port,pid,starttime,endtime,status,mtversion FROM processes WHERE purpose=? AND dbname LIKE ? AND status='alive';"
+   "SELECT host,port,pid,starttime,endtime,status,mtversion FROM processes WHERE purpose=? AND dbname LIKE ? AND status IN ('running','alive') ORDER BY starttime ASC,host,port;"
    purpose dbname))
 
 (define (dbfile:get-process-info nsdb host pid)
   (let ((res (sqlite3:fold-row
 	      ;; host port pid starttime status mtversion
@@ -602,17 +606,25 @@
 	      "SELECT host,port,pid,starttime,endtime,status,purpose,dbname,mtversion FROM processes WHERE host=? AND pid=?;"
 	      host pid)))
     (if (null? res)
 	#f
 	(car res))))
+
+(define (dbfile:row->procinf row)
+  (match row
+   ((host port pid starttime endtime status mtversion)
+    (make-procinf host: host port: port pid: pid starttime: starttime endtime: endtime status: status mtversion: mtversion))
+   (else
+    (debug:print 0 *default-log-port* "ERROR: row "row" did not match host,port,pid,starttime,endtime,status,mtversion")
+    #f)))
 
 (define (dbfile:set-process-done nsdb host pid reason)
-  (sqlite3:execute nsdb "UPDATE processes SET status='ended',endtime=?,reason=? WHERE host=? AND pid=?;" (current-seconds) reason host pid)
+  (sqlite3:execute nsdb "UPDATE processes SET status='done',endtime=?,reason=? WHERE host=? AND pid=?;" (current-seconds) reason host pid)
   (dbfile:cleanup-old-entries nsdb))
 
 (define (dbfile:cleanup-old-entries nsdb)
-  (sqlite3:execute nsdb "DELETE FROM process WHERE status='ended' AND endtime<?;" (- (current-seconds) (* 3600 48))))
+  (sqlite3:execute nsdb "DELETE FROM process WHERE status='done' AND endtime<?;" (- (current-seconds) (* 3600 48))))
 
 ;; other no-sync functions
 
 (define (dbfile:with-no-sync-db dbpath proc)
   (mutex-lock! *no-sync-db-mutex*)
@@ -675,20 +687,26 @@
 ;;    fails    returns  (#f lock-creation-time identifier)
 ;;    succeeds (returns (#t lock-creation-time identifier)
 ;; use (db:no-sync-del! db keyname) to release the lock
 ;;
 (define (db:no-sync-get-lock-with-id db keyname identifier)
+  (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: db: " db " keyname: " keyname " identifier: " identifier)
   (sqlite3:with-transaction
    db
    (lambda ()
      (condition-case
       (let* ((curr-val (db:no-sync-get/default db keyname #f)))
+        (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: curr-val: " curr-val)
 	(if curr-val
 	    (match (db:extract-time-identifier curr-val) ;; result->timestamp, identifier
 	       ((timestamp . ident)
 		(cons (equal? ident identifier) timestamp))
-	       (else (cons #f 'malformed-lock)))  ;; lock malformed
+	       (else 
+                (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: malformed lock")
+                (cons #f 'malformed-lock)
+                )
+            )  ;; lock malformed
 	    (let ((curr-sec (current-seconds))
 		  (lock-value (if identifier
 				  (conc (current-seconds)"+"identifier)
 				  (current-seconds))))
 	      (sqlite3:execute db "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-value)
@@ -1572,7 +1590,15 @@
 			    ;; (db:hoh-set! stmt-cache db stmt newstmth)
 			    (hash-table-set! stmt-cache stmt newstmth)
 			    newstmth))))
     (mutex-unlock! *get-cache-stmth-mutex*)
     result))
+
+;; (define *mutex-stmth-call* (make-mutex))
+;; 
+;; (define (db:with-mutex-for-stmth proc)
+;;   (mutex-lock! *mutex-stmth-call*)
+;;   (let* ((res (proc)))
+;;     (mutex-unlock! *mutex-stmth-call*)
+;;     res))
 
 )

Index: dbmod.scm
==================================================================
--- dbmod.scm
+++ dbmod.scm
@@ -31,10 +31,11 @@
 	chicken
 	data-structures
 	extras
 	files
 
+	format
 	(prefix sqlite3 sqlite3:)
 	matchable
 	posix
 	typed-records
 	srfi-1
@@ -87,19 +88,19 @@
 ;; The cachedb one-db file per server method goes in here
 ;;======================================================================
 
 ;; NOTE: the r/w is now w/r, #t=db modified by query, #f=db NOT modified by query
 (define (dbmod:with-db dbstruct run-id w/r proc params)
-  (let* ((use-mutex (or (and w/r ;; use the mutex on queries that modify the db and for sync to disk
-			     (> *api-process-request-count* 5)) ;; when writes are happening throttle more
-			(> *api-process-request-count* 50)))
+  (let* ((use-mutex w/r) ;; (or (and w/r ;; use the mutex on queries that modify the db and for sync to disk
+			     ;; (> *api-process-request-count* 5)) ;; when writes are happening throttle more
+			;; (> *api-process-request-count* 50)))
 	 (dbdat     (dbmod:open-db dbstruct run-id (dbfile:db-init-proc)))
 	 (dbh       (dbr:dbdat-dbh dbdat)) ;; this will be the cachedb handle
 	 (dbfile    (dbr:dbdat-dbfile dbdat)))
     ;; if nfs mode do a sync if delta > 2
-    (let* ((last-update (dbr:dbstruct-last-update dbstruct))
-	   (sync-proc   (dbr:dbstruct-sync-proc dbstruct))
+    #;(let* ((last-update (dbr:dbstruct-last-update dbstruct))
+	   ;; (sync-proc   (dbr:dbstruct-sync-proc dbstruct))
 	   (curr-secs   (current-seconds)))
       (if (> (- curr-secs last-update) 5)
 	  (begin
 	    (sync-proc last-update)
 
@@ -198,11 +199,11 @@
   (let* ((dbstruct     (or dbstruct-in (make-dbr:dbstruct areapath: areapath)))
 	 (dbfname      (or dbfname-in (dbmod:run-id->dbfname run-id)))
 	 (dbpath       (dbmod:get-dbdir dbstruct))             ;; directory where all the .db files are kept
 	 (dbfullname   (conc dbpath"/"dbfname)) ;; (dbmod:run-id->full-dbfname dbstruct run-id))
 	 (dbexists     (file-exists? dbfullname))
-	 (tmpdir       (dbfile:make-tmpdir-name areapath tmpadj))
+	 (tmpdir       (common:make-tmpdir-name areapath tmpadj))
 	 (tmpdb        (let* ((fname (conc tmpdir"/"dbfname)))
 			 fname))
 	 (cachedb        (dbmod:open-cachedb-db init-proc
 					    ;; (if (eq? (dbfile:cache-method) 'cachedb)
 					    ;; 	#f
@@ -225,50 +226,21 @@
     (dbr:dbstruct-dbfname-set!   dbstruct dbfname)
     (dbr:dbstruct-sync-proc-set! dbstruct
 				 (lambda (last-update)
 				   (if *sync-in-progress*
 				       (debug:print 3 *default-log-port* "WARNING: overlapping calls to sync to disk")
-				       (let* ((syncer-logfile    (conc areapath"/logs/"dbfname"-syncer.log"))
-					      (sync-cmd          (if (eq? syncdir 'todisk)
-								     (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "tmpdb" -to "dbfullname" -period 5 -timeout 10 > /dev/null 2&>1)&")
-								     (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "dbfullname" -to "tmpdb" -period 5 -timeout 10 > /dev/null 2&>1)&")))
-					      (synclock-file     (conc dbfullname".lock"))
-					      (syncer-running-file (conc dbfullname"-sync-running"))
-					      (synclock-mod-time (if (file-exists? synclock-file)
-								     (handle-exceptions
-									 exn
-								       #f
-								       (file-modification-time synclock-file))
-								     #f))
-					      (thethread         (lambda ()
-								   (thread-start!
-								    (make-thread
-								     (lambda ()
-								       (set! *sync-in-progress* #t)
-								       (debug:print-info "Running "sync-cmd)
-								       (if (file-exists? syncer-running-file)
-									   (debug:print-info 0 *default-log-port* "Syncer still running, skipping syncer start.")
-									   (system sync-cmd))
-								       (set! *sync-in-progress* #f)))))))
-					 (if ((if (eq? syncdir 'todisk) < >) ;; use less than for todisk, greater than for from disk
-					      (file-modification-time tmpdb)
-					      (file-modification-time dbfullname))
-					     (debug:print 4 *default-log-port* "Skipping sync, "tmpdb" older than "dbfullname)
-					     (if synclock-mod-time
-						 (if (> (- (current-seconds) synclock-mod-time) 20) ;; something wrong with sync, remove file
-						     (begin
-						       (handle-exceptions
-							   exn
-							 #f
-                                                         (begin
-                                                           (debug:print 0 *default-log-port* "Sync lock file " synclock-file "is older than 20 seconds ("  synclock-mod-time " seconds). Removing it")
-							   (delete-file synclock-file)
-                                                         )
-                                                       )
-						       (thethread))
-						     (debug:print 0 *default-log-port* "Skipping sync, lockfile "synclock-file" found."))
-						 (thethread)))))))
+				       (begin
+					 ;; turn off writes - send busy or block?
+					 ;; call db2db internally
+					 ;; turn writes back on
+					 ;;
+					 (set! *api-halt-writes* #t) ;; do we need a mutex?
+					 ;; (dbmod:db-to-db-sync src-db dest-db last-update (dbfile:db-init-proc) keys)
+					 (debug:print-info 2 *default-log-port* "Internal sync running from "tmpdb" to "dbfullname)
+					 (dbmod:db-to-db-sync tmpdb dbfullname last-update (dbfile:db-init-proc) keys)
+					 (set! *api-halt-writes* #f)
+					 ))))
     ;; (dbmod:sync-tables tables #f db cachedb)
     ;; 
     (thread-sleep! 1) ;; let things settle before syncing in needed data
     (dbmod:sync-gasket tables #f cachedb db dbfullname 'fromdest keys) ;; ) ;; load into cachedb
     (dbr:dbstruct-last-update-set! dbstruct (+ (current-seconds) -10)) ;; should this be offset back in time by one second?
@@ -856,6 +828,94 @@
 		   (res    (dbmod:sync-gasket tables last-update sdb ddb dest-db 'todisk keys)))
 	      (sqlite3:finalize! sdb)
 	      (sqlite3:finalize! ddb)
 	      res)))
       #f))
+
+;; ======================================================================
+;; dbstats
+;;======================================================================
+
+;; (define *dbstruct-dbs*         #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this.
+;; db stats
+(define *db-stats*            (make-hash-table)) ;; hash of vectors < count duration-total >
+(define *db-stats-mutex*      (make-mutex))
+
+(define (rmt:print-db-stats)
+  (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f"
+    (debug:print 0 *default-log-port* "DB Stats\n========")
+    (debug:print 0 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg"))
+    (for-each (lambda (cmd)
+		(let* ((dat     (hash-table-ref *db-stats* cmd))
+		       (count   (dbstat-cnt dat))
+		       (tottime (dbstat-tottime dat)))
+		  (debug:print 0 *default-log-port*
+			       (format #f fmtstr cmd count tottime
+				       (/ tottime count)))))
+	      (sort (hash-table-keys *db-stats*)
+		    (lambda (a b)
+		      (> (dbstat-tottime (hash-table-ref *db-stats* a))
+			 (dbstat-tottime (hash-table-ref *db-stats* b))))))))
+
+(defstruct dbstat
+  (cnt 0)
+  (tottime 0))
+
+(define (db:add-stats cmd run-id params delta)
+  (let* ((modified-cmd (if (eq? cmd 'general-call)
+			   (string->symbol (conc "general-call-" (car params)))
+			   cmd))
+	 (rec          (hash-table-ref/default *db-stats* modified-cmd #f)))
+    (if (not rec)
+	(let ((new-rec  (make-dbstat)))
+	  (hash-table-set! *db-stats* modified-cmd new-rec)
+	  (set! rec new-rec)))
+    (dbstat-cnt-set! rec (+ (dbstat-cnt rec) 1))
+    (dbstat-tottime-set! rec (+ (dbstat-tottime rec) delta))))
+    
+
+
 )
+
+
+;; ATTIC
+
+					 #;(let* ((syncer-logfile    (conc areapath"/logs/"dbfname"-syncer.log"))
+					      (sync-cmd          (if (eq? syncdir 'todisk)
+								     (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "tmpdb" -to "dbfullname" -period 5 -timeout 10 > /dev/null 2&>1)&")
+								     (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "dbfullname" -to "tmpdb" -period 5 -timeout 10 > /dev/null 2&>1)&")))
+					      (synclock-file     (conc dbfullname".lock"))
+					      (syncer-running-file (conc dbfullname"-sync-running"))
+					      (synclock-mod-time (if (file-exists? synclock-file)
+								     (handle-exceptions
+									 exn
+								       #f
+								       (file-modification-time synclock-file))
+								     #f))
+					      (thethread         (lambda ()
+								   (thread-start!
+								    (make-thread
+								     (lambda ()
+								       (set! *sync-in-progress* #t)
+								       (debug:print-info "Running "sync-cmd)
+								       (if (file-exists? syncer-running-file)
+									   (debug:print-info 0 *default-log-port* "Syncer still running, skipping syncer start.")
+									   (system sync-cmd))
+								       (set! *sync-in-progress* #f)))))))
+					 (if ((if (eq? syncdir 'todisk) < >) ;; use less than for todisk, greater than for from disk
+					      (file-modification-time tmpdb)
+					      (file-modification-time dbfullname))
+					     (debug:print 4 *default-log-port* "Skipping sync, "tmpdb" older than "dbfullname)
+					     (if synclock-mod-time
+						 (if (> (- (current-seconds) synclock-mod-time) 20) ;; something wrong with sync, remove file
+						     (begin
+						       (handle-exceptions
+							   exn
+							 #f
+                                                         (begin
+                                                           (debug:print 0 *default-log-port* "Sync lock file " synclock-file "is older than 20 seconds ("  synclock-mod-time " seconds). Removing it")
+							   (delete-file synclock-file)
+                                                         )
+                                                       )
+						       (thethread))
+						     (debug:print 0 *default-log-port* "Skipping sync, lockfile "synclock-file" found."))
+						 (thethread))))

Index: docs/manual/Makefile
==================================================================
--- docs/manual/Makefile
+++ docs/manual/Makefile
@@ -37,10 +37,13 @@
 #	dos2unix megatest_manual.html
 
 megatest_manual.pdf : megatest_manual.txt *.txt *png *.dot
 	a2x -a toc -f pdf megatest_manual.txt
 
+%.pdf : %.dot
+	dot -Tpdf $*.dot -o$*.pdf
+
 server.ps : server.dot
 	dot -Tps server.dot > server.ps
 
 client.ps : client.dot
 	dot -Tps client.dot > client.ps

Index: docs/manual/bisecting.png
==================================================================
--- docs/manual/bisecting.png
+++ docs/manual/bisecting.png
cannot compute difference between binary files

Index: docs/manual/megatest-test-stages.png
==================================================================
--- docs/manual/megatest-test-stages.png
+++ docs/manual/megatest-test-stages.png
cannot compute difference between binary files

Index: docs/manual/megatest_manual.html
==================================================================
--- docs/manual/megatest_manual.html
+++ docs/manual/megatest_manual.html
@@ -1,10 +1,10 @@
 <!DOCTYPE html>
 <html lang="en">
 <head>
 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
-<meta name="generator" content="AsciiDoc 9.0.0rc1">
+<meta name="generator" content="AsciiDoc 10.2.0">
 <title>The Megatest Users Manual</title>
 <style type="text/css">
 /* Shared CSS for AsciiDoc xhtml11 and html5 backends */
 
 /* Default font. */
@@ -769,13 +769,13 @@
 <h1>The Megatest Users Manual</h1>
 <span id="author">Matt Welland</span><br>
 <span id="email" class="monospaced">&lt;<a href="mailto:matt@kiatoa.com">matt@kiatoa.com</a>&gt;</span><br>
 <span id="revnumber">version 1.5,</span>
 <span id="revdate">June 2020</span>
-<div id="toc">
-  <div id="toctitle">Table of Contents</div>
-  <noscript><p><b>JavaScript must be enabled in your browser to display the table of contents.</b></p></noscript>
+<div id="toc">
+  <div id="toctitle">Table of Contents</div>
+  <noscript><p><b>JavaScript must be enabled in your browser to display the table of contents.</b></p></noscript>
 </div>
 </div>
 <div id="content">
 <div class="sect1">
 <h2 id="_preface">Preface</h2>
@@ -3016,11 +3016,11 @@
 <div class="listingblock">
 <div class="title">example for backreference (eg: item <span class="monospaced">foo23/thud</span> will imply waiton&#8217;s item <span class="monospaced">num-23/bar/thud</span></div>
 <div class="content monospaced">
 <pre>#
 # ## Example
-# ## can use \{number} in replacement pattern to backreference a (capture) from matching pattern similar to sed or perl
+# ## can use {number} in replacement pattern to backreference a (capture) from matching pattern similar to sed or perl
 [requirements]
 mode itemwait
 # itemmap &lt;item pattern for this test&gt;  &lt;item replacement pattern for waiton test&gt;
 itemmap foo(\d+)/ num-\1/bar/</pre>
 </div></div>
@@ -3758,10 +3758,10 @@
 <div id="footnotes"><hr></div>
 <div id="footer">
 <div id="footer-text">
 Version 1.5<br>
 Last updated
- 2023-01-23 11:18:29 EST
+ 2023-11-24 20:56:43 EST
 </div>
 </div>
 </body>
 </html>

Index: docs/manual/megatest_manual.pdf
==================================================================
--- docs/manual/megatest_manual.pdf
+++ docs/manual/megatest_manual.pdf
cannot compute difference between binary files

Index: docs/manual/server.dot
==================================================================
--- docs/manual/server.dot
+++ docs/manual/server.dot
@@ -12,67 +12,68 @@
 //     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 //     GNU General Public License for more details.
 // 
 //     You should have received a copy of the GNU General Public License
 //     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+
 digraph G {
 
-    subgraph cluster_1 {
-        node [style=filled,shape=box];
-
-	check_available_queue       -> remove_entries_over_10s_old;
-	remove_entries_over_10s_old -> set_available [label="num_avail < 3"];
-	remove_entries_over_10s_old -> exit [label="num_avail > 2"];
-
-	set_available               -> delay_2s;
-	delay_2s          -> check_place_in_queue;
-
-	check_place_in_queue        -> "http:transport-launch" [label="at head"];
-	check_place_in_queue        -> exit [label="not at head"];
-
-	"client:login"              -> "server:shutdown" [label="login failed"];
-	"server:shutdown"           -> exit;	
-
-	subgraph cluster_2 {
-		"http:transport-launch"       -> "http:transport-run";
-		"http:transport-launch"       -> "http:transport-keep-running";
-
-		"http:transport-keep-running" -> "tests running?";
-		"tests running?"              -> "client:login" [label=yes];
-		"tests running?"              -> "server:shutdown" [label=no];
-		"client:login"                -> delay_5s [label="login ok"];
-		delay_5s                      -> "http:transport-keep-running";
-	}
-
-	// start_server -> "server_running?";
-	// "server_running?" -> set_available [label="no"];
-	// "server_running?" -> delay_2s [label="yes"];
-	// delay_2s -> "still_running?";
-	// "still_running?" -> ping_server [label=yes];
-	// "still_running?" -> set_available [label=no];
-	// ping_server -> exit [label=alive];
-	// ping_server -> remove_server_record [label=dead];
-	// remove_server_record -> set_available;
-	// set_available -> avail_delay [label="delay 3s"];
-	// avail_delay -> "first_in_queue?";
-	// 
-	// "first_in_queue?" -> set_running [label=yes];
-	// set_running -> get_next_port -> handle_requests;
-	// "first_in_queue?" -> "dead_entry_in_queue?" [label=no];
-	// "dead_entry_in_queue?" -> "server_running?" [label=no];
-	// "dead_entry_in_queue?" -> "remove_dead_entries" [label=yes];
-	// remove_dead_entries -> "server_running?";
-	// 
-	// handle_requests -> start_shutdown [label="no traffic\nno running tests"];
-	// handle_requests -> shutdown_request;
-	// start_shutdown -> shutdown_delay;
-	// shutdown_request -> shutdown_delay;
-	// shutdown_delay -> exit;
-	
-        label = "server:launch";
-        color=brown;
-    }
-
-//     client_start_server -> start_server;
-//     handle_requests -> read_write;
-//     read_write -> handle_requests;
-}
+    label = "Server Start Sequences";
+    color=brown;
+    rankdir="TB";
+
+    subgraph cluster_1 {
+	label="Find Prime Main Server";
+	
+        node [style=filled,shape=box];
+	
+	START;
+        HaveServ [label="Look at .servinfo\nfiles for prime main"]; 
+	AskPrime [label="Ask Prime for main"];
+	PingPrime [label="Ping Prime"];
+	AskPrime [label="Ask .servinfo prime for server"];
+	StartServ [label="Launch Server Process for main.db"];
+	
+      	START -> HaveServ;
+	HaveServ -> PingPrime;
+	PingPrime -> AskPrime [label="Got response"];
+	PingPrime -> StartServ [label="No reponse"];
+	HaveServ -> StartServ [label="No files"];
+	StartServ -> "Delay 2s" -> START;
+        AskPrime -> DONE;
+    }
+    
+    subgraph cluster_2 {
+	label="Starting non-prime server"
+        node [style=filled,shape=box];
+	StartTCPServer [label="Start tcp server"];
+	FindPrimeMain [label="Find Prime Main Server"];
+	RegisterProcessViaPrime [label="Register process via prime server"];
+
+	StartTCPServer -> FindPrimeMain -> START;
+	DONE -> RegisterProcessViaPrime -> READY;
+    }
+
+    subgraph cluster_3 {
+	label="Start Prime Main"
+        node [style=filled,shape=box];
+	StartTCPServer_prime [label="Start tcp server"];
+	GetServInfoFiles [label="Get servinfo files"];
+	CreateServInfoFile [label="Create servinfo file"];
+	RegisterProcess [label="Register process in no-sync (direct access)"];
+	ValidateServInfoFiles [label="Validate servinfo files with ping\nremove any files which do not respond to ping"];
+
+	CheckHost [label="Verify that current host matches\nexisting servinfo files host"]
+	StartTCPServer_prime -> GetServInfoFiles;
+	GetServInfoFiles -> CreateServInfoFile [label="No servinfo\nfiles"];
+	GetServInfoFiles -> ValidateServInfoFiles;
+	ValidateServInfoFiles -> CreateServInfoFile [label="No valid files"];
+	CreateServInfoFile -> GetServInfoFiles [label="servinfo file created"];
+	KeepRunning [label="READY"];
+		     
+	ValidateServInfoFiles -> CheckHost;
+	CheckHost -> RegisterProcess [label="Have valid\nservinfo files and same host"];
+	RegisterProcess -> KeepRunning;
+	CheckHost -> EXIT [label="Not same host"];	
+    }
+}
+

Index: docs/manual/server.png
==================================================================
--- docs/manual/server.png
+++ docs/manual/server.png
cannot compute difference between binary files

Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -236,11 +236,10 @@
 
     (let loop ((minutes   (calc-minutes))
 	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
 	       (disk-free (get-df (current-directory)))
                (last-sync (current-seconds)))
-      ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
       (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
              (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                    (delta (abs (- load cpu-load))))
                               (if (> delta 0.1) ;; don't bother updating with small changes
                                   load
@@ -256,15 +255,16 @@
              (do-sync       (or new-cpu-load new-disk-free over-time))
 
              (test-info   (rmt:get-test-state-status-by-id run-id test-id))
              (state       (car test-info));; (db:test-get-state test-info))
              (status      (cdr test-info));; (db:test-get-status test-info))
+	     (killreq     (equal? state "KILLREQ"))
              (kill-reason  "no kill reason specified")
              (kill-job?    #f))
         ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
         (cond
-         ((test-get-kill-request run-id test-id)
+         (killreq
           (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
           (set! kill-job? #t))
          ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
           (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
           (set! kill-job? #t))
@@ -276,16 +276,11 @@
 
         (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
         (if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty
 	    (launch:handle-zombie-tests run-id))
         (when do-sync
-          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
-          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
-          ;; (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
-          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
-          ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))
-	  )
+          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
         
 	(if kill-job? 
 	    (begin
               (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
 	      (mutex-lock! m)
@@ -331,18 +326,17 @@
 		      )))
 	      (mutex-unlock! m)
 	      ;; no point in sticking around. Exit now. But run end of run before exiting?
               (launch:end-of-run-check run-id)
 	      (exit)))
-	(if (hash-table-ref/default misc-flags 'keep-going #f)
+	(if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
 	    (begin
-	      (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
-	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
-		  (loop (calc-minutes)
-                        (or new-cpu-load cpu-load)
-                        (or new-disk-free disk-free)
-                        (if do-sync (current-seconds) last-sync)))))))
+	      (thread-sleep! 6) ;; was 3
+	      (loop (calc-minutes)
+		    (or new-cpu-load cpu-load)
+		    (or new-disk-free disk-free)
+		    (if do-sync (current-seconds) last-sync))))))
     (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
 
 
 (define (launch:execute encoded-cmd)
   (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))

Index: megatest-version.scm
==================================================================
--- megatest-version.scm
+++ megatest-version.scm
@@ -18,6 +18,6 @@
 ;; Always use two or four digit decimal
 ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..
 
 ;; (declare (unit megatest-version))
 
-(define megatest-version 1.8017)
+(define megatest-version 1.8022)

Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -968,12 +968,13 @@
 	   (tl         (launch:setup))
 	   (keys       (keys:config-get-fields *configdat*)))
       (case (rmt:transport-mode)
 	((tcp)
 	 (let* ((timeout    (server:expiration-timeout)))
-	   (debug:print 0 *default-log-port* "INFO: Running using tcp method with server timeout of "timeout)
+	   (debug:print 0 *default-log-port* "INFO: Starting server for " dbfname " using tcp method with server timeout of "timeout)
 	   (tt-server-timeout-param timeout)
+	   (thread-start! (make-thread api:print-db-stats "print-db-stats"))
 	   (if dbfname
 	       (tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys)
 	       (begin
 		 (debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -db is required.")
 		 (exit 1)))))
@@ -1091,10 +1092,14 @@
               sfiles
             )
           )
        )
        dbfiles
+     )
+     ;; remove this db, because otherwise metadata contains records for old servers, and this causes a problem with db:no-sync-get-lock-with-id.
+     (if (file-exists? (conc *toppath* "/.mtdb/no-sync.db"))
+       (delete-file (conc *toppath* "/.mtdb/no-sync.db"))
      )
      (set! *didsomething* #t)
      (exit)  
   )
 )
@@ -2132,13 +2137,13 @@
              (exit 1)))
          (if (common:file-exists? (conc  *toppath* "/megatest.db"))
              (begin  
                (debug:print-info 1 *default-log-port* "File " (conc  *toppath* "/megatest.db") " already exists. Please remove it before trying to replicate db")
                (exit 1)))
-         (if (and (common:get-db-tmp-area) (> (length (directory   (common:get-db-tmp-area) #f)) 0))
+         (if (and (common:make-tmpdir-name *toppath* "") (> (length (directory   (common:make-tmpdir-name *toppath* "") #f)) 0))
            (begin
-           (debug:print-info 1 *default-log-port* (common:get-db-tmp-area) " not empty. Please remove it before trying to replicate db")
+           (debug:print-info 1 *default-log-port* (common:make-tmpdir-name *toppath* "") " not empty. Please remove it before trying to replicate db")
            (exit 1)))    
           ;; check if timestamp 
           (let* ((source (args:get-arg "-source"))
                 (src     (if (not (equal? (substring source 0 1) "/"))
                              (conc (current-directory) "/" source)

Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -68,82 +68,56 @@
 
 ;;======================================================================
 
 (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id
 
-;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
-;;
-(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected
+;; how to make area-dat
+(define (rmt:set-ttdat areapath ttdat)
+  (if ttdat
+      ttdat
+      (let* ((newremote  (make-and-init-remote areapath)))
+	(set! *ttdat* newremote)
+	newremote)))
+
+;; NB// area-dat replaced by ttdat
+;; 
+(define (rmt:send-receive cmd run-id params #!key (attemptnum 1)(ttdat #f))
   (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.")
-
-  (if (not (eq? (rmt:transport-mode) 'nfs))
-      (begin
-	(if (> attemptnum 2)
-	    (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum))
-	
-	(cond
-	 ((> attemptnum 2) (thread-sleep! 0.05))
-	 ((> attemptnum 10) (thread-sleep! 0.5))
-	 ((> attemptnum 20) (thread-sleep! 1)))
-	
-	;; I'm turning this off, it may make sense to move it
-	;; into http-transport-handler
-	(if (and (> attemptnum 5) (= 0 (modulo attemptnum 15)))  
-	    (begin
-	      (debug:print 0 *default-log-port* "ERROR: can't connect to server, trying to start a server.")
-	      (case (rmt:transport-mode)
-		((http)
-		 (server:run *toppath*)
-		 (thread-sleep! 3))
-		(else
-		 (thread-sleep! 1) ;; for tcp the server is started by routines in tcp-transportmod. For nfs there is no server
-		 ))))))
-  
-  ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
-  ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
-  ;; 3. do the query, if on homehost use local access
-  ;;
   (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
          (areapath      *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
-	 (runremote     (or area-dat
-			    *runremote*))
          (attemptnum    (+ 1 attemptnum))
-	 (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*))
+	 (readonly-mode (rmtmod:calc-ro-mode ttdat *toppath*))
 	 (testsuite     (common:get-testsuite-name))
-	 (mtexe         (common:find-local-megatest)))
-
-    (case (rmt:transport-mode)
-      ((http)(http-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode))
-      ((tcp) (tcp-transport-handler  runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe))
-      ((nfs) (nfs-transport-handler  runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe))
-      )))
-
-(define (nfs-transport-handler  runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
-  (let* ((keys     (common:get-fields *configdat*))
-	 (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath tmpadj: "/dashboard")))
-    (api:dispatch-request dbstruct cmd run-id params)))
-	 
-(define (tcp-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
-  (if (not runremote)
-      (let* ((newremote  (make-and-init-remote areapath)))
-	(set! *runremote* newremote)
-	(set! runremote newremote)))
-  (let* ((dbfname (conc (dbfile:run-id->dbnum run-id)".db"))) ;;(dbfile:run-id->path areapath run-id)))
-    (tt:handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))
-	
-(define (rmt:print-db-stats)
-  (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f"
-    (debug:print 18 *default-log-port* "DB Stats\n========")
-    (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg"))
-    (for-each (lambda (cmd)
-		(let ((cmd-dat (hash-table-ref *db-stats* cmd)))
-		  (debug:print 18 *default-log-port* (format #f fmtstr cmd (vector-ref cmd-dat 0) (vector-ref cmd-dat 1) (/ (vector-ref cmd-dat 1)(vector-ref cmd-dat 0))))))
-	      (sort (hash-table-keys *db-stats*)
-		    (lambda (a b)
-		      (> (vector-ref (hash-table-ref *db-stats* a) 0)
-			 (vector-ref (hash-table-ref *db-stats* b) 0)))))))
-
+	 (mtexe         (common:find-local-megatest))
+	 (dbfname       (conc (dbfile:run-id->dbnum run-id)".db"))
+	 (ttdat         (rmt:set-ttdat areapath ttdat))
+	 (conn          (tt:get-conn ttdat dbfname))
+	 (is-main       (equal? dbfname "main.db")) ;; why not (not run-id) ?
+	 (server-start-proc (if is-main
+				#f
+				(lambda ()
+				  ;; (debug:print-info 0 *default-log-port* "starting server for dbfname: "dbfname)
+				  (rmt:start-server ;; tt:server-process-run
+				   areapath
+				   testsuite ;; (dbfile:testsuite-name)
+				   mtexe
+				   run-id)))))
+    ;; here we look at ttdat, if dbfname is NOT main.db we check that a conn exists for it
+    ;; and if there is no conn we first send a request to the main.db server to start a
+    ;; server for the dbfname.
+    #;(if (and (not is-main)(not conn)) ;; no existing connection to non-main server, call in a start up request
+	(begin
+	  (server-start-proc)
+	  (thread-sleep! 1)))
+    (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc)))
+
+;; KEEP THIS HERE FOR A WHILE, WE MAY WANT TO RESURECT IT
+;; (define (nfs-transport-handler  runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
+;;   (let* ((keys     (common:get-fields *configdat*))
+;; 	 (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath tmpadj: "/dashboard")))
+;;     (api:dispatch-request dbstruct cmd run-id params)))
+	
 (define (rmt:get-max-query-average run-id)
   (mutex-lock! *db-stats-mutex*)
   (let* ((runkey (conc "run-id=" run-id " "))
 	 (cmds   (filter (lambda (x)
 			   (substring-index runkey x))
@@ -167,11 +141,11 @@
     (mutex-unlock! *db-stats-mutex*)
     res))
 
 (define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5))
   (let* ((qry-is-write    (not (member cmd api:read-only-queries)))
-	 (db-file-path    (db:dbfile-path)) ;;  0))
+	 (db-file-path    (common:make-tmpdir-name *toppath* "")) ;;  0))
 	 (dbstructs-local (db:setup #t))
 	 (read-only       (not (file-write-access? db-file-path)))
 	 (start           (current-milliseconds))
 	 (resdat          (if (not (and read-only qry-is-write))
 			      (let ((v (api:execute-requests dbstructs-local (vector (symbol->string cmd) params))))
@@ -204,11 +178,11 @@
 	  ;; (rmt:update-db-stats run-id cmd params duration)
 	  ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it
 	  (if qry-is-write
 	      (let ((start-time (current-seconds)))
 		(mutex-lock! *db-multi-sync-mutex*)
-/		(set! *db-last-access* start-time)  ;; THIS IS PROBABLY USELESS? (we are on a client)
+		(set! *db-last-access* start-time)  ;; THIS IS PROBABLY USELESS? (we are on a client)
                 (mutex-unlock! *db-multi-sync-mutex*)))))
     res))
 
 ;;======================================================================
 ;;
@@ -221,12 +195,12 @@
 ;;======================================================================
 
 (define (rmt:kill-server run-id)
   (rmt:send-receive 'kill-server run-id (list run-id)))
 
-(define (rmt:start-server run-id)
-  (rmt:send-receive 'start-server 0 (list run-id)))
+(define (rmt:start-server areapath testsuite mtexe run-id) ;; run on main.db server
+  (rmt:send-receive 'start-server #f (list areapath testsuite mtexe run-id)))
 
 ;;======================================================================
 ;;  M I S C
 ;;======================================================================
 
@@ -235,16 +209,16 @@
 
 ;; This login does no retries under the hood - it acts a bit like a ping.
 ;; Deprecated for nmsg-transport.
 ;;
 ;; (define (rmt:login-no-auto-client-setup runremote)
-;;   (rmt:send-receive-no-auto-client-setup runremote 'login 0 (list *toppath* megatest-version (client:get-signature))))
+;;   (rmt:send-receive-no-auto-client-setup runremote 'login #f (list *toppath* megatest-version (client:get-signature))))
 
 
 ;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host
 (define (rmt:get-latest-host-load hostname)
-  (rmt:send-receive 'get-latest-host-load 0 (list hostname)))
+  (rmt:send-receive 'get-latest-host-load #f (list hostname)))
 
 (define (rmt:sdb-qry qry val run-id)
   ;; add caching if qry is 'getid or 'getstr
   (rmt:send-receive 'sdb-qry run-id (list qry val)))
 

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -2064,11 +2064,11 @@
     ;;
     ;; There is now a single call to runs:update-all-test_meta and this 
     ;; per-test call is not needed. Given the delicacy of the move to 
     ;; v1.55 this code is being left in place for the time being.
     ;;
-    (if (not (hash-table-ref/default *test-meta-updated* test-name #f))
+    (if (not (hash-table-exists? *test-meta-updated* test-name))
         (begin
           (hash-table-set! *test-meta-updated* test-name #t)
           (runs:update-test_meta test-name test-conf)))
     
     ;; itemdat => ((ripeness "overripe") (temperature "cool") (season "summer"))

Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -728,11 +728,11 @@
 ;; 	#t
 ;; 	#f)))
 
 ;; timeout is hms string: 1h 5m 3s, default is 1 minute
 ;; This is currently broken. Just use the number of hours with no unit.
-;; Default is 60 seconds.
+;; Default is 600 seconds.
 ;;
 (define (server:expiration-timeout)
   (let* ((tmo (configf:lookup *configdat* "server" "timeout")))
     (if (string? tmo)
 	(let* ((num (string->number tmo)))

Index: tasks.scm
==================================================================
--- tasks.scm
+++ tasks.scm
@@ -84,11 +84,11 @@
 	     (tasks:open-db numretries (- numretries 1)))
 	   (begin
 	     (print-call-chain (current-error-port))
 	     (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
 	     (debug:print 5 *default-log-port* " exn=" (condition->list exn))))
-       (let* ((dbpath        (db:dbfile-path )) ;; (tasks:get-task-db-path))
+       (let* ((dbpath        (common:make-tmpdir-name *toppath* "")) ;; (tasks:get-task-db-path))
 	      (dbfile       (conc dbpath "/monitor.db"))
 	      (avail        (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away
 	      (exists       (common:file-exists? dbpath))
 	      (write-access (file-write-access? dbpath))
 	      (mdb          (cond ;; what the hek is *toppath* doing here?

Index: tcp-transportmod.scm
==================================================================
--- tcp-transportmod.scm
+++ tcp-transportmod.scm
@@ -111,10 +111,11 @@
 ;;
 (define tt-server-timeout-param (make-parameter 600))
 
 ;; make ttdat visible
 (define *server-info* #f)
+(define *server-run*  #t)
 
 (define (tt:make-remote areapath)
   (make-tt areapath: areapath))
 
 ;; 1 ... or #f
@@ -125,33 +126,43 @@
   (and (or (number? run-id)
 	   (not run-id))
        (equal? (dbfile:run-id->dbfname run-id) dbfname)))
 
 (tcp-buffer-size 2048)
-;; (max-connections 4096) 
+;; (max-connections 4096)
+
+(define (tt:get-conn ttdat dbfname)
+  (hash-table-ref/default (tt-conns ttdat) dbfname #f))
 
 ;; do all the busy work of finding and setting up conn for
 ;; connecting to a server
 ;; 
-(define (tt:client-connect-to-server ttdat dbfname run-id testsuite)
+(define (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)
   (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id)
-  (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f))
-	 (server-start-proc (lambda ()
-			      (tt:server-process-run
-			       (tt-areapath ttdat)
-			       testsuite ;; (dbfile:testsuite-name)
-			       (common:find-local-megatest)
-			       run-id))))
+  (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id)
+  (let* ((conn              (tt:get-conn ttdat dbfname))
+	 (server-start-proc (or server-start-proc
+				(lambda ()
+				  (assert (equal? dbfname "main.db") ;; only main.db is started here
+					  "FATAL: called server-start-proc for db other than main.db")
+				  (tt:server-process-run
+				   (tt-areapath ttdat)
+				   testsuite ;; (dbfile:testsuite-name)
+				   (common:find-local-megatest)
+				   run-id)))))
     (if conn
 	(begin 
-          ; (debug:print-info 0 *default-log-port* "already connected to the server")
+          (debug:print-info 2 *default-log-port* "already connected to a server")
            conn) ;; we are already connected to the server
-	(let* ((sdat (tt:get-current-server-info ttdat dbfname)))
+	(let* ((sdats (tt:get-server-info-sorted ttdat dbfname))
+	       (sdat  (if (null? sdats)
+			  #f
+			  (car sdats))))
 	  (match sdat
 	    ((host port start-time server-id pid dbfname2 servinffile)
 	     (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.")
-             ;(debug:print-info 0 *default-log-port* "in match servinffile:" servinffile)
+             (debug:print-info 2 *default-log-port* "no conn - in match servinffile:" servinffile)
 	     (let* ((host-port (conc host":"port))
 		    (conn (make-tt-conn
 			   host: host
 			   port: port
 			   host-port: host-port
@@ -162,36 +173,52 @@
 			   pid: pid)))
 	       ;; verify we can talk to this server
 	       (let* ((result   (tt:timed-ping host port server-id))
 		      (ping-res (car result))
 		      (ping     (cdr result)))
-                 (debug:print-info 0 *default-log-port* "ping time: " ping)
+                 (debug:print-info 2 *default-log-port* "host " host " port " port " ping time: " ping " result " ping-res)
 		 (case ping-res
 		   ((running)
+                    (debug:print-info 2 *default-log-port* "Setting conn = " conn " in hash table")
 		    (hash-table-set! (tt-conns ttdat) dbfname conn) ;;; is this ok to save before validating that the connection is good?
 		    conn)
 		   ((starting)
 		    (thread-sleep! 0.5)
-		    (tt:client-connect-to-server ttdat dbfname run-id testsuite))
+                    (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect")
+		    (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))
 		   (else
 		    (let* ((curr-secs (current-seconds)))
+		      (if (not ping-res) ;; the server is actually dead, remove the .servinfo file
+			  (begin
+			    (debug:print-info 0 *default-log-port* "Unreachable server at "
+					      host":"port" with servinfo file "servinffile", removing it")
+			    (if (file-exists? servinffile)
+				(handle-exceptions
+				 exn
+				 #f
+				 (delete-file servinffile)))))
 		      ;; rm the (last server) would go here
 		      (if (> (- curr-secs (tt-last-serv-start ttdat)) 10)
 			  (begin
 			    (tt-last-serv-start-set! ttdat curr-secs)
-			    (server-start-proc))) ;; start server if 30 sec since last attempt
+			    (server-start-proc))) ;; start server if 10 sec since last attempt
 		      (thread-sleep! 1)
-		      (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
+                      (debug:print-info 0 *default-log-port* "server ping result was "ping-res" neither running nor starting. Retrying connect")
+		      (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)))))))
 	    (else ;; no good server found, if haven't started server in > 5 secs, start another
-	     (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 5) ;; BUG - grow this number really do not want to swamp the machine with servers
+	     (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers
 		 (begin
-		   (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
+		   (debug:print-info 0 *default-log-port* "Starting server for "dbfname)
 		   (server-start-proc)
-		   (tt-last-serv-start-set! ttdat (current-seconds))))
+		   (tt-last-serv-start-set! ttdat (current-seconds))
+                   (thread-sleep! 3)
+                   ))
 	     (thread-sleep! 1)
-	     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
+             (debug:print-info 0 *default-log-port* "Connect to server for " dbfname)
+	     (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)))))))
 
+;; returns ( result . ping_time )
 (define (tt:timed-ping host port server-id)
   (let* ((start-time (current-milliseconds))
 	 (result     (tt:ping host port server-id)))
     (cons result (- (current-milliseconds) start-time))))
     
@@ -222,18 +249,18 @@
 
 ;; client side handler
 ;;
 ;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
 ;;
-(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)
-  ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.
-  (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f)))
+(define (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc)
+  ;; connect-to-server will start a server if needed.
+  (let* ((areapath (tt-areapath ttdat))
+	 (conn     (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))) ;; looks up conn keyed by dbfname
     (if conn
 	;; have connection, call the server
 	(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
 	  ;; res is (status errmsg result meta)
-         ; (debug:print 0 *default-log-port* "conn:" conn " res: " res)
 	  (match res
 	    ((status errmsg result meta)
 	     (if (list? meta)
 		 (let* ((delay-wait (alist-ref 'delay-wait meta)))
 		   (if (and (number? delay-wait)
@@ -241,35 +268,36 @@
 		       (begin
 			 (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds")
 			 (thread-sleep! delay-wait)))))
 	     (case status
 	       ((busy) ;; result will be how long the server wants you to delay
-		(let* ((dly  (if (number? result) result 0.1)))
-		  (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, will try again in "dly" seconds.")
+		(let* ((raw-dly  (if (number? result) result 0.1))
+		       (dly      (+ raw-dly (/ attemptnum 10)))) ;; (* raw-dly (/ attemptnum 2))))
+		  (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, cmd is "cmd", will try again in "dly" seconds. This is attempt "(- attemptnum 1))
 		  (thread-sleep! dly)
-		  (tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)))
+		  (tt:handler  ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)))
 	       ((loaded)
 		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.")
 		(tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn))
-		result) ;; (tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+		result) ;; (tt:handler  ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe))
 	       (else
 		result)))
 	    (else ;; did not receive properly formated result
-	     (if (not res) ;; tt:handler is telling us that communication failed
+	     (if (not res) ;; tt:send-receive telling us that communication failed
 		 (let* ((host    (tt-conn-host conn))
 			(port    (tt-conn-port conn))
 			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
 			(pid     (tt-conn-pid  conn))
                         ;;(servinf (tt-conn-servinf-file conn))) 
 			(servinf (tt-servinf-file ttdat))) ;; (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname))) ;; TODO, use (server:get-servinfo-dir areapath)
-		   (hash-table-set! (tt-conns ttdat) dbfname #f)
+		   (hash-table-set! (tt-conns ttdat) dbfname #f) ;; clear out the conn for this dbfname to force finding new server
 		   (if (and servinf (file-exists? servinf))
 		       (begin
 			 (if (< attemptnum 10)
 			     (begin
 			       (thread-sleep! 0.5)
-			       (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+			       (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))
 			     (begin
 			       (debug:print 0 *default-log-port* "INFO: no response from server "host":"port" for "dbfname)
 			       (if (and (file-exists? servinf)
 					(> (- (current-seconds)(file-modification-time servinf)) 60))
 				   (begin
@@ -276,33 +304,30 @@
 				     (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.")
 				     (handle-exceptions
 					 exn
 				       #f
 				       (delete-file* servinf))
-				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))
 				   (begin
 				     ;; start server - addressed in client-connect-to-server
 				     ;; delay        - addressed in client-connect-to-server
 				     ;; try again
 				     (thread-sleep! 0.25) ;; dunno, I think this needs to be here
-				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))
 				   ))))
 		       (begin ;; no server file, delay and try again
-			 (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf)
+			 (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", no servinf file. Server exited? ")
 			 (thread-sleep! 0.5)
-			 (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))))
+			 (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))))
 		 (begin ;; this case is where res is malformed. Probably should abort
 		   (assert #f "FATAL: tt:handler received bad data "res)
 		   ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.")
-		   ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)
+		   ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe)
 		   )))))
 	(begin
 	  (thread-sleep! 1) ;; no conn yet set up, give it a rest and try again
-	  (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))))
-
-(define (tt:bid-for-servership run-id)
-  #f)
+	  (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc)))))
 
 ;; gets server info and appends path to server file
 ;; sorts by age, oldest first
 ;;
 ;; returns list of (host port startseconds server-id servinfofile)
@@ -325,24 +350,10 @@
 	   (debug:print 2 *default-log-port* "SERVER #"count": "(string-intersperse (map conc sorted) ", ")))
        (set! count (+ count 1)))
      sorted)
     sorted))
     
-(define (tt:get-current-server-info ttdat dbfname)
-  (assert (tt-areapath ttdat) "FATAL: areapath not set in ttdat.")
-  ;;
-  ;; TODO - replace most of below with tt;get-server-info-sorted
-  ;;
-  (let* ((areapath (tt-areapath ttdat))
-	 (sfiles   (tt:find-server areapath dbfname))
-	 (sdats    (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read
-	 (sorted   (sort sdats (lambda (a b)
-				 (< (list-ref a 2)(list-ref b 2))))))
-    (if (null? sorted)
-	#f  ;; we'll want to wait until extra servers have exited
-	(car sorted))))
-
 (define (tt:send-receive ttdat conn cmd run-id params)
   (let* ((host-port (tt-conn-host-port conn)) ;; (conc (tt-conn-host conn)":"(tt-conn-port conn)))
 	 (host      (tt-conn-host conn))
 	 (port      (tt-conn-port conn))
 	 (dat       (list cmd run-id params #f))) ;; no meta data yet
@@ -457,244 +468,207 @@
 ;; start the listener and start responding to requests
 ;;
 ;; NOTE: organise by dbfname, not run-id so we don't need
 ;;       to pull in more modules
 ;;
-;; This is the routine called in megatest.scm to start a server
+;; This is the routine called in megatest.scm to start a server. NOTE: sequence is different for main.db vs. X.db
 ;;
 ;; Server viability is checked in keep-running. Blindly start and run here.
 ;;
 (define (tt:start-server areapath run-id dbfname-in handler keys)
   (assert areapath "FATAL: areapath not provided for tt:start-server")
-  ;; is there already a server for this dbfile? Then exit.
   (let* ((ttdat   (make-tt areapath: areapath))
-	 (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id)))
-	 (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead
-    (if (> (length servers) 4)
-	(begin
-	  (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.")
-	  (exit))
-	(let* ((dbstruct   (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
-	  (tt-handler-set! ttdat (handler dbstruct))
-	  (let* ((tcp-thread (make-thread
-			      (lambda ()
-				(tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data
-			      "tcp-server-thread"))
-		 (run-thread (make-thread
-			      (lambda ()
-				(tt:keep-running ttdat dbfname dbstruct)))))
-	    (thread-start! tcp-thread)
-	    (thread-start! run-thread)
-
-	    (let* ((areapath     (tt-areapath ttdat))
-		   (nosyncdbpath (conc areapath"/.mtdb")))
-	      ;; this didn't seem to work, is port not available yet?
-	      (let loop ((count 0))
-		(if (tt-port ttdat)
-		    (begin
-		      (procinf-port-set! *procinf* (tt-port ttdat))
-		      (procinf-dbname-set! *procinf* dbfname)
-		      (dbfile:with-no-sync-db
-		       nosyncdbpath
-		       (lambda (nsdb)
-			 (dbfile:insert-or-update-process nsdb *procinf*))))
-		    (if (< count 5)
-			(begin
-			  (thread-sleep! 0.5)
-			  (loop (+ count 1)))
-			(debug:print 0 *default-log-port* "ERROR: (tt-port ttdat) no port set!"))))
-	    
-	      (thread-join! run-thread) ;; run thread will exit on timeout or other conditions
-	      ;; replace with call to (dbfile:set-process-done nsdb host pid reason)
-	      (procinf-status-set! *procinf* "done")
-	      (procinf-end-set! *procinf* (current-seconds))
-	      (dbfile:with-no-sync-db
-	       nosyncdbpath
-	       (lambda (nsdb)
-		 (dbfile:insert-or-update-process nsdb *procinf*)))
-              (debug:print 0 *default-log-port* "Exiting now.")
-	      (exit)))))))
+	 (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))))
+    (set! *server-info* ttdat)
+    (let* ((dbstruct   (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
+      (tt-handler-set! ttdat (handler dbstruct))
+      (let* ((servinf-created #f)
+	     (tcp-thread      (make-thread
+			       (lambda ()
+				 ;; NOTE: tt-port and tt-host are set in connect-listener which is called under tt:start-tcp-server
+				 (tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data
+			       "tcp-server-thread"))
+	     (run-thread      (make-thread
+			       (lambda ()
+				 (tt:keep-running ttdat dbfname dbstruct)))))
+	(thread-start! tcp-thread)
+
+	(let* ((areapath     (tt-areapath ttdat))
+	       (nosyncdbpath (conc areapath"/.mtdb"))
+	       (servers      ;; (tt:find-server areapath dbfname)))
+		(tt:get-server-info-sorted ttdat dbfname)) ;; (host port startseconds server-id servinfofile)
+	       (good-srvrs  
+		;; contact servers via ping, if no response remove the .servinfo file
+		(let loop ((servrs     servers)
+			   (prime-host #f)
+			   (result    '()))
+		  (if (null? servrs)
+		      (reverse result)
+		      (let* ((servdat (car servrs)))
+			(match servdat
+			     ((host port startseconds server-id servinfofile)
+			      (let* ((ping-res  (tt:timed-ping host port server-id))
+				     (good-ping (match ping-res
+						   ((result . ping-time)
+						    (not result)) ;; we couldn't reach the server or it was not a megatest server
+						   (else #f))) ;; the ping failed completely?
+				     (same-host (or (not prime-host) ;; i.e. this is the first host
+						    (equal? prime-host host)))
+				     (keep-srv  (and good-ping same-host)))
+				(if keep-srv	
+				    (loop (cdr servrs)
+					  host
+					  (cons servdat result))
+				    (begin
+				      (handle-exceptions
+				       exn
+				       (debug:print-info 0 *default-log-port* "Error removing server info file: "servinfofile", "
+							 (condition->list exn))
+				       (delete-file* servinfofile))
+				      (loop (cdr servrs) prime-host result)))))
+			     (else
+			      ;; can't delete it as we don't have a filename. NOTE: Should really never get here.
+			      (debug:print-info 0 *default-log-port* "ERROR: bad servinfo record \""servdat"\"")
+			      (loop (cdr servrs) prime-host result)) ;; drop 
+			     )))))
+	       (home-host (if (null? good-srvrs)
+			      #f
+			      (caar good-srvrs))))
+	  ;; by here we have a trustworthy list of servers and we have removed the .servinfo file for any unresponsive servers
+	  ;; and the list is in good-srvrs
+	  (cond
+	   ((not home-host) ;; no servers yet, go ahead and start
+	    (debug:print-info 0 *default-log-port* "No servers yet, starting on "(get-host-name)))
+	   ((> (length good-srvrs) 2) ;; don't need more, just exit
+	    (debug:print-info 0 *default-log-port* "Have "(length good-srvrs)", no need for more, exiting.")
+	    (exit))
+	   ((not (equal? home-host (get-host-name))) ;; there is a home-host and we are not on it
+	    (debug:print-info 0 *default-log-port* "Prime main server is on host "home-host", but we are on host "(get-host-name)", exiting.")
+	    (exit))
+	   (else
+	    (debug:print-info 0 *default-log-port* "Starting on host "(get-host-name)", along with "(length good-srvrs)" other servers.")))
+
+	  ;; this didn't seem to work, is port not available yet?
+	  (let loop ((count 0))
+	    (if (tt-port ttdat)
+		(begin
+		  (procinf-port-set! *procinf* (tt-port ttdat))
+		  (procinf-dbname-set! *procinf* dbfname)
+		  (dbfile:with-no-sync-db
+		   nosyncdbpath
+		   (lambda (nsdb)
+		     (dbfile:insert-or-update-process nsdb *procinf*))))
+		(if (< count 10)
+		    (begin
+		      (thread-sleep! 0.25)
+		      (loop (+ count 1)))
+		    (begin
+		      (debug:print 0 *default-log-port* "ERROR: (tt-port ttdat) no port set! Exiting.")
+		      (exit)))))
+	  
+	  ;; create a servinfo file start keep-running
+	  (tt:create-server-registration-file ttdat dbfname)
+	  (procinf-status-set! *procinf* "running")
+	  (tt-state-set! ttdat 'running)
+	  (dbfile:with-no-sync-db
+	   nosyncdbpath
+	   (lambda (nsdb)
+	     (dbfile:insert-or-update-process nsdb *procinf*)))
+	  (thread-start! run-thread)
+
+	  (thread-join! run-thread) ;; run thread will exit on timeout or other conditions
+	  
+	  ;; (tcp-close (tt-socket ttdat)) ;; close up ports here
+
+	  ;; replace with call to (dbfile:set-process-done nsdb host pid reason)
+	  (procinf-status-set! *procinf* "done")
+	  (procinf-end-set! *procinf* (current-seconds))
+	  ;; either convert this to use set-process-done or get rid of set-process-done
+	  (dbfile:with-no-sync-db
+	   nosyncdbpath
+	   (lambda (nsdb)
+	     (dbfile:insert-or-update-process nsdb *procinf*)))
+	  (debug:print 0 *default-log-port* "Exiting now.")
+	  (exit))))))
 
 (define (tt:keep-running ttdat dbfname dbstruct)
-  ;; verfiy conn for ready
-  ;; listener socket has been started by this stage
-  ;; wait for a port before creating the registration file
-  ;;
-  (let* ((db-locked-in #f)
-	 (areapath     (tt-areapath ttdat))
-	 (nosyncdbpath (conc areapath"/.mtdb"))
-	 (cleanup (lambda ()
-		    (if (tt-cleanup-proc ttdat)
-			((tt-cleanup-proc ttdat)))
-		    (dbfile:with-no-sync-db nosyncdbpath
-					    (lambda (db)
-					      (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct)))
-						(debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname)
-						(db:no-sync-del! db dbfname)
-						#;(if dbtmpname
-						    (delete-file dbtmpname))))))))
-    (set! *server-info* ttdat)
-    (let loop ((count 0))
-      (if (> count 240)
-	  (begin
-	    (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.")
-	    (exit 1))
-	  (if (not (tt-port ttdat)) ;; no connection yet
-	      (begin
-		(thread-sleep! 0.25)
-		(loop (+ count 1))))))
-    
-    (tt:create-server-registration-file ttdat dbfname)
-    ;; now start watching the last-access, if it hasn't been touched
-    ;; in over ten seconds we exit
-    (thread-sleep! 0.05) ;; any real need for delay here?
+  
+  ;; at this point the server is running and responding to calls, we just monitor
+  ;; for db calls and exit if there are none.
+
+  ;; if I am not in the first 3 servers, exit
+  (let* ((start-time (current-seconds)))
     (let loop ()
-      (let* ((servers (tt:get-server-info-sorted ttdat dbfname))
-	     (ok      (cond
-		       ((null? servers) #f) ;; not ok
-		       ((equal? (list-ref (car servers) 6) ;; compare the servinfofile
-				(tt-servinf-file ttdat))
-			(let* ((res (if db-locked-in
-					#t
-					(let* ((lock-result  ;; this is the primary lock - need to double verify that got it
-						(dbfile:with-no-sync-db
-						 nosyncdbpath
-						 (lambda (db)
-						   (db:no-sync-lock-and-check db dbfname
-									      (tt-servinf-file ttdat)
-									      ;; (dbr:dbstruct-dbtmpname dbstruct)
-									      ))))
-					       (success (car lock-result)))
-					  (if success
-					      (begin
-						(tt-state-set! ttdat 'running)
-						(debug:print 0 *default-log-port* "Got server lock for " dbfname)
-						(set! db-locked-in #t)
-						#t)
-					      (begin
-						(debug:print 0 *default-log-port* "Failed to get server lock for "dbfname)
-						#f))))))
-			  (if (and res (common:low-noise-print 120 "top server message"))
-			      (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for "
-						dbfname" on "(tt-host ttdat)":"(tt-port ttdat)))
-			  res))
-		       (else
-			(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
-			(let* ((leadsrv (car servers)))
-			  (match leadsrv
-			    ((host port startseconds server-id pid dbfname servinfofile)
-			     (let* ((result  (tt:timed-ping host port server-id))
-				    (res     (car result))
-				    (ping    (cdr result)))
-			       (debug:print-info 0 *default-log-port* "Ping to "host":"port", with server-id "server-id
-						 ", and file "servinfofile" returned "res)
-			       (if res
-				   #f ;; not the server, but all good, want to exit
-				   (if (and (file-exists? servinfofile)
-					  (> (- (current-seconds)(file-modification-time servinfofile)) 30))
-				     (begin
-				       ;; can't ping and file has been on disk 15 seconds, go ahead and try to remove it
-				       (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile)
-                                       (handle-exceptions
-                                        exn
-                                        (debug:print-info 0 *default-log-port* "Error removing server info file: "servinfofile)
-				        (delete-file* servinfofile)
-                                       )
-				       #t) ;; not the server but the server is not reachable
-				     (begin
-				       (debug:print 0 *default-log-port* "I'm not the server but could not ping "host":"port", will try again.")
-				       (thread-sleep! 1) ;; just because
-				       #t)))))
-			    (else ;; should never get here
-			     (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv)
-			     (assert #f "Bad server record "leadsrv))))))))
+      (let* ((servers   (tt:get-server-info-sorted ttdat dbfname))
+	     (home-host (if (null? servers)
+			    #f
+			    (caar servers)))
+	     (my-index  (list-index (lambda (x)
+				      (equal? (list-ref x 6)
+					      (tt-servinf-file ttdat)))
+				    servers))
+	     (ok         (cond
+			  ((not *server-run*)
+			   (debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.")
+			   #f)
+			  ((null? servers)
+			   (debug:print 0 *default-log-port* "WARNING: no servinfo files found, this cannot be.")
+			   #f) ;; not ok
+			  ((> my-index 2)
+			   (debug:print 0 *default-log-port* "WARNING: there are more than two servers ahead of me, I'm not needed, exiting.")
+			   #f) ;; not ok to not be in first three
+			  ((eq? (tt-state ttdat) 'running) #t) ;; we are good to keep going
+			  ((> (- (current-seconds) start-time) 30)
+			   (debug:print 0 *default-log-port* "WARNING: over 30 seconds and not yet in runnning mode. Exiting.")
+			   #f)
+			  (else #t))))
 	(if ok
 	    (tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access
 	    (begin
 	      (debug:print 0 *default-log-port* "Exiting immediately")
-	      (cleanup)
+	      (tt:shutdown-server ttdat)
 	      (exit)))
 
 	(let* ((last-update (dbr:dbstruct-last-update dbstruct))
 	       (curr-secs   (current-seconds)))
 	  (if (and (eq? (tt-state ttdat) 'running)
-		   (> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db?
-	      (begin
-		(set! (file-modification-time (tt-servinf-file ttdat)) (current-seconds))
+		   (> (- curr-secs last-update) 5)) ;; every 5 seconds update the db?
+	      (let* ((sinfo-file (tt-servinf-file ttdat)))
+		;; (debug:print 0 *default-log-port* "INFO: touching "sinfo-file)
+		(set! (file-modification-time sinfo-file) (current-seconds))
 		((dbr:dbstruct-sync-proc dbstruct) last-update)
 		(dbr:dbstruct-last-update-set! dbstruct curr-secs))))
-	  
+	
 	(if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param))
 	    (begin
 	      (thread-sleep! 5)
 	      (loop)))))
-    (cleanup)
+    (tt:shutdown-server ttdat)
     (debug:print 0 *default-log-port* "INFO: Server timed out, exiting from tt:keep-running.")))
 
-  
-;; ;; given an already set up uconn start the cmd-loop
-;; ;;
-;; (define (tt:cmd-loop ttdat)
-;;   (let* ((serv-listener (-socket uconn))
-;; 	 (listener      (lambda ()
-;; 			  (let loop ((state 'start))
-;; 			    (let-values (((inp oup)(tcp-accept serv-listener)))
-;; 			      ;; (mutex-lock! *send-mutex*) ;; DOESN'T SEEM TO HELP
-;; 			      (let* ((rdat  (deserialize inp)) ;; '(my-host-port qrykey cmd params)
-;; 				     (resp  (ulex-handler uconn rdat)))
-;; 				(serialize resp oup)
-;; 				(close-input-port inp)
-;; 				(close-output-port oup)
-;; 				;; (mutex-unlock! *send-mutex*) ;; DOESN'T SEEM TO HELP
-;; 				)
-;; 			      (loop state))))))
-;;     ;; start N of them
-;;     (let loop ((thnum   0)
-;; 	       (threads '()))
-;;       (if (< thnum 100)
-;; 	  (let* ((th (make-thread listener (conc "listener" thnum))))
-;; 	    (thread-start! th)
-;; 	    (loop (+ thnum 1)
-;; 		  (cons th threads)))
-;; 	  (map thread-join! threads)))))
-;; 
-;; 
-;; 
-;; (define (wait-and-close uconn)
-;;   (thread-join! (udat-cmd-thread uconn))
-;;   (tcp-close (udat-socket uconn)))
-;; 
-;; 
 
 (define (tt:shutdown-server ttdat)
-  (let* ((cleanproc (tt-cleanup-proc ttdat))
-	 (port      (tt-port         ttdat)))
+  (let* ((host (tt-host ttdat))
+	 (port (tt-port ttdat))
+	 (sinf (tt-servinf-file ttdat)))
     (tt-state-set! ttdat 'shutdown)
     (portlogger:open-run-close portlogger:set-port port "released")
-    (if cleanproc (cleanproc))
-    (tcp-close (tt-socket ttdat)) ;; close up ports here
+    (if (file-exists? sinf)
+	(delete-file* sinf))
     ))
 
-;; (define (wait-and-close uconn)
-;;   (thread-join! (tt-cmd-thread uconn))
-;;   (tcp-close (tt-socket uconn)))
-
 ;; return servid
 ;; side-effects:
 ;;   ttdat-cleanup-proc is populated with function to remove the serverinfo file
 (define (tt:create-server-registration-file ttdat dbfname)
   (let* ((areapath (tt-areapath ttdat))
 	 (servdir  (tt:get-servinfo-dir areapath))
 	 (host     (tt-host ttdat))
 	 (port     (tt-port ttdat))
 	 (servinf (conc servdir"/"host":"port"-"(current-process-id)":"dbfname))
-	 (serv-id (tt:mk-signature areapath))
-	 (clean-proc (lambda ()
-		       (delete-file* servinf)
-		       )))
+	 (serv-id (tt:mk-signature areapath)))
     (assert (and host port) "FATAL: tt:create-server-registration-file called with no conn, dbfname="dbfname)
-    (tt-cleanup-proc-set! ttdat clean-proc)
     (tt-servinf-file-set! ttdat servinf)
     (with-output-to-file servinf
       (lambda ()
 	(print "SERVER STARTED: "host":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)" dbfname: "dbfname)))
       serv-id))
@@ -704,12 +678,28 @@
 ;; if more than one, wait one second and look again
 ;; future: ping oldest, if alive remove other :<dbfname> files
 ;;
 (define (tt:find-server areapath dbfname)
   (let* ((servdir  (tt:get-servinfo-dir areapath))
-	 (sfiles   (glob (conc servdir"/*:"dbfname))))
-    sfiles))
+	 (sfiles   (glob (conc servdir"/*:"dbfname)))
+	 (goodfiles '()))
+
+    ;; filter the files here by looking in processes table (if we are not main.db)
+    ;; and or look at the time stamp on the servinfo file, a running server will
+    ;; touch the file every minute (again, this will only apply for main.db)
+    (for-each (lambda (fname)
+		(let* ((age (- (current-seconds)(file-modification-time fname))))
+		  (if (> age 200) ;; can't trust it if over 200 seconds old
+		      (begin
+			(debug:print 0 *default-log-port* "WARNING: removing stale servinfo file "fname", it is "age" seconds old")
+			(handle-exceptions
+			 exn
+			 (debug:print 0 *default-log-port* "WARNING: error attempting to remove stale servinfo file "fname)
+			 (delete-file fname))) ;; 
+		      (set! goodfiles (cons fname goodfiles)))))
+	      sfiles)
+    goodfiles))
 
 ;; given a path to a server info file return: host port startseconds server-id pid dbfname logf
 ;; example of what it's looking for in the log file:
 ;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
 ;;
@@ -719,12 +709,13 @@
         (dbprep-found 0)
 	(bad-dat      (list #f #f #f #f #f #f logf)))
      (let ((fdat     (handle-exceptions
 			 exn
 		       (begin
-			 ;; WARNING: this is potentially dangerous to blanket ignore the errors
-			 (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn="(condition->list exn))
+			 ;; BUG, TODO: add err checking, for now blanket ignore the errors?
+			 (debug:print-info 0 *default-log-port* "Unable to get server info from "logf
+					   ", exn="(condition->list exn))
 			 '()) ;; no idea what went wrong, call it a bad server, return empty list
 		       (with-input-from-file logf read-lines))))
        (if (null? fdat) ;; bad data, return bad-dat
 	   bad-dat
 	   (let loop ((inl  (car fdat))
@@ -750,10 +741,17 @@
 			    logf))
 		     (else
 		      (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst)
 		      bad-dat)))))))))
 
+(define *last-server-start* (make-hash-table))
+
+(define (tt:too-recent-server-start dbfname)
+  (let* ((last-run-time (hash-table-ref/default *last-server-start* dbfname #f)))
+    (and last-run-time
+	 (< (- (current-seconds) last-run-time) 5))))
+    
 ;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
 ;; if the target-host is set 
 ;; try running on that host
 ;;   incidental: rotate logs in logs/ dir.
 ;;
@@ -760,51 +758,63 @@
 (define  (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area
   (assert areapath  "FATAL: tt:server-process-run called without areapath defined.")
   (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.")
   (assert mtexe     "FATAL: tt:server-process-run called without mtexe defined.")
   ;; mtest -server - -m testsuite:ext-tests -db 6.db
-  (let* ((dbfname  (dbmod:run-id->dbfname run-id))
-	 (load     (get-normalized-cpu-load))
-	 (trying   (length (tt:find-server areapath dbfname)))
-	 (nrun     (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname))))
-    (cond
-     ((> load 2.0)
-      (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server.")
-      (thread-sleep! 1))
-     ((> nrun 100)
-      (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.")
-      (thread-sleep! 1))
-     ((> trying 4)
-      (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
-      (thread-sleep! 1))
-     (else
-      (if (not (file-exists? (conc areapath"/logs")))
-	      (create-directory (conc areapath"/logs") #t))
-	  (let* ((logfile   (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))
-		 (cmdln     (conc
-			     mtexe
-			     " -startdir "areapath
-			     " -server - ";; (or target-host "-")
-			     " -m testsuite:"testsuite
-			     " -db "dbfname ;; (dbmod:run-id->dbfname run-id)
-			     " " profile-mode
-			     (conc " >> " logfile " 2>&1 &"))))
-	    ;; we want the remote server to start in *toppath* so push there
-	    ;; (push-directory areapath) ;; use cd in the command line instead
-	    (debug:print 0 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath)
-	    ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
-
-	    (system cmdln)
-	    ;; ;; use below to go back to nbfake - nbfake does cause trouble ...
-	    ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
-	    ;; (setenv "NBFAKE_LOG" logfile)
-	    ;; (system (conc "cd "areapath" ; nbfake " cmdln))
-	    ;; (unsetenv "NBFAKE_QUIET")
-	    ;; (unsetenv "NBFAKE_LOG")
-	    
-	    ;;(pop-directory)
-	    )))))
+  (let* ((dbfname  (dbmod:run-id->dbfname run-id)))
+    (if (tt:too-recent-server-start dbfname)
+	#f
+	(let* ((load     (get-normalized-cpu-load))
+	       (srvrs    (tt:find-server areapath dbfname))
+	       (trying   (length srvrs))
+	       (nrun     (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname))))
+	  (cond
+	   ((> load 2.0)
+	    (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes")
+	    (thread-sleep! 1)
+	    #f)
+	   ((> nrun 100)
+	    (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.")
+	    (thread-sleep! 1)
+	    #f)
+	   ((> trying 2)
+	    (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
+	    (thread-sleep! 1)
+	    #f)
+	   (else
+	    (if (not (file-exists? (conc areapath"/logs")))
+		(create-directory (conc areapath"/logs") #t))
+	    (let* ((logfile   (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))
+		   (cmdln (conc
+			       mtexe
+			       " -startdir "areapath
+			       " -server - ";; (or target-host "-")
+			       " -m testsuite:"testsuite
+			       " -db "dbfname ;; (dbmod:run-id->dbfname run-id)
+			       " " profile-mode
+			       #;(conc " >> " logfile " 2>&1 &"))))
+	      ;; we want the remote server to start in *toppath* so push there
+	      ;; (push-directory areapath) ;; use cd in the command line instead
+	      (debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath)
+	      ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
+
+	      (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
+	      (setenv "NBFAKE_LOG" logfile)
+	      (system (conc "cd "areapath" ; nbfake " cmdln))
+	      (unsetenv "NBFAKE_QUIET")
+	      (unsetenv "NBFAKE_LOG")
+	      ;; (system cmdln)
+	      (hash-table-set! *last-server-start* dbfname (current-seconds))
+	      ;; ;; use below to go back to nbfake - nbfake does cause trouble ...
+	      ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
+	      ;; (setenv "NBFAKE_LOG" logfile)
+	      ;; (system (conc "cd "areapath" ; nbfake " cmdln))
+	      ;; (unsetenv "NBFAKE_QUIET")
+	      ;; (unsetenv "NBFAKE_LOG")
+	      
+	      ;;(pop-directory)
+	      #t)))))))
 
 ;;======================================================================
 ;; tcp connection stuff
 ;;======================================================================
 

Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -1964,11 +1964,12 @@
 ;; test steps
 ;;======================================================================
 
 ;; teststep-set-status! used to be here
 
-(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)
+;; NOT NEEDED
+#;(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)
   (let* ((testdat   (rmt:get-test-state-status-by-id run-id test-id)))
     (and testdat
 	 (equal? (car testdat) "KILLREQ"))))
 
 (define (test:tdb-get-rundat-count tdb)

Index: utils/mt_xterm
==================================================================
--- utils/mt_xterm
+++ utils/mt_xterm
@@ -20,18 +20,16 @@
 MT_TMPDISPLAY=$DISPLAY
 MT_TMPUSER=$USER
 MT_HOME=$HOME
 
 tmpfile=`mktemp`
-
-grep -v "export USER=" megatest.sh | grep -v "export HOME=" > $tmpfile
-source $tmpfile
-rm $tmpfile
-
-# if [ -e megatest.sh ];then
-#source megatest.sh
-#fi
+if [[ -e megatest.sh ]]; then
+  grep -v "export USER=" megatest.sh | grep -v "export HOME=" > $tmpfile
+  source $tmpfile
+  rm $tmpfile
+fi
+
 export DISPLAY=$MT_TMPDISPLAY
 export USER=$USER
 export HOME=$MT_HOME
 
 if [ x"$MT_XTERM_CMD" == "x" ];then