Index: TODO
==================================================================
--- TODO
+++ TODO
@@ -16,10 +16,18 @@
 #     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 TODO
 ====
 
+23WW45 - the revolution branch
+. Add "fast" db start option (no handshaking over NFS)
+. Add server-ro to server types (just "server" is fine for read/write).
+. Create pause-server and resume-server calls
+. Create rsync or cp sync to MTRAH function
+. Change rmt:send-receive to divert calls to read-only server when possible
+. Change start server to call main.db server for 1..N.db servers, block until server is read for use.
+
 23WW21
 . Dashboard needs its own cache db in /tmp
 
 23WW07
 . Remove use of *dbstruct-dbs*

Index: archive.scm
==================================================================
--- archive.scm
+++ archive.scm
@@ -359,11 +359,11 @@
 	(archive-dir  (if archive-info (cdr archive-info) #f))
 	(archive-id   (if archive-info (car archive-info) -1))
         (home-host    (server:choose-server *toppath* 'homehost))
         (archive-time (seconds->std-time-str (current-seconds)))
         (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time))
-        (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db"))
+        (tmp-db-path (conc (dbfile:make-tmpdir-name *toppath* "") "/megatest.db"))
         (dbfile             (conc  archive-staging-db "/megatest.db"))) 
         (create-directory archive-staging-db #t)
         (let-values (((pid-val exit-status exit-code) (run-n-wait rsync-exe params: (list "-v" (conc (car home-host) ":"tmp-db-path) archive-staging-db) print-cmd: print-prefix)))
             (if (eq? exit-code 0)   
               (case archiver

Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -21,10 +21,11 @@
 (declare (unit common))
 (declare (uses commonmod))
 (declare (uses rmtmod))
 (declare (uses debugprint))
 (declare (uses mtargs))
+        
 
 (use srfi-1 data-structures posix regex-case (prefix base64 base64:)
      format dot-locking csv-xml z3 udp ;; sql-de-lite
      hostinfo md5 message-digest typed-records directory-utils stack
      matchable regex posix (srfi 18) extras ;; tcp 
@@ -247,11 +248,11 @@
 (define *common:this-exe-fullpath* (common:get-this-exe-fullpath))
 (define *common:this-exe-dir*      (pathname-directory *common:this-exe-fullpath*))
 (define *common:this-exe-name*     (pathname-strip-directory *common:this-exe-fullpath*))
 
 (define (common:get-sync-lock-filepath)
-  (let* ((tmp-area     (common:get-db-tmp-area))
+  (let* ((tmp-area     (common:make-tmpdir-name *toppath* ""))
          (lockfile     (conc tmp-area "/megatest.db.lock")))
     lockfile))
 
 (define *common:logpro-exit-code->status-sym-alist*
   '( ( 0 . pass )
@@ -1533,11 +1534,11 @@
 ;;
 (define (common:lazy-modification-time fpath)
   (handle-exceptions
       exn
     (begin
-      (debug:print 0 *default-log-port* "Failed to get modifcation time for " fpath ", treating it as zero. exn=" exn)
+      (debug:print 2 *default-log-port* "Failed to get modification time for " fpath ", treating it as zero. exn=" exn)
       0)
     (if (file-exists? fpath)
 	(file-modification-time fpath)
 	0)))
 
@@ -2280,11 +2281,11 @@
 (define (common:check-db-dir-space)
   (let* ((required (string->number 
                     ;; default is 1GB (or actually a billion bytes) This is the number of 1 kB blocks.
 		    (or (configf:lookup *configdat* "setup" "dbdir-space-required")
 			"1000000")))
-	 (dbdir    (common:get-db-tmp-area)) ;; (db:get-dbdir))
+	 (dbdir    (common:make-tmpdir-name *toppath* "")) ;; (db:get-dbdir))
 	 (tdbspace (common:check-space-in-dir dbdir required))
 	 (mdbspace (common:check-space-in-dir *toppath* required)))
     (sort (list tdbspace mdbspace) (lambda (a b)
 				     (< (cadr a)(cadr b))))))
 

Index: commonmod.scm
==================================================================
--- commonmod.scm
+++ commonmod.scm
@@ -160,10 +160,17 @@
 	'()))) ;; should it return empty list or #f to indicate not set?
 
 
 (define (get-section cfgdat section)
   (hash-table-ref/default cfgdat section '()))
+
+(define (common:make-tmpdir-name areapath tmpadj)
+  (let* ((area (pathname-file areapath))
+         (dname (conc "/tmp/"(current-user-name)"/megatest_localdb/" area "/" (string-translate areapath "/" ".") tmpadj "/.mtdb")))
+    (unless (directory-exists? dname)
+      (create-directory dname #t))
+    dname))
 
 ;; dot-locking egg seems not to work, using this for now
 ;; if lock is older than expire-time then remove it and try again
 ;; to get the lock
 ;;

Index: dashboard-tests.scm
==================================================================
--- dashboard-tests.scm
+++ dashboard-tests.scm
@@ -463,11 +463,11 @@
 
 ;;======================================================================
 ;;
 ;;======================================================================
 (define (dashboard-tests:examine-test run-id test-id) ;; run-id run-key origtest)
-  (let* ((db-path       (db:dbfile-path)) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/db/" run-id ".db"))
+  (let* ((db-path       (common:make-tmpdir-name *toppath* "")) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/db/" run-id ".db"))
 	 (dbstruct      #f) ;; NOT USED
 	 (testdat        (rmt:get-test-info-by-id run-id test-id)) ;; (db:get-test-info-by-id dbstruct run-id test-id))
 	 (db-mod-time   0) ;; (file-modification-time db-path))
 	 (last-update   0) ;; (current-seconds))
 	 (request-update #t))

Index: dashboard.scm
==================================================================
--- dashboard.scm
+++ dashboard.scm
@@ -402,12 +402,12 @@
     (dboard:setup-tabdat dat)
     (dboard:setup-num-rows dat)
     dat))
 
 (define (dboard:setup-tabdat tabdat)
-  (dboard:tabdat-dbdir-set! tabdat (db:dbfile-path)) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/.db"))
-  (dboard:tabdat-dbfpath-set! tabdat (db:dbfile-path))
+  (dboard:tabdat-dbdir-set! tabdat (common:make-tmpdir-name *toppath* "")) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/.db"))
+  (dboard:tabdat-dbfpath-set! tabdat (common:make-tmpdir-name *toppath* ""))
   (dboard:tabdat-monitor-db-path-set! tabdat (conc (dboard:tabdat-dbdir tabdat) "/monitor.db"))
 
 
   ;; HACK ALERT: this is a hack, please fix.
   (dboard:tabdat-ro-set! tabdat (not (file-read-access? (dboard:tabdat-dbfpath tabdat))))
@@ -928,16 +928,16 @@
 		  (hash-table-set!    (dboard:tabdat-allruns-by-id tabdat) run-id run-struct))
 	      (if (or (null? tal)
 		      (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update
 		  (begin
 		    (when (> elapsed-time 2)   
-                      (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")
+                      (debug:print 2 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")
                       (let* ((old-val (iup:attribute *tim* "TIME"))
                              (new-val (number->string (inexact->exact (floor (* 2  (string->number old-val)))))))
                         (if (< (string->number new-val) 5000)
                             (begin
-			      (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
+			      (debug:print 2 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
 			      (iup:attribute-set! *tim* "TIME" new-val)))))
 		    (dboard:tabdat-allruns-set! tabdat new-res)
 		    maxtests)
 		  (if (> (dboard:rundat-run-data-offset run-struct) 0)
 		      (loop run tal new-res newmaxtests) ;; not done getting data for this run
@@ -3112,11 +3112,11 @@
      (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: "
 		  ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir ", exn=" exn)
      (current-seconds)) ;; something went wrong - just print an error and return current-seconds
    (common:max (map (lambda (filen)
 		      (file-modification-time filen))
-		    (glob (conc dbdir "/*.db*"))))))
+		    (cons (conc dbdir "/main.db") (glob (conc dbdir "/?.db")))))))
 
 (define (dashboard:monitor-changed? commondat tabdat)
   (let* ((run-update-time (current-seconds))
 	 (monitor-db-path (dboard:tabdat-monitor-db-path tabdat))
 	 (monitor-modtime (if (and monitor-db-path (common:file-exists? monitor-db-path))

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -133,11 +133,11 @@
 	   default)))
    (apply sqlite3:first-result db stmt params)))
 
 (define (db:setup do-sync)
   (assert *toppath* "FATAL: db:setup called before launch:setup has been run.")
-  (let* ((tmpdir (common:get-db-tmp-area)))
+  (let* ((tmpdir (common:make-tmpdir-name *toppath* "")))
     (if (not *dbstruct-dbs*)
 	(dbfile:setup do-sync *toppath* tmpdir)
 	*dbstruct-dbs*)))
 
 ;; moved from dbfile
@@ -267,17 +267,10 @@
 		     ", arguments: " ((condition-property-accessor 'exn 'arguments) exn)
 		     ", location: "  ((condition-property-accessor 'exn 'location)  exn)
 		     ))
 
 
-;; NB// #f => return dbdir only
-;;      (was planned to be;  zeroth db with name=main.db)
-;; 
-;; If run-id is #f return to create and retrieve the path where the db will live.
-;;
-(define db:dbfile-path common:get-db-tmp-area)
-
 (define (db:set-sync db)
   (let ((syncprag (configf:lookup *configdat* "setup" "sychronous")))
     (sqlite3:execute db (conc "PRAGMA synchronous = " (or syncprag 0) ";")))) 
 
 
@@ -467,11 +460,11 @@
 	 (get-mtime shm-file))))
 	 
 ;; (define (db:all-db-sync dbstruct)
 ;;   (let* ((dbdat (db:open-db dbstruct #f db:initialize-main-db))
 ;; 	 (data-synced       0) ;; count of changed records
-;;     (tmp-area       (common:get-db-tmp-area))
+;;     (tmp-area       (common:make-tmpdir-name *toppath*))
 ;;     (dbfiles        (glob (conc tmp-area"/.mtdb/*.db")))
 ;;     (sync-durations (make-hash-table))
 ;;     (no-sync-db        (db:open-no-sync-db)))
 ;;     (for-each
 ;;      (lambda (file) ;; tmp db file
@@ -556,11 +549,11 @@
 ;;  run-ids: '(1 2 3 ...) or #f (for all)
 ;;
 (define (db:multi-db-sync dbstruct . options)
   (let* (;; (dbdat       (db:open-db dbstruct #f dbfile:db-init-proc))
 	 (data-synced 0) ;; count of changed records
-	 (tmp-area       (common:get-db-tmp-area))
+	 (tmp-area       (common:make-tmpdir-name *toppath* ""))
 	 (old2new (member 'old2new options))
 	 (dejunk (member 'dejunk options))
 	 (killservers (member 'killservers options))
 	 (src-area (if old2new *toppath* tmp-area))
 	 (dest-area (if old2new tmp-area *toppath*))
@@ -1254,11 +1247,11 @@
 ;; no-sync.db - small bits of data to be shared between servers
 ;;======================================================================
 
 (define (db:get-dbsync-path)
   (case (rmt:transport-mode)
-    ((http)(common:get-db-tmp-area))
+    ((http)(common:make-tmpdir-name *toppath* ""))
     ((tcp) (conc *toppath*"/.mtdb"))
     ((nfs) (conc *toppath*"/.mtdb"))
     (else "/tmp/dunno-this-gonna-exist")))
 
 ;; This is needed for api.scm
@@ -1580,11 +1573,11 @@
 ;; TODO: Switch this to use max(update_time) from each run db? Then if using a server there is no disk traffic (using cachedb db) ???
 ;;
 ;; NOTE: This DOESN'T (necessarily) get the real run ids, but the number of the <number>.db!!
 
 (define (db:get-changed-run-ids since-time)
-  (let* ((dbdir      (db:dbfile-path)) ;; (configf:lookup *configdat* "setup" "dbdir"))
+  (let* ((dbdir      (common:make-tmpdir-name *toppath* "")) ;; (configf:lookup *configdat* "setup" "dbdir"))
 	 (alldbs     (glob (conc *toppath* "/.mtdb/[0-9]*.db*")))
 	 (changed    (filter (lambda (dbfile)
 			       (> (file-modification-time dbfile) since-time))
 			     alldbs)))
     (delete-duplicates
@@ -4314,11 +4307,11 @@
         ))))
 
 ;; sync for filesystem local db writes
 ;;
 (define (db:run-lock-and-sync no-sync-db)
-  (let* ((tmp-area       (common:get-db-tmp-area))
+  (let* ((tmp-area       (common:make-tmpdir-name *toppath* ""))
 	 (dbfiles        (glob (conc tmp-area"/.mtdb/*.db")))
 	 (sync-durations (make-hash-table)))
     ;; (debug:print-info 0 *default-log-port* "lock-and-sync, dbfiles: "dbfiles)
     (for-each
      (lambda (file)
@@ -4370,11 +4363,11 @@
 	(sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300))
  	(debug-mode         (debug:debug-mode 1))
  	(last-time          (current-seconds))     ;; last time through the sync loop
  	(no-sync-db         (db:open-no-sync-db))
  	(sync-duration      0)  ;; run time of the sync in milliseconds
-	(tmp-area           (common:get-db-tmp-area)))
+	(tmp-area           (common:make-tmpdir-name *toppath* "")))
     ;; Sync moved to http-transport keep-running loop
     (debug:print-info 2 *default-log-port* "Periodic copy-based sync thread started. syncer is copy-sync, tmp-area is " tmp-area)
     (debug:print-info 3 *default-log-port* "watchdog starting. syncer is copy-sync pid="(current-process-id));;  " this-wd-num="this-wd-num)
     
     (if (and legacy-sync (not *time-to-exit*))
@@ -4478,11 +4471,11 @@
  		       (for-each
  			(lambda (subdb)
  			  (let* (;;(dbstruct (db:setup))
  				 (mtdb       (dbr:subdb-mtdb subdb))
  				 (mtpath     (db:dbdat-get-path mtdb))
- 				 (tmp-area   (common:get-db-tmp-area))
+ 				 (tmp-area   (common:make-tmpdir-name *toppath* ""))
  				 (res        (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive
  			    (set! sync-duration (- (current-milliseconds) sync-start))
  			    (if (> res 0) ;; some records were transferred, keep the db alive
  				(begin
  				  (mutex-lock! *heartbeat-mutex*)

Index: dbfile.scm
==================================================================
--- dbfile.scm
+++ dbfile.scm
@@ -242,11 +242,12 @@
           #f
   )
 )
 
 (define (dbfile:make-tmpdir-name areapath tmpadj)
-  (let* ((dname (conc "/tmp/"(current-user-name)"/" (string-translate areapath "/" ".") tmpadj)))
+  (let* ((area (pathname-file areapath))
+         (dname (conc "/tmp/"(current-user-name)"/megatest_localdb/" area "/" (string-translate areapath "/" ".") tmpadj "/.mtdb")))
     (unless (directory-exists? dname)
       (create-directory dname #t))
     dname))
 
 (define (dbfile:run-id->path apath run-id)
@@ -487,11 +488,11 @@
 ;; NOTE: this is already protected by mutex *no-sync-db-mutex*
 ;;
 (define (dbfile:raw-open-no-sync-db dbpath)
   (if (not (file-exists? dbpath))
       (create-directory dbpath #t))
-  (debug:print-info 0 *default-log-port* "Opening "dbpath"/no-sync.db")
+  (debug:print-info 2 *default-log-port* "(dbfile:raw-open-no-sync-db: Opening "dbpath"/no-sync.db")
   (let* ((dbname    (conc dbpath "/no-sync.db"))
 	 (db-exists (file-exists? dbname))
 	 (init-proc (lambda (db)
 		      (sqlite3:with-transaction
 		       db
@@ -675,20 +676,26 @@
 ;;    fails    returns  (#f lock-creation-time identifier)
 ;;    succeeds (returns (#t lock-creation-time identifier)
 ;; use (db:no-sync-del! db keyname) to release the lock
 ;;
 (define (db:no-sync-get-lock-with-id db keyname identifier)
+  (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: db: " db " keyname: " keyname " identifier: " identifier)
   (sqlite3:with-transaction
    db
    (lambda ()
      (condition-case
       (let* ((curr-val (db:no-sync-get/default db keyname #f)))
+        (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: curr-val: " curr-val)
 	(if curr-val
 	    (match (db:extract-time-identifier curr-val) ;; result->timestamp, identifier
 	       ((timestamp . ident)
 		(cons (equal? ident identifier) timestamp))
-	       (else (cons #f 'malformed-lock)))  ;; lock malformed
+	       (else 
+                (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: malformed lock")
+                (cons #f 'malformed-lock)
+                )
+            )  ;; lock malformed
 	    (let ((curr-sec (current-seconds))
 		  (lock-value (if identifier
 				  (conc (current-seconds)"+"identifier)
 				  (current-seconds))))
 	      (sqlite3:execute db "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-value)

Index: dbmod.scm
==================================================================
--- dbmod.scm
+++ dbmod.scm
@@ -198,11 +198,11 @@
   (let* ((dbstruct     (or dbstruct-in (make-dbr:dbstruct areapath: areapath)))
 	 (dbfname      (or dbfname-in (dbmod:run-id->dbfname run-id)))
 	 (dbpath       (dbmod:get-dbdir dbstruct))             ;; directory where all the .db files are kept
 	 (dbfullname   (conc dbpath"/"dbfname)) ;; (dbmod:run-id->full-dbfname dbstruct run-id))
 	 (dbexists     (file-exists? dbfullname))
-	 (tmpdir       (dbfile:make-tmpdir-name areapath tmpadj))
+	 (tmpdir       (common:make-tmpdir-name areapath tmpadj))
 	 (tmpdb        (let* ((fname (conc tmpdir"/"dbfname)))
 			 fname))
 	 (cachedb        (dbmod:open-cachedb-db init-proc
 					    ;; (if (eq? (dbfile:cache-method) 'cachedb)
 					    ;; 	#f

Index: megatest-version.scm
==================================================================
--- megatest-version.scm
+++ megatest-version.scm
@@ -18,6 +18,6 @@
 ;; Always use two or four digit decimal
 ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..
 
 ;; (declare (unit megatest-version))
 
-(define megatest-version 1.8017)
+(define megatest-version 1.8018)

Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -968,11 +968,11 @@
 	   (tl         (launch:setup))
 	   (keys       (keys:config-get-fields *configdat*)))
       (case (rmt:transport-mode)
 	((tcp)
 	 (let* ((timeout    (server:expiration-timeout)))
-	   (debug:print 0 *default-log-port* "INFO: Running using tcp method with server timeout of "timeout)
+	   (debug:print 0 *default-log-port* "INFO: Starting server for " dbfname " using tcp method with server timeout of "timeout)
 	   (tt-server-timeout-param timeout)
 	   (if dbfname
 	       (tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys)
 	       (begin
 		 (debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -db is required.")
@@ -1091,10 +1091,14 @@
               sfiles
             )
           )
        )
        dbfiles
+     )
+     ;; remove this db, because otherwise metadata contains records for old servers, and this causes a problem with db:no-sync-get-lock-with-id.
+     (if (file-exists? (conc *toppath* "/.mtdb/no-sync.db"))
+       (delete-file (conc *toppath* "/.mtdb/no-sync.db"))
      )
      (set! *didsomething* #t)
      (exit)  
   )
 )
@@ -2132,13 +2136,13 @@
              (exit 1)))
          (if (common:file-exists? (conc  *toppath* "/megatest.db"))
              (begin  
                (debug:print-info 1 *default-log-port* "File " (conc  *toppath* "/megatest.db") " already exists. Please remove it before trying to replicate db")
                (exit 1)))
-         (if (and (common:get-db-tmp-area) (> (length (directory   (common:get-db-tmp-area) #f)) 0))
+         (if (and (common:make-tmpdir-name *toppath* "") (> (length (directory   (common:make-tmpdir-name *toppath* "") #f)) 0))
            (begin
-           (debug:print-info 1 *default-log-port* (common:get-db-tmp-area) " not empty. Please remove it before trying to replicate db")
+           (debug:print-info 1 *default-log-port* (common:make-tmpdir-name *toppath* "") " not empty. Please remove it before trying to replicate db")
            (exit 1)))    
           ;; check if timestamp 
           (let* ((source (args:get-arg "-source"))
                 (src     (if (not (equal? (substring source 0 1) "/"))
                              (conc (current-directory) "/" source)

Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -167,11 +167,11 @@
     (mutex-unlock! *db-stats-mutex*)
     res))
 
 (define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5))
   (let* ((qry-is-write    (not (member cmd api:read-only-queries)))
-	 (db-file-path    (db:dbfile-path)) ;;  0))
+	 (db-file-path    (common:make-tmpdir-name *toppath* "")) ;;  0))
 	 (dbstructs-local (db:setup #t))
 	 (read-only       (not (file-write-access? db-file-path)))
 	 (start           (current-milliseconds))
 	 (resdat          (if (not (and read-only qry-is-write))
 			      (let ((v (api:execute-requests dbstructs-local (vector (symbol->string cmd) params))))

Index: tasks.scm
==================================================================
--- tasks.scm
+++ tasks.scm
@@ -84,11 +84,11 @@
 	     (tasks:open-db numretries (- numretries 1)))
 	   (begin
 	     (print-call-chain (current-error-port))
 	     (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
 	     (debug:print 5 *default-log-port* " exn=" (condition->list exn))))
-       (let* ((dbpath        (db:dbfile-path )) ;; (tasks:get-task-db-path))
+       (let* ((dbpath        (common:make-tmpdir-name *toppath* "")) ;; (tasks:get-task-db-path))
 	      (dbfile       (conc dbpath "/monitor.db"))
 	      (avail        (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away
 	      (exists       (common:file-exists? dbpath))
 	      (write-access (file-write-access? dbpath))
 	      (mdb          (cond ;; what the hek is *toppath* doing here?

Index: tcp-transportmod.scm
==================================================================
--- tcp-transportmod.scm
+++ tcp-transportmod.scm
@@ -132,26 +132,31 @@
 ;; do all the busy work of finding and setting up conn for
 ;; connecting to a server
 ;; 
 (define (tt:client-connect-to-server ttdat dbfname run-id testsuite)
   (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id)
+  (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id)
   (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f))
 	 (server-start-proc (lambda ()
+			      (print "dbfname: "dbfname)
+			      (assert (equal? dbfname "main.db") ;; only main.db is started here
+				      "FATAL: called server-start-proc for db other than main.db")
 			      (tt:server-process-run
 			       (tt-areapath ttdat)
 			       testsuite ;; (dbfile:testsuite-name)
 			       (common:find-local-megatest)
-			       run-id))))
+			       run-id))
+			    ))
     (if conn
 	(begin 
-          ; (debug:print-info 0 *default-log-port* "already connected to the server")
+          (debug:print-info 2 *default-log-port* "already connected to a server")
            conn) ;; we are already connected to the server
 	(let* ((sdat (tt:get-current-server-info ttdat dbfname)))
 	  (match sdat
 	    ((host port start-time server-id pid dbfname2 servinffile)
 	     (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.")
-             ;(debug:print-info 0 *default-log-port* "in match servinffile:" servinffile)
+             (debug:print-info 2 *default-log-port* "no conn - in match servinffile:" servinffile)
 	     (let* ((host-port (conc host":"port))
 		    (conn (make-tt-conn
 			   host: host
 			   port: port
 			   host-port: host-port
@@ -162,34 +167,40 @@
 			   pid: pid)))
 	       ;; verify we can talk to this server
 	       (let* ((result   (tt:timed-ping host port server-id))
 		      (ping-res (car result))
 		      (ping     (cdr result)))
-                 (debug:print-info 0 *default-log-port* "ping time: " ping)
+                 (debug:print-info 2 *default-log-port* "host " host " port " port " ping time: " ping " result " ping-res)
 		 (case ping-res
 		   ((running)
+                    (debug:print-info 2 *default-log-port* "Setting conn = " conn " in hash table")
 		    (hash-table-set! (tt-conns ttdat) dbfname conn) ;;; is this ok to save before validating that the connection is good?
 		    conn)
 		   ((starting)
 		    (thread-sleep! 0.5)
+                    (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect")
 		    (tt:client-connect-to-server ttdat dbfname run-id testsuite))
 		   (else
 		    (let* ((curr-secs (current-seconds)))
 		      ;; rm the (last server) would go here
 		      (if (> (- curr-secs (tt-last-serv-start ttdat)) 10)
 			  (begin
 			    (tt-last-serv-start-set! ttdat curr-secs)
-			    (server-start-proc))) ;; start server if 30 sec since last attempt
+			    (server-start-proc))) ;; start server if 10 sec since last attempt
 		      (thread-sleep! 1)
+                      (debug:print-info 2 *default-log-port* "server ping result was neither running nor starting. Retrying connect")
 		      (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
 	    (else ;; no good server found, if haven't started server in > 5 secs, start another
-	     (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 5) ;; BUG - grow this number really do not want to swamp the machine with servers
+	     (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers
 		 (begin
-		   (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
+		   (debug:print-info 0 *default-log-port* "Starting server for "dbfname)
 		   (server-start-proc)
-		   (tt-last-serv-start-set! ttdat (current-seconds))))
+		   (tt-last-serv-start-set! ttdat (current-seconds))
+                   (thread-sleep! 3)
+                   ))
 	     (thread-sleep! 1)
+             (debug:print-info 0 *default-log-port* "Connect to server for " dbfname)
 	     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
 
 (define (tt:timed-ping host port server-id)
   (let* ((start-time (current-milliseconds))
 	 (result     (tt:ping host port server-id)))
@@ -223,11 +234,13 @@
 ;; client side handler
 ;;
 ;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
 ;;
 (define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)
+  (debug:print 2 *default-log-port* "tt:handler cmd: " cmd " run-id: " run-id " attemptnum: " attemptnum)
   ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.
+  ;; connect-to-server will start a server if needed.
   (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f)))
     (if conn
 	;; have connection, call the server
 	(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
 	  ;; res is (status errmsg result meta)
@@ -252,11 +265,11 @@
 		(tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn))
 		result) ;; (tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
 	       (else
 		result)))
 	    (else ;; did not receive properly formated result
-	     (if (not res) ;; tt:handler is telling us that communication failed
+	     (if (not res) ;; tt:send-receive telling us that communication failed
 		 (let* ((host    (tt-conn-host conn))
 			(port    (tt-conn-port conn))
 			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
 			(pid     (tt-conn-pid  conn))
                         ;;(servinf (tt-conn-servinf-file conn))) 
@@ -285,11 +298,11 @@
 				     ;; try again
 				     (thread-sleep! 0.25) ;; dunno, I think this needs to be here
 				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
 				   ))))
 		       (begin ;; no server file, delay and try again
-			 (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf)
+			 (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", no servinf file. Server exited? ")
 			 (thread-sleep! 0.5)
 			 (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))))
 		 (begin ;; this case is where res is malformed. Probably should abort
 		   (assert #f "FATAL: tt:handler received bad data "res)
 		   ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.")
@@ -464,14 +477,16 @@
 ;; Server viability is checked in keep-running. Blindly start and run here.
 ;;
 (define (tt:start-server areapath run-id dbfname-in handler keys)
   (assert areapath "FATAL: areapath not provided for tt:start-server")
   ;; is there already a server for this dbfile? Then exit.
+  (debug:print 2 *default-log-port* "tt:start-server: " dbfname-in)
   (let* ((ttdat   (make-tt areapath: areapath))
 	 (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id)))
 	 (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead
-    (if (> (length servers) 4)
+         (debug:print 0 *default-log-port* "Found " (length servers) " already running for " dbfname)
+    (if (> (length servers) 0)
 	(begin
 	  (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.")
 	  (exit))
 	(let* ((dbstruct   (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
 	  (tt-handler-set! ttdat (handler dbstruct))
@@ -528,12 +543,11 @@
 		    (dbfile:with-no-sync-db nosyncdbpath
 					    (lambda (db)
 					      (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct)))
 						(debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname)
 						(db:no-sync-del! db dbfname)
-						#;(if dbtmpname
-						    (delete-file dbtmpname))))))))
+                                                ))))))
     (set! *server-info* ttdat)
     (let loop ((count 0))
       (if (> count 240)
 	  (begin
 	    (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.")
@@ -576,10 +590,11 @@
 			  (if (and res (common:low-noise-print 120 "top server message"))
 			      (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for "
 						dbfname" on "(tt-host ttdat)":"(tt-port ttdat)))
 			  res))
 		       (else
+                        ;; wrong servinfo file
 			(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
 			(let* ((leadsrv (car servers)))
 			  (match leadsrv
 			    ((host port startseconds server-id pid dbfname servinfofile)
 			     (let* ((result  (tt:timed-ping host port server-id))
@@ -705,10 +720,15 @@
 ;; future: ping oldest, if alive remove other :<dbfname> files
 ;;
 (define (tt:find-server areapath dbfname)
   (let* ((servdir  (tt:get-servinfo-dir areapath))
 	 (sfiles   (glob (conc servdir"/*:"dbfname))))
+    
+    ;; filter the files here by looking in processes table (if we are not main.db)
+    ;; and or look at the time stamp on the servinfo file, a running server will
+    ;; touch the file every minute (again, this will only apply for main.db)
+    
     sfiles))
 
 ;; given a path to a server info file return: host port startseconds server-id pid dbfname logf
 ;; example of what it's looking for in the log file:
 ;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
@@ -719,12 +739,13 @@
         (dbprep-found 0)
 	(bad-dat      (list #f #f #f #f #f #f logf)))
      (let ((fdat     (handle-exceptions
 			 exn
 		       (begin
-			 ;; WARNING: this is potentially dangerous to blanket ignore the errors
-			 (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn="(condition->list exn))
+			 ;; BUG, TODO: add err checking, for now blanket ignore the errors?
+			 (debug:print-info 0 *default-log-port* "Unable to get server info from "logf
+					   ", exn="(condition->list exn))
 			 '()) ;; no idea what went wrong, call it a bad server, return empty list
 		       (with-input-from-file logf read-lines))))
        (if (null? fdat) ;; bad data, return bad-dat
 	   bad-dat
 	   (let loop ((inl  (car fdat))
@@ -762,22 +783,26 @@
   (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.")
   (assert mtexe     "FATAL: tt:server-process-run called without mtexe defined.")
   ;; mtest -server - -m testsuite:ext-tests -db 6.db
   (let* ((dbfname  (dbmod:run-id->dbfname run-id))
 	 (load     (get-normalized-cpu-load))
-	 (trying   (length (tt:find-server areapath dbfname)))
+	 (srvrs    (tt:find-server areapath dbfname))
+	 (trying   (length srvrs))
 	 (nrun     (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname))))
     (cond
      ((> load 2.0)
-      (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server.")
-      (thread-sleep! 1))
+      (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes")
+      (thread-sleep! 1)
+      #f)
      ((> nrun 100)
       (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.")
-      (thread-sleep! 1))
-     ((> trying 4)
+      (thread-sleep! 1)
+      #f)
+     ((> trying 2)
       (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
-      (thread-sleep! 1))
+      (thread-sleep! 1)
+      #f)
      (else
       (if (not (file-exists? (conc areapath"/logs")))
 	      (create-directory (conc areapath"/logs") #t))
 	  (let* ((logfile   (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))
 		 (cmdln     (conc
@@ -788,11 +813,11 @@
 			     " -db "dbfname ;; (dbmod:run-id->dbfname run-id)
 			     " " profile-mode
 			     (conc " >> " logfile " 2>&1 &"))))
 	    ;; we want the remote server to start in *toppath* so push there
 	    ;; (push-directory areapath) ;; use cd in the command line instead
-	    (debug:print 0 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath)
+	    (debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath)
 	    ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
 
 	    (system cmdln)
 	    ;; ;; use below to go back to nbfake - nbfake does cause trouble ...
 	    ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
@@ -800,11 +825,11 @@
 	    ;; (system (conc "cd "areapath" ; nbfake " cmdln))
 	    ;; (unsetenv "NBFAKE_QUIET")
 	    ;; (unsetenv "NBFAKE_LOG")
 	    
 	    ;;(pop-directory)
-	    )))))
+	    #t)))))
 
 ;;======================================================================
 ;; tcp connection stuff
 ;;======================================================================
 

Index: utils/mt_xterm
==================================================================
--- utils/mt_xterm
+++ utils/mt_xterm
@@ -20,18 +20,16 @@
 MT_TMPDISPLAY=$DISPLAY
 MT_TMPUSER=$USER
 MT_HOME=$HOME
 
 tmpfile=`mktemp`
-
-grep -v "export USER=" megatest.sh | grep -v "export HOME=" > $tmpfile
-source $tmpfile
-rm $tmpfile
-
-# if [ -e megatest.sh ];then
-#source megatest.sh
-#fi
+if [[ -e megatest.sh ]]; then
+  grep -v "export USER=" megatest.sh | grep -v "export HOME=" > $tmpfile
+  source $tmpfile
+  rm $tmpfile
+fi
+
 export DISPLAY=$MT_TMPDISPLAY
 export USER=$USER
 export HOME=$MT_HOME
 
 if [ x"$MT_XTERM_CMD" == "x" ];then