Index: client.scm
==================================================================
--- client.scm
+++ client.scm
@@ -180,18 +180,18 @@
 
                                   )))
 		(if (and start-res
 			 ping-res)
 		    (begin
-		      (hash-table-set! *runremote* run-id start-res)
+		      (set! *runremote* start-res) ;; (hash-table-set! *runremote* run-id start-res)
 		      (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res))
 		      start-res)
 		    (begin    ;; login failed but have a server record, clean out the record and try again
 		      (debug:print-info 0 *default-log-port* "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
 		      (case *transport-type* 
 			((http)(http-transport:close-connections run-id)))
-		      (hash-table-delete! *runremote* run-id)
+		      (set! *runremote* #f)  ;; (hash-table-delete! *runremote* run-id)
 		      (tasks:kill-server-run-id run-id)
 		      (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat)
 							   run-id 
 							   (tasks:hostinfo-get-interface server-dat)
 							   (tasks:hostinfo-get-port      server-dat)

Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -7,11 +7,11 @@
 ;;  This program is distributed WITHOUT ANY WARRANTY; without even the
 ;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 ;;  PURPOSE.
 ;;======================================================================
 
-(use srfi-1 posix regex-case base64 format dot-locking csv-xml z3 sql-de-lite hostinfo md5 message-digest typed-records)
+(use srfi-1 posix regex-case base64 format dot-locking csv-xml z3 sql-de-lite hostinfo md5 message-digest typed-records directory-utils)
 (require-extension regex posix)
 
 (require-extension (srfi 18) extras tcp rpc)
 
 (import (prefix sqlite3 sqlite3:))
@@ -90,11 +90,12 @@
 (define *db-stats*            (make-hash-table)) ;; hash of vectors < count duration-total >
 (define *db-stats-mutex*      (make-mutex))
 (define *db-sync-mutex*       (make-mutex))
 (define *db-multi-sync-mutex* (make-mutex))
 (define *db-local-sync*       (make-hash-table)) ;; used to record last touch of db
-(define *last-db-access*      (current-seconds))  ;; update when db is accessed via server
+(define *db-last-sync*        0)                 ;; last time the sync to megatest.db happened 
+(define *last-db-access*      (current-seconds)) ;; update when db is accessed via server
 (define *db-write-access*     #t)
 (define *task-db*             #f) ;; (vector db path-to-db)
 (define *db-access-allowed*   #t) ;; flag to allow access
 (define *db-access-mutex*     (make-mutex))
 (define *db-cache-path*       #f)
@@ -101,11 +102,11 @@
 
 ;; SERVER
 (define *my-client-signature* #f)
 (define *transport-type*    'http)
 (define *transport-type*    'http)             ;; override with [server] transport http|rpc|nmsg
-(define *runremote*         (make-hash-table)) ;; if set up for server communication this will hold <host port>
+(define *runremote*         #f)                ;; if set up for server communication this will hold <host port>
 (define *max-cache-size*    0)
 (define *logged-in-clients* (make-hash-table))
 (define *server-id*         #f)
 (define *server-info*       #f)
 (define *time-to-exit*      #f)
@@ -200,10 +201,34 @@
    ;; 'old2new
    'new2old
    'schema)
   (if (common:version-changed?)
       (common:set-last-run-version)))
+
+;; Rotate logs, logic: 
+;;                 if > 500k and older than 1 week:
+;;                     remove previous compressed log and compress this log
+;; WARNING: This proc operates assuming that it is in the directory above the
+;;          logs directory you wish to log-rotate.
+;;
+(define (common:rotate-logs)
+  (if (not (directory-exists? "logs"))(create-directory "logs"))
+  (directory-fold 
+   (lambda (file rem)
+     (if (and (string-match "^.*.log" file)
+	      (> (file-size (conc "logs/" file)) 200000))
+	 (let ((gzfile (conc "logs/" file ".gz")))
+	   (if (file-exists? gzfile)
+	       (begin
+		 (debug:print-info 0 *default-log-port* "removing " gzfile)
+		 (delete-file gzfile)))
+	   (debug:print-info 0 *default-log-port* "compressing " file)
+	   (system (conc "gzip logs/" file)))))
+   '()
+   "logs"))
+    
+
 
 ;; Force a megatest cleanup-db if version is changed and skip-version-check not specified
 ;;
 (define (common:exit-on-version-changed)
   (if (common:version-changed?)
@@ -482,17 +507,17 @@
 ;;======================================================================
 
 (define (common:legacy-sync-recommended)
   (or (and (common:get-homehost)
 	   (cdr (common:get-homehost)))
-      (args:get-arg "-runtests")
-      (args:get-arg "-run")
+      ;;(args:get-arg "-runtests")
+      ;;(args:get-arg "-run")
       (args:get-arg "-server")
       ;; (args:get-arg "-set-run-status")
-      (args:get-arg "-remove-runs")
+      ;;(args:get-arg "-remove-runs")
       ;; (args:get-arg "-get-run-status")
-      (args:get-arg "-use-db-cache") ;; feels like a bad idea ...
+      ;;(args:get-arg "-use-db-cache") ;; feels like a bad idea ...
       ))
 
 (define (common:legacy-sync-required)
   (configf:lookup *configdat* "setup" "megatest-db"))
 
@@ -536,11 +561,11 @@
 	    legacy-sync)
 	(let loop ()
 	  ;; sync for filesystem local db writes
 	  ;;
 	  (let ((start-time   (current-seconds)))
-	    (common:sync-to-megatest.db 'local-sync-flags)
+	    ;; (common:sync-to-megatest.db 'local-sync-flags)
 	    (if (and debug-mode
 		     (> (- start-time last-time) 60))
 		(begin
 		  (set! last-time start-time)
 		  (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*))))))

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -196,11 +196,12 @@
 			   dir-writable )))
     (if file-write ;; dir-writable
 	(let (;; (lock    (obtain-dot-lock fname 1 5 10))
 	      (db      (sqlite3:open-database fname)))
 	  (sqlite3:set-busy-handler! db (make-busy-timeout 136000))
-	  (db:set-sync db) ;; (sqlite3:execute db "PRAGMA synchronous = 0;")
+	  ;; (db:set-sync db)
+	  (sqlite3:execute db "PRAGMA synchronous = NORMAL;")
 	  (if (not file-exists)
 	      (begin
 		(if (string-match "^/tmp/.*" fname) ;; this is a file in /tmp
 		    (sqlite3:execute db "PRAGMA journal_mode=WAL;")
 		    (print "Creating " fname " in NON-WAL mode."))
@@ -310,15 +311,18 @@
 	;; (maindb (dbr:dbstruct-main  dbstruct))
 	;; (refdb  (dbr:dbstruct-refdb dbstruct))
         (tmpdb   (dbr:dbstruct-tmpdb dbstruct))
 	(mtdb    (dbr:dbstruct-mtdb dbstruct))
         (refndb  (dbr:dbstruct-refndb dbstruct))
+	(start-t (current-seconds))
 	;; (runid  (dbr:dbstruct-run-id dbstruct))
 	)
     (debug:print-info 4 *default-log-port* "Syncing for run-id: " run-id)
-    ;; (mutex-lock! *http-mutex*)
-    (db:sync-tables (db:sync-all-tables-list dbstruct) #f tmpdb refndb mtdb)))
+    (mutex-lock! *db-sync-mutex*)
+    (db:sync-tables (db:sync-all-tables-list dbstruct) (cons *db-last-sync* "last_update") tmpdb refndb mtdb)
+    (set! *db-last-sync* start-t)
+    (mutex-unlock! *db-sync-mutex*)))
 ;;    (if (eq? run-id 0)
 ;;	;; runid equal to 0 is main.db
 ;;	(if maindb
 ;;	    (if (or (not (number? mtime))
 ;;		    (not (number? stime))
@@ -371,11 +375,11 @@
 
 ;; close all opened run-id dbs
 (define (db:close-all dbstruct)
   (if (dbr:dbstruct? dbstruct)
       (begin
-        (db:sync-touched dbstruct 0 force-sync: #t)
+        ;; (db:sync-touched dbstruct 0 force-sync: #t) ;; NO. Do not do this here. Instead we rely on a server to be started when there are writes, even if the server itself is not going to be used as a server.
         (let ((tdb (db:dbdat-get-db (dbr:dbstruct-tmpdb  dbstruct)))
               (mdb (db:dbdat-get-db (dbr:dbstruct-mtdb   dbstruct)))
               (rdb (db:dbdat-get-db (dbr:dbstruct-refndb dbstruct))))
           (if tdb (sqlite3:finalize! tdb))
           (if mdb (sqlite3:finalize! mdb))
@@ -3155,19 +3159,25 @@
 	 (begin
 	   (debug:print-error 0 *default-log-port* "reception failed. Received " msg " but cannot translate it.")
 	   msg))) ;; crude reply for when things go awry
     ((zmq nmsg)(with-input-from-string msg (lambda ()(deserialize))))
     (else msg))) ;; rpc
+
+;; This is to be the big daddy call
 
 (define (db:test-set-status-state dbstruct run-id test-id status state msg)
   (let ((dbdat  (db:get-db dbstruct run-id)))
     (if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
 	(db:general-call dbdat 'set-test-start-time (list test-id)))
-    (if msg
-	(db:general-call dbdat 'state-status-msg (list state status msg test-id))
-	(db:general-call dbdat 'state-status     (list state status test-id)))
-     (mt:process-triggers run-id test-id state status)))
+    ;; (if msg
+    ;; 	(db:general-call dbdat 'state-status-msg (list state status msg test-id))
+    ;; 	(db:general-call dbdat 'state-status     (list state status test-id)))
+    (db:set-state-status-and-roll-up-items dbstruct run-id test-id #f state status msg)
+    ;; process the test_data table
+    (if (and test-id state status (equal? status "AUTO")) 
+	(db:test-data-rollup dbstruct run-id test-id status))
+    (mt:process-triggers run-id test-id state status)))
 
 ;; state is the priority rollup of all states
 ;; status is the priority rollup of all completed states
 ;;
 ;; if test-name is an integer work off that instead of test-name test-path
@@ -3174,11 +3184,11 @@
 ;;
 (define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment)
   ;; establish info on incoming test followed by info on top level test
   (let* ((db           (db:dbdat-get-db (dbr:dbstruct-tmpdb dbstruct)))
 	 (testdat      (if (number? test-name)
-			   (db:get-test-info-by-id dbstruct run-id test-name)
+			   (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id
 			   (db:get-test-info       dbstruct run-id test-name item-path)))
 	 (test-id      (db:test-get-id testdat))
 	 (test-name    (if (number? test-name)
 			   (db:test-get-testname testdat)
 			   test-name))
@@ -3207,11 +3217,11 @@
 				      *common:std-statuses* >))
 		  (newstate          (if (null? all-curr-states) "NOT_STARTED" (car all-curr-states)))
 		  (newstatus         (if (null? all-curr-statuses) "n/a" (car all-curr-statuses))))
 	     ;; (print "Setting toplevel to: " newstate "/" newstatus)
 	     (db:test-set-state-status-by-id dbstruct run-id tl-test-id newstate newstatus #f)))))))
-        
+
 (define db:roll-up-pass-fail-counts db:set-state-status-and-roll-up-items)
 
 ;; call with state = #f to roll up with out accounting for state/status of this item
 ;;
 ;;    (define (db:roll-up-pass-fail-counts dbstruct run-id test-name item-path state status)
@@ -3315,11 +3325,11 @@
                                     ELSE status
                                     END WHERE id=?;") ;; DONE
 	'(test-set-log            "UPDATE tests SET final_logf=? WHERE id=?;")      ;; DONE
 	;; '(test-set-rundir-by-test-id "UPDATE tests SET rundir=? WHERE id=?")        ;; DONE
 	;; '(test-set-rundir         "UPDATE tests SET rundir=? AND testname=? AND item_path=?;") ;; DONE
-	'(test-set-rundir-shortdir "UPDATE tests SET rundir=?,shortdir=? WHERE testname=? AND item_path=?;")    ;; BROKEN!!! NEEDS run-id
+	'(test-set-rundir-shortdir "UPDATE tests SET rundir=?,shortdir=? WHERE testname=? AND item_path=? AND run_id=?;")    ;; BROKEN!!! NEEDS run-id
 	'(delete-tests-in-state   ;; "DELETE FROM tests WHERE state=?;")                  ;; DONE
 	  "UPDATE tests SET state='DELETED' WHERE state=?")
 	'(tests:test-set-toplog   "UPDATE tests SET final_logf=? WHERE run_id=? AND testname=? AND item_path='';")
 	'(update-cpuload-diskfree "UPDATE tests SET cpuload=?,diskfree=? WHERE id=?;") ;; DONE
 	'(update-uname-host       "UPDATE tests SET uname=?,host=? WHERE id=?;")       ;; DONE
@@ -3328,12 +3338,11 @@
 	;; stuff for roll-up-pass-fail-counts
 	'(update-pass-fail-counts "UPDATE tests 
              SET fail_count=(SELECT count(id) FROM tests WHERE testname=? AND item_path != '' AND status IN ('FAIL','CHECK','INCOMPLETE','ABORT')),
                  pass_count=(SELECT count(id) FROM tests WHERE testname=? AND item_path != '' AND status IN ('PASS','WARN','WAIVED'))
              WHERE testname=? AND item_path='' AND run_id=?;") ;; DONE  ;; BROKEN!!! NEEDS run-id
-	'(top-test-set          "UPDATE tests SET state=? WHERE testname=? AND item_path='';") ;; DONE   ;; BROKEN!!! NEEDS run-id
-	'(top-test-set-running  "UPDATE tests SET state='RUNNING' WHERE testname=? AND item_path='';") ;; DONE   ;; BROKEN!!! NEEDS run-id
+	'(top-test-set-running  "UPDATE tests SET state='RUNNING' WHERE testname=? AND item_path='' AND run_id=?;") ;; DONE   ;; BROKEN!!! NEEDS run-id
 
 
 	;; Might be the following top-test-set-per-pf-counts query could be better based off of something like this:
 	;;
 	;; select state,status,count(state) from tests where run_id=59 AND testname='runfirst' group by state,status;
@@ -3441,16 +3450,16 @@
 			       sync
 			       set-verbosity
 			       killserver
 			       ))
 
-(define (db:login dbstruct calling-path calling-version run-id client-signature)
+(define (db:login dbstruct calling-path calling-version client-signature)
   (cond 
    ((not (equal? calling-path *toppath*))
     (list #f "Login failed due to mismatch paths: " calling-path ", " *toppath*))
-   ((not (equal? *run-id* run-id))
-    (list #f "Login failed due to mismatch run-id: " run-id ", " *run-id*))
+   ;; ((not (equal? *run-id* run-id))
+   ;;  (list #f "Login failed due to mismatch run-id: " run-id ", " *run-id*))
    ((not (equal? megatest-version calling-version))
     (list #f "Login failed due to mismatch megatest version: " calling-version ", " megatest-version))
    (else
     (hash-table-set! *logged-in-clients* client-signature (current-seconds))
     '(#t "successful login"))))

Index: http-transport.scm
==================================================================
--- http-transport.scm
+++ http-transport.scm
@@ -111,11 +111,11 @@
 ;; This is recursively run by http-transport:run until sucessful
 ;;
 (define (http-transport:try-start-server run-id ipaddrstr portnum server-id)
   (let ((config-hostname (configf:lookup *configdat* "server" "hostname"))
 	(tdbdat          (tasks:open-db)))
-    (debug:print-info 0 *default-log-port* "http-transport:try-start-server run-id=" run-id " ipaddrsstr=" ipaddrstr " portnum=" portnum " server-id=" server-id " config-hostname=" config-hostname)
+    (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " run-id=" run-id " ipaddrsstr=" ipaddrstr " portnum=" portnum " server-id=" server-id " config-hostname=" config-hostname)
     (handle-exceptions
      exn
      (begin
        (print-error-message exn)
        (if (< portnum 64000)
@@ -260,11 +260,11 @@
 					   exn
 					   (begin
 					     (set! success #f)
 					     (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".")
 					     (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
-					     (hash-table-delete! *runremote* run-id)
+					     (set! *runremote* #f) ;; (hash-table-delete! *runremote* run-id)
 					     ;; Killing associated server to allow clean retry.")
 					     ;; (tasks:kill-server-run-id run-id)  ;; better to kill the server in the logic that called this routine?
 					     (mutex-unlock! *http-mutex*)
 					     ;;; (signal (make-composite-condition
 					     ;;;          (make-property-condition 'commfail 'message "failed to connect to server")))
@@ -307,11 +307,11 @@
 		       'message "nmsg-transport:client-api-send-receive-raw timed out talking to server")))))))
 
 ;; careful closing of connections stored in *runremote*
 ;;
 (define (http-transport:close-connections run-id)
-  (let* ((server-dat (hash-table-ref/default *runremote* run-id #f)))
+  (let* ((server-dat *runremote*)) ;; (hash-table-ref/default *runremote* run-id #f)))
     (if (vector? server-dat)
 	(let ((api-dat (http-transport:server-dat-get-api-uri server-dat)))
 	  (close-connection! api-dat)
 	  #t)
 	#f)))
@@ -398,11 +398,13 @@
       (if *dbstruct-db* 
 	  (let ((start-time (current-milliseconds))
 		(sync-time  #f)
 		(rem-time   #f))
 	    (condition-case
-	     (db:sync-touched *dbstruct-db* *run-id* force-sync: #t)
+	     ;; (if (and (member (mutex-state *db-sync-mutex*) '(abandoned not-abandoned))
+	     ;;	      (> (- (current-seconds) *db-last-sync*) 5)) ;; if not currently being synced nor recently synced
+	     (db:sync-touched *dbstruct-db* *run-id* force-sync: #t) ;; usually done in the watchdog, not here.
 	     ((sync-failed)(cond
 			    ((> bad-sync-count 10) ;; time to give up
 			     (http-transport:server-shutdown server-id port))
 			    (else ;; (> bad-sync-count 0)  ;; we've had a fail or two, delay and loop
 			     (thread-sleep! 5)
@@ -428,11 +430,12 @@
 		(if (equal? new-server-id server-id)
 		    (begin
 		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep")
 		      (thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access
 		      (set! *dbstruct-db*  (db:setup)) ;;  run-id))
-		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running"))
+		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")
+		      (server:write-dotserver *toppath* (conc iface ":" port)))
 		    (begin ;; gotta exit nicely
 		      (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision")
 		      (http-transport:server-shutdown server-id port))))))
       
       (if (< count 1) ;; 3x3 = 9 secs aprox
@@ -485,11 +488,11 @@
   
 (define (http-transport:server-shutdown server-id port)
   (let ((tdbdat (tasks:open-db)))
     (debug:print-info 0 *default-log-port* "Starting to shutdown the server.")
     ;; need to delete only *my* server entry (future use)
-    (if *dbstruct-db* (db:sync-touched *dbstruct-db* *run-id* force-sync: #t))
+    ;; (if *dbstruct-db* (db:sync-touched *dbstruct-db* *run-id* force-sync: #t)) ;; handled in the watchdog only
     (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up
     ;;
     ;; start_shutdown
     ;;
     (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down")
@@ -510,10 +513,12 @@
 			  (/ *total-non-write-delay* 
 			     *number-non-write-queries*))
 		      " ms")
     (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting")
     (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running complete")
+    ;; if the .server file contained :myport then we can remove it
+    (server:remove-dotserver-file *toppath* port)
     (exit)))
 
 ;; all routes though here end in exit ...
 ;;
 ;; start_server? 

Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -896,11 +896,11 @@
 	 (lnkpathf  (conc lnkpath (if not-iterated "" "/") item-path))
 	 (lnktarget (conc lnkpath "/" item-path)))
 
     ;; Update the rundir path in the test record for all, rundir=physical, shortdir=logical
     ;;                                                 rundir   shortdir
-    (rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path)
+    (rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path run-id)
 
     (debug:print 2 *default-log-port* "INFO:\n       lnkbase=" lnkbase "\n       lnkpath=" lnkpath "\n  toptest-path=" toptest-path "\n     test-path=" test-path)
     (if (not (file-exists? linktree))
 	(begin
 	  (debug:print 0 *default-log-port* "WARNING: linktree did not exist! Creating it now at " linktree)
@@ -969,11 +969,11 @@
 	  (rmt:general-call 'test-set-rundir-shortdir run-id lnkpath 
 			    (if (file-exists? lnkpath)
 				;; (resolve-pathname lnkpath)
 				(common:nice-path lnkpath)
 				lnkpath)
-			    testname "")
+			    testname "" run-id)
 	  ;; (rmt:general-call 'test-set-rundir run-id lnkpath testname "") ;; toptest-path)
 	  (if (or (not curr-test-path)
 		  (not (directory-exists? toptest-path)))
 	      (begin
 		(debug:print-info 2 *default-log-port* "Creating " toptest-path " and link " lnkpath)

Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -343,42 +343,11 @@
 (let ((targ (or (args:get-arg "-reqtarg")(args:get-arg "-target"))))
   (if targ (setenv "MT_TARGET" targ)))
 
 ;; The watchdog is to keep an eye on things like db sync etc.
 ;;
-(define *watchdog*
-  (make-thread 
-   (lambda ()
-     (thread-sleep! 0.05) ;; delay for startup
-     (let ((legacy-sync (common:legacy-sync-required))
-	   (debug-mode  (debug:debug-mode 1))
-	   (last-time   (current-seconds)))
-       (if (common:legacy-sync-recommended)
-	   (let loop ()
-	     ;; sync for filesystem local db writes
-	     ;;
-             (let ((start-time   (current-seconds)))
-               ;; disabling for now (if legacy-sync (common:sync-to-megatest.db #f))
-	       (if (and debug-mode
-			(> (- start-time last-time) 60))
-		   (begin
-		     (set! last-time start-time)
-		     (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*))))))
-	     
-	     ;; keep going unless time to exit
-	     ;;
-	     (if (not *time-to-exit*)
-		 (let delay-loop ((count 0))
-		   (if (and (not *time-to-exit*)
-			    (< count 11)) ;; aprox 5-6 seconds
-		       (begin
-			 (thread-sleep! 1)
-			 (delay-loop (+ count 1))))
-		   (loop)))
-	     (if (common:low-noise-print 30)
-		 (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*)))))
-     "Watchdog thread")))
+(define *watchdog* (make-thread common:watchdog "Watchdog thread"))
 
 (thread-start! *watchdog*)
 
 (if (args:get-arg "-log")
     (let ((oup (open-output-file (args:get-arg "-log"))))
@@ -673,13 +642,13 @@
       (if out-file (close-output-port out-port))
       (exit) ;; yes, bending the rules here - need to exit since this is a utility
       ))
 
 (if (args:get-arg "-ping")
-    (let* ((run-id        (string->number (args:get-arg "-run-id")))
+    (let* (;; (run-id        (string->number (args:get-arg "-run-id")))
 	   (host:port     (args:get-arg "-ping")))
-      (server:ping run-id host:port)))
+      (server:ping host:port)))
 
 ;;======================================================================
 ;; Capture, save and manipulate environments
 ;;======================================================================
 
@@ -2010,11 +1979,11 @@
 
 ;;======================================================================
 ;; Exit and clean up
 ;;======================================================================
 
-(if *runremote* (close-all-connections!))
+(if *runremote* (close-all-connections!)) ;; for http-client
 
 (if (not *didsomething*)
     (debug:print 0 *default-log-port* help))
 
 (set! *time-to-exit* #t)

Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -40,11 +40,11 @@
 
 ;; if a server is either running or in the process of starting call client:setup
 ;; else return #f to let the calling proc know that there is no server available
 ;;
 (define (rmt:get-connection-info run-id)
-  (let ((cinfo (hash-table-ref/default *runremote* run-id #f)))
+  (let ((cinfo *runremote*)) ;; (hash-table-ref/default *runremote* run-id #f)))
     (if cinfo
 	cinfo
 	(if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id)
 	    (client:setup run-id)
 	    #f))))
@@ -54,10 +54,23 @@
 ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
 ;;
 (define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected
   ;; clean out old connections
   ;; (mutex-lock! *db-multi-sync-mutex*)
+
+  ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in *runremote*
+  ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
+  ;; 3. do the query, if on homehost use local access
+
+  (if (and #f  ;; FORCE NO GO FOR RIGHT NOW
+	   (not *runremote*)                         ;; we trust *runremote* to reflect that a server was found previously
+	   (not (member cmd api:read-only-queries))) ;; we don't trust so much the list of write queries
+      (let ((serverconn (server:check-if-running *toppath*)))
+	(if serverconn
+	    (set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed
+	    (server:kind-run *toppath*))))
+  
   (rmt:open-qry-close-locally cmd (if rid rid 0) params))
 
 ;;   (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin
 ;;     (for-each 
 ;;      (lambda (run-id)
@@ -217,15 +230,11 @@
     res))
 
 (define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5))
   (let* ((qry-is-write   (not (member cmd api:read-only-queries)))
 	 (db-file-path   (db:dbfile-path)) ;;  0))
-	 (dbstruct-local (if *dbstruct-db*
-			     *dbstruct-db*
-			     (let* ((dbstruct (db:setup))) ;; make-dbr:dbstruct path:  dbdir local: #t)))
-			       (set! *dbstruct-db* dbstruct)
-			       dbstruct)))
+	 (dbstruct-local (db:setup))  ;; make-dbr:dbstruct path:  dbdir local: #t)))
 	 (read-only      (not (file-write-access? db-file-path)))
 	 (start          (current-milliseconds))
 	 (resdat         (if (not (and read-only qry-is-write))
 			     (api:execute-requests dbstruct-local (vector (symbol->string cmd) params))
 			     (vector #t '())))
@@ -243,11 +252,11 @@
 	    (begin
 	      (debug:print-error 0 *default-log-port* "too many retries in rmt:open-qry-close-locally, giving up")
 	      #f))
 	(begin
 	  ;; (rmt:update-db-stats run-id cmd params duration)
-	  ;; mark this run as dirty if this was a write
+	  ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it
 	  (if qry-is-write
 	      (let ((start-time (current-seconds)))
 		(mutex-lock! *db-multi-sync-mutex*)
 		(hash-table-set! *db-local-sync* (or run-id 0) start-time) ;; the oldest "write"
                 (mutex-unlock! *db-multi-sync-mutex*)))))
@@ -304,13 +313,13 @@
   (rmt:send-receive 'login run-id (list *toppath* megatest-version run-id *my-client-signature*)))
 
 ;; This login does no retries under the hood - it acts a bit like a ping.
 ;; Deprecated for nmsg-transport.
 ;;
-(define (rmt:login-no-auto-client-setup connection-info run-id)
-  (case *transport-type*
-    ((http)(rmt:send-receive-no-auto-client-setup connection-info 'login run-id (list *toppath* megatest-version run-id *my-client-signature*)))
+(define (rmt:login-no-auto-client-setup connection-info)
+  (case *transport-type* ;; run-id of 0 is just a placeholder
+    ((http)(rmt:send-receive-no-auto-client-setup connection-info 'login 0 (list *toppath* megatest-version *my-client-signature*)))
     ;;((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info 'login (list *toppath* megatest-version run-id *my-client-signature*)))
     ))
 
 ;; hand off a call to one of the db:queries statements
 ;; added run-id to make looking up the correct db possible 

Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -104,43 +104,28 @@
 ;; Given a run id start a server process    ### NOTE ### > file 2>&1 
 ;; if the run-id is zero and the target-host is set 
 ;; try running on that host
 ;;   incidental: rotate logs in logs/ dir.
 ;;
-(define  (server:run run-id)
+(define  (server:run areapath) ;; areapath is ignored for now.
   (let* ((curr-host   (get-host-name))
 	 (curr-ip     (server:get-best-guess-address curr-host))
+	 (curr-pid    (current-process-id))
 	 (target-host (configf:lookup *configdat* "server" "homehost" ))
 	 (testsuite   (common:get-testsuite-name))
-	 (logfile     (conc *toppath* "/logs/" run-id ".log"))
+	 (logfile     (conc *toppath* "/logs/server-" curr-pid ".log"))
 	 (cmdln (conc (common:get-megatest-exe)
-		      " -server " (or target-host "-") " -run-id " run-id (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
+		      " -server " (or target-host "-") " -run-id " 0 (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
 									      (conc " -daemonize -log " logfile)
 									      "")
-		      " -m testsuite:" testsuite))) ;; (conc " >> " logfile " 2>&1 &")))))
-    (debug:print 0 *default-log-port* "INFO: Starting server (" cmdln ") as none running ...")
+		      " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &")))))
+	 (log-rotate  (make-thread common:rotate-logs  "server run, rotate logs thread")))
+    ;; we want the remote server to start in *toppath* so push there
     (push-directory *toppath*)
-    (if (not (directory-exists? "logs"))(create-directory "logs"))
-    
-    ;; Rotate logs, logic: 
-    ;;                 if > 500k and older than 1 week:
-    ;;                     remove previous compressed log and compress this log
-    ;;
-    (directory-fold 
-     (lambda (file rem)
-       (if (and (string-match "^.*.log" file)
-		(> (file-size (conc "logs/" file)) 200000))
-	   (let ((gzfile (conc "logs/" file ".gz")))
-	     (if (file-exists? gzfile)
-		 (begin
-		   (debug:print-info 0 *default-log-port* "removing " gzfile)
-		   (delete-file gzfile)))
-	     (debug:print-info 0 *default-log-port* "compressing " file)
-	     (system (conc "gzip logs/" file)))))
-     '()
-     "logs")
-    
+    (debug:print 0 *default-log-port* "INFO: Starting server (" cmdln ") as none running ...")
+    (thread-start! log-rotate)
+
     ;; host.domain.tld match host?
     (if (and target-host 
 	     ;; look at target host, is it host.domain.tld or ip address and does it 
 	     ;; match current ip or hostname
 	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
@@ -152,11 +137,11 @@
     (setenv "TARGETHOST_LOGF" logfile)
     (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever
     (system (conc "nbfake " cmdln))
     (unsetenv "TARGETHOST_LOGF")
     (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
-    ;; (system cmdln)
+    (thread-join! log-rotate)
     (pop-directory)))
 
 (define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
   (if *my-client-signature* *my-client-signature*
       (let ((sig (server:mk-signature)))
@@ -163,91 +148,114 @@
 	(set! *my-client-signature* sig)
 	*my-client-signature*)))
 
 ;; kind start up of servers, wait 40 seconds before allowing another server for a given
 ;; run-id to be launched
-(define (server:kind-run run-id)
-  (let ((last-run-time (hash-table-ref/default *server-kind-run* run-id #f)))
+(define (server:kind-run areapath)
+  (let ((last-run-time (hash-table-ref/default *server-kind-run* areapath #f)))
     (if (or (not last-run-time)
 	    (> (- (current-seconds) last-run-time) 30))
 	(begin
-	  (server:run run-id)
-	  (hash-table-set! *server-kind-run* run-id (current-seconds))))))
+	  (server:run areapath)
+	  (hash-table-set! *server-kind-run* areapath (current-seconds))))))
 
 ;; The generic run a server command. Dispatches the call to server 0 if run-id != 0
 ;; 
 (define (server:try-running run-id)
   (if (eq? run-id 0)
       (server:run run-id)
       (rmt:start-server run-id)))
 
-(define (server:check-if-running run-id)
-  (let ((tdbdat (tasks:open-db)))
-    (let loop ((server (tasks:get-server (db:delay-if-busy tdbdat) run-id))
-	       (trycount 0))
-    (if server
-	;; note: client:start will set *runremote*. this needs to be changed
-	;;       also, client:start will login to the server, also need to change that.
-	;;
-	;; client:start returns #t if login was successful.
-	;;
-	(let ((res (case *transport-type*
-		     ((http)(server:ping-server run-id 
-						(tasks:hostinfo-get-interface server)
-						(tasks:hostinfo-get-port      server)))
-		     ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
-		     ;;    			 (tasks:hostinfo-get-port      server)
-		     ;;    			 timeout: 2))
-                     )))
-	  ;; if the server didn't respond we must remove the record
+(define (server:read-dotserver areapath)
+  (let ((dotfile (conc areapath "/.server")))
+    (if (and (file-exists? dotfile)
+	     (file-read-access? dotfile))
+	(with-input-from-file
+	    dotfile
+	  (lambda ()
+	    (read-line)))
+	#f)))
+
+;; write a .server file in *toppath* with hostport
+;; return #t on success, #f otherwise
+;;
+(define (server:write-dotserver areapath hostport)
+  (let ((lock-file   (conc areapath "/.server.lock"))
+	(server-file (conc areapath "/.server")))
+    (if (common:simple-file-lock lock-file)
+	(let ((res (handle-exceptions
+		    exn
+		    #f ;; failed for some reason, for the moment simply return #f
+		    (with-output-to-file server-file
+		      (lambda ()
+			(print hostport)))
+		    #t)))
+	  (common:simple-file-release-lock lock-file)
+	  res)
+	#f)))
+
+(define (server:remove-dotserver-file areapath hostport)
+  (let ((dotserver   (server:read-dotserver areapath))
+	(server-file (conc areapath "/.server"))
+	(lock-file   (conc areapath "/.server.lock")))
+    (if (string-match (conc ".*:" hostport "$") dotserver) ;; port matches, good enough info to decide to remove the file
+	(if (common:simple-file-lock lock-file)
+	    (begin
+	      (handle-exceptions
+	       exn
+	       #f
+	       (delete-file* server-file))
+	      (common:simple-file-release-lock lock-file))))))
+
+;; no longer care if multiple servers are started by accident. older servers will drop off in time.
+;;
+(define (server:check-if-running areapath)
+  (let* ((dotserver (server:read-dotserver areapath))) ;; tdbdat (tasks:open-db)))
+    (if dotserver
+	(let* ((res (case *transport-type*
+		      ((http)(server:ping-server dotserver))
+		      ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
+		      )))
 	  (if res
-	      #t
-	      (begin
-		(debug:print-info 0 *default-log-port* "server at " server " not responding, removing record")
-		(tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id 
-				" server:check-if-running")
-		res)))
-	#f))))
+	      dotserver
+	      #f))
+	#f)))
 
 ;; called in megatest.scm, host-port is string hostname:port
 ;;
-(define (server:ping run-id host:port)
+;; NOTE: This is NOT called directly from clients as not all transports support a client running
+;;       in the same process as the server.
+;;
+(define (server:ping host:port)
   (let ((tdbdat (tasks:open-db)))
     (let* ((host-port (let ((slst (string-split   host:port ":")))
 			(if (eq? (length slst) 2)
 			    (list (car slst)(string->number (cadr slst)))
 			    #f)))
-	   (toppath       (launch:setup))
-	   (server-db-dat (if (not host-port)(tasks:get-server (db:delay-if-busy tdbdat) run-id) #f)))
-      (if (not run-id)
-	  (begin
-	    (debug:print-error 0 *default-log-port* "must specify run-id when doing ping, -run-id n")
-	    (print "ERROR: No run-id")
-	    (exit 1))
-	  (if (and (not host-port)
-		   (not server-db-dat))
-	      (begin
-		(print "ERROR: bad host:port")
-		(exit 1))
-	      (let* ((iface      (if host-port (car host-port) (tasks:hostinfo-get-interface server-db-dat)))
-		     (port       (if host-port (cadr host-port)(tasks:hostinfo-get-port      server-db-dat)))
-		     (server-dat (http-transport:client-connect iface port))
-		     (login-res  (rmt:login-no-auto-client-setup server-dat run-id)))
-		(if (and (list? login-res)
-			 (car login-res))
-		    (begin
-		      (print "LOGIN_OK")
-		      (exit 0))
-		    (begin
-		      (print "LOGIN_FAILED")
-		      (exit 1)))))))))
+	   (toppath       (launch:setup)))
+      (if (not host-port)
+	  (begin
+	    (print "ERROR: bad host:port")
+	    (exit 1))
+	  (let* ((iface      (if host-port (car host-port) (tasks:hostinfo-get-interface server-db-dat)))
+		 (port       (if host-port (cadr host-port)(tasks:hostinfo-get-port      server-db-dat)))
+		 (server-dat (http-transport:client-connect iface port))
+		 (login-res  (rmt:login-no-auto-client-setup server-dat)))
+	    (if (and (list? login-res)
+		     (car login-res))
+		(begin
+		  (print "LOGIN_OK")
+		  (exit 0))
+		(begin
+		  (print "LOGIN_FAILED")
+		  (exit 1))))))))
 
 ;; run ping in separate process, safest way in some cases
 ;;
-(define (server:ping-server run-id iface port)
+(define (server:ping-server ifaceport)
   (with-input-from-pipe 
-   (conc (common:get-megatest-exe) " -run-id " run-id " -ping " (conc iface ":" port))
+   (conc (common:get-megatest-exe) " -ping " ifaceport)
    (lambda ()
      (let loop ((inl (read-line))
 		(res "NOREPLY"))
        (if (eof-object? inl)
 	   (case (string->symbol res)
@@ -258,16 +266,12 @@
 
 (define (server:login toppath)
   (lambda (toppath)
     (set! *last-db-access* (current-seconds))
     (if (equal? *toppath* toppath)
-	(begin
-	  ;; (debug:print-info 2 *default-log-port* "login successful")
-	  #t)
-	(begin
-	  ;; (debug:print-info 2 *default-log-port* "login failed")
-	  #f))))
+	#t
+	#f)))
 
 (define (server:get-timeout)
   (let ((tmo (configf:lookup  *configdat* "server" "timeout")))
     (if (and (string? tmo)
 	     (string->number tmo))

Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -397,17 +397,17 @@
 
     ;; update the primary record IF state AND status are defined
     (if (and state status)
 	(begin
 	  (rmt:test-set-status-state run-id test-id real-status state (if waived waived comment))
-	  (mt:process-triggers run-id test-id state real-status)
+	  ;; (mt:process-triggers run-id test-id state real-status) ;; triggers are called in test-set-status-state
 	  ))
     
     ;; if status is "AUTO" then call rollup (note, this one modifies data in test
     ;; run area, it does remote calls under the hood.
-    (if (and test-id state status (equal? status "AUTO")) 
-	(rmt:test-data-rollup run-id test-id status))
+    ;; (if (and test-id state status (equal? status "AUTO")) 
+    ;; 	(rmt:test-data-rollup run-id test-id status))
 
     ;; add metadata (need to do this way to avoid SQL injection issues)
 
     ;; :first_err
     ;; (let ((val (hash-table-ref/default otherdat ":first_err" #f)))
@@ -481,13 +481,12 @@
 	      (lockf         (conc outputfilename ".lock")))
 	  (let loop ((have-lock  (common:simple-file-lock lockf)))
 	    (if have-lock
 		(let ((script (configf:lookup *configdat* "testrollup" test-name)))
 		  (print "Obtained lock for " outputfilename)
-		  ;; (rmt:top-test-set-per-pf-counts run-id test-name)
 		  (rmt:roll-up-pass-fail-counts run-id test-name "" #f #f #f)
-		  (rmt:top-test-set-per-pf-counts run-id test-name)
+		  ;; (rmt:test-set-status-state run-id test-name #f #f #f) ;; (rmt:top-test-set-per-pf-counts run-id test-name)
 		  (if script
 		      (system (conc script " > " outputfilename " & "))
 		      (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename))
 		  (common:simple-file-release-lock lockf)
 		  (change-directory orig-dir)