Index: api.scm
==================================================================
--- api.scm
+++ api.scm
@@ -371,11 +371,13 @@
     ((add-var)                      (apply db:add-var dbstruct params))
 
     ((insert-run)                   (apply db:insert-run dbstruct params))
 
     ;; STEPS
-    ((teststep-set-status!)         (apply db:teststep-set-status! dbstruct params))
+    ((teststep-set-status!)
+     ;; (apply db:teststep-set-status! dbstruct params))
+     (db:add-cached-write dbstruct db:teststep-set-status! run-id params))
     ((delete-steps-for-test!)       (apply db:delete-steps-for-test! dbstruct params))
     
     ;; TEST DATA
     ((test-data-rollup)             (apply db:test-data-rollup dbstruct params))
     ((csv->test-data)               (apply db:csv->test-data dbstruct params))

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -2739,12 +2739,10 @@
       "INSERT OR REPLACE into test_steps (test_id,stepname,state,status,event_time,comment,logfile) VALUES(?,?,?,?,?,?,?);"
       test-id teststep-name state-in status-in (current-seconds)
       (if comment comment "")
       (if logfile logfile "")))))
 
-
-
 (define (db:delete-steps-for-test! dbstruct run-id test-id)
   ;; TODO: figure out why status is the key field rather than state (note:  CONSTRAINT test_steps_constraint UNIQUE (test_id,stepname,state) )
   (db:with-db
    dbstruct
    run-id
@@ -4247,10 +4245,56 @@
 	   (conc (current-directory) "/" outputfile)))
      results)
     ;; brutal clean up
     (dbfile:add-dbdat dbstruct #f dbdat)
     (system "rm -rf tempdir")))
+
+;;======================================================================
+;; cached writes stuff
+;;======================================================================
+
+(define (db:add-cached-write dbstruct proc run-id params)
+  ;; Queue (proc params) under the key (dbstruct . run-id); start a delayed flush thread if none is pending.
+  (debug:print 0 *default-log-port* "Adding cached write for run-id "run-id" params " params)
+  (mutex-lock! *cached-writes-mutex*)
+  (let* ((queue-key (cons dbstruct run-id))
+         (pending   (hash-table-ref/default *cached-writes-queues* queue-key '())))
+    (hash-table-set! *cached-writes-queues* queue-key (cons (list proc params) pending)))
+  (unless *cached-writes-flag* ;; flag is #t while a flush thread is already scheduled or running
+    (set! *cached-writes-flag* #t)
+    (let ((flusher (make-thread (lambda ()
+                                  (debug:print 0 *default-log-port* "process cached writes thread started.")
+                                  (thread-sleep! 1)
+                                  (db:process-cached-writes-queue)))))
+      (thread-start! flusher)))
+  (mutex-unlock! *cached-writes-mutex*))
+
+(define (db:process-cached-writes-queue)
+  ;; Flush all queued writes (one transaction per queue) while holding *cached-writes-mutex*.
+  (mutex-lock! *cached-writes-mutex*)
+  (for-each ;; walk a snapshot: entries are deleted from the table as they are flushed
+   (lambda (entry)
+     (let* ((hkey        (car entry))
+            (dbstruct    (car hkey))
+            (run-id      (cdr hkey))
+            (writes-list (reverse (cdr entry)))) ;; queued newest-first; apply oldest-first
+       (debug:print 0 *default-log-port* "Processing "(length writes-list)" cached writes for run "run-id)
+       (db:with-db dbstruct run-id #t
+        (lambda (dbdat db)
+          (sqlite3:with-transaction
+           db
+           (lambda ()
+             (for-each (lambda (queued-write)
+                         (match queued-write
+                           ((proc params)(apply proc dbstruct params))
+                           (else (assert #f "BAD queued-write"))))
+                       writes-list)
+             #t)) ;; explicit true result: the transaction commits only on a true value
+          (hash-table-delete! *cached-writes-queues* hkey)))))
+   (hash-table->alist *cached-writes-queues*))
+  (set! *cached-writes-flag* #f)
+  (mutex-unlock! *cached-writes-mutex*))
 
 ;; (db:extract-ods-file db "outputfile.ods" '(("sysname" "%")("fsname" "%")("datapath" "%")) "%")
 
 ;;======================================================================
 ;; moving watch dogs here due to dependencies

Index: dbfile.scm
==================================================================
--- dbfile.scm
+++ dbfile.scm
@@ -1579,7 +1579,17 @@
 			    ;; (db:hoh-set! stmt-cache db stmt newstmth)
 			    (hash-table-set! stmt-cache stmt newstmth)
 			    newstmth))))
     (mutex-unlock! *get-cache-stmth-mutex*)
     result))
+
+;;======================================================================
+;; cached writes - run list of procs inside transaction
+;;   NOTE: this only works because we have one database per process
+;;======================================================================
+
+(define *cached-writes-mutex* (make-mutex))       ;; held while *cached-writes-flag* / *cached-writes-queues* are read or changed
+(define *cached-writes-flag*  #f)                 ;; #t from the moment a flush thread is started until it finishes
+(define *cached-writes-queues* (make-hash-table)) ;; (dbstruct . run-id) -> list of (proc params), newest first
+
 
 )

Index: dbmod.scm
==================================================================
--- dbmod.scm
+++ dbmod.scm
@@ -119,12 +119,14 @@
 			      (loop (- count 1)))
 			    (begin
 			      (debug:print-info 0 *default-log-port* "dbmod:with-db, database is busy, giving up.")
 			      (exit 1))))
 		   (exn ()
-			(dbfile:print-err exn "ERROR: Unknown error with database for run-id "run-id", message: "
-					  ((condition-property-accessor 'exn 'message) exn))
+			(dbfile:print-err exn "ERROR: Unknown error with db for run-id "
+					  run-id", message: "
+					  ((condition-property-accessor 'exn 'message) exn)
+					  ", details: "(condition->list exn))
 			(exit 2))))))
       (if use-mutex (mutex-unlock! *db-with-db-mutex*))
       res)))
 
 (define (db:with-db dbstruct run-id w/r proc . params)

Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -72,34 +72,11 @@
 
 ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
 ;;
 (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected
   (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.")
-
-  (if (not (eq? (rmt:transport-mode) 'nfs))
-      (begin
-	(if (> attemptnum 2)
-	    (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum))
-	
-	(cond
-	 ((> attemptnum 2) (thread-sleep! 0.05))
-	 ((> attemptnum 10) (thread-sleep! 0.5))
-	 ((> attemptnum 20) (thread-sleep! 1)))
-	
-	;; I'm turning this off, it may make sense to move it
-	;; into http-transport-handler
-	(if (and (> attemptnum 5) (= 0 (modulo attemptnum 15)))  
-	    (begin
-	      (debug:print 0 *default-log-port* "ERROR: can't connect to server, trying to start a server.")
-	      (case (rmt:transport-mode)
-		((http)
-		 (server:run *toppath*)
-		 (thread-sleep! 3))
-		(else
-		 (thread-sleep! 1) ;; for tcp the server is started by routines in tcp-transportmod. For nfs there is no server
-		 ))))))
-  
+	
   ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
   ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
   ;; 3. do the query, if on homehost use local access
   ;;
   (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value

Index: tcp-transportmod.scm
==================================================================
--- tcp-transportmod.scm
+++ tcp-transportmod.scm
@@ -232,10 +232,11 @@
 ;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
 ;;
 (define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)
   (debug:print 2 *default-log-port* "tt:handler cmd: " cmd " run-id: " run-id " attemptnum: " attemptnum)
   ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.
+  ;; connect-to-server will start a server if needed.
   (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f)))
     (if conn
 	;; have connection, call the server
 	(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
 	  ;; res is (status errmsg result meta)
@@ -260,11 +261,11 @@
 		(tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn))
 		result) ;; (tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
 	       (else
 		result)))
 	    (else ;; did not receive properly formated result
-	     (if (not res) ;; tt:handler is telling us that communication failed
+	     (if (not res) ;; tt:send-receive telling us that communication failed
 		 (let* ((host    (tt-conn-host conn))
 			(port    (tt-conn-port conn))
 			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
 			(pid     (tt-conn-pid  conn))
                         ;;(servinf (tt-conn-servinf-file conn))) 
@@ -293,11 +294,11 @@
 				     ;; try again
 				     (thread-sleep! 0.25) ;; dunno, I think this needs to be here
 				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
 				   ))))
 		       (begin ;; no server file, delay and try again
-			 (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf)
+			 (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", no servinf file. Server exited? ")
 			 (thread-sleep! 0.5)
 			 (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))))
 		 (begin ;; this case is where res is malformed. Probably should abort
 		   (assert #f "FATAL: tt:handler received bad data "res)
 		   ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.")
@@ -476,11 +477,12 @@
   ;; is there already a server for this dbfile? Then exit.
   (debug:print 2 *default-log-port* "tt:start-server: " dbfname-in)
   (let* ((ttdat   (make-tt areapath: areapath))
 	 (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id)))
 	 (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead
-    (if (> (length servers) 4)
+         (debug:print 0 *default-log-port* "Found " (length servers) " already running for " dbfname)
+    (if (> (length servers) 0)
 	(begin
 	  (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.")
 	  (exit))
 	(let* ((dbstruct   (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
 	  (tt-handler-set! ttdat (handler dbstruct))
@@ -537,12 +539,11 @@
 		    (dbfile:with-no-sync-db nosyncdbpath
 					    (lambda (db)
 					      (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct)))
 						(debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname)
 						(db:no-sync-del! db dbfname)
-						#;(if dbtmpname
-						    (delete-file dbtmpname))))))))
+                                                ))))))
     (set! *server-info* ttdat)
     (let loop ((count 0))
       (if (> count 240)
 	  (begin
 	    (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.")
@@ -585,10 +586,11 @@
 			  (if (and res (common:low-noise-print 120 "top server message"))
 			      (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for "
 						dbfname" on "(tt-host ttdat)":"(tt-port ttdat)))
 			  res))
 		       (else
+                        ;; wrong servinfo file
 			(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
 			(let* ((leadsrv (car servers)))
 			  (match leadsrv
 			    ((host port startseconds server-id pid dbfname servinfofile)
 			     (let* ((result  (tt:timed-ping host port server-id))