Index: Makefile
==================================================================
--- Makefile
+++ Makefile
@@ -28,16 +28,27 @@
            ezsteps.scm lock-queue.scm rmt.scm api.scm		\
            subrun.scm portlogger.scm archive.scm env.scm		\
            diff-report.scm cgisetup/models/pgdb.scm
 
 # module source files
-MSRCFILES = dbfile.scm debugprint.scm mtargs.scm commonmod.scm dbmod.scm
+MSRCFILES = dbfile.scm debugprint.scm mtargs.scm commonmod.scm dbmod.scm \
+            tcp-transportmod.scm
 
 all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut tcmt
 
+transport-mode.scm : transport-mode.scm.template
+	cp transport-mode.scm.template transport-mode.scm
+
+dashboard-transport-mode.scm : dashboard-transport-mode.scm.template
+	cp dashboard-transport-mode.scm.template dashboard-transport-mode.scm
+
+megatest.scm : transport-mode.scm
+dashboard.scm : dashboard-transport-mode.scm
+
 # dbmod.import.o is just a hack here
 mofiles/dbfile.o     : mofiles/debugprint.o mofiles/commonmod.o dbmod.import.o
+configf.o : commonmod.import.o
 db.o : dbmod.import.o
 mofiles/debugprint.o : mofiles/mtargs.o
 
 # ftail.scm rmtmod.scm commonmod.scm removed
 # MSRCFILES = ducttape-lib.scm pkts.scm stml2.scm cookie.scm mutils.scm	\
@@ -96,12 +107,12 @@
 	@echo $(MTESTHASH)
 
 dboard : $(OFILES) $(GOFILES) dashboard.scm $(MOFILES) $(MOIMPFILES) megatest-version.scm megatest-fossil-hash.scm
 	csc $(CSCOPTS) $(OFILES) dashboard.scm $(GOFILES) $(MOFILES) $(MOIMPFILES) -o dboard
 
-mtut: $(OFILES) $(MOFILES) megatest-fossil-hash.scm mtut.scm megatest-version.scm
-	csc $(CSCOPTS) $(OFILES) $(MOFILES) mtut.scm -o mtut
+mtut: $(OFILES) $(MOFILES) $(MOIMPFILES) megatest-fossil-hash.scm mtut.scm megatest-version.scm 
+	csc $(CSCOPTS) $(OFILES) $(MOFILES)  $(MOIMPFILES) mtut.scm -o mtut
 
 # include makefile.inc
 
 TCMTOBJS = \
 	api.o \
@@ -359,22 +370,22 @@
 	$(INSTALL) lib/libxcb-xlib.so.0 $(PREFIX)/bin/.$(ARCHSTR)/lib/libxcb-xlib.so.0; \
         fi
 
 install : $(PREFIX)/bin/.$(ARCHSTR) $(PREFIX)/bin/.$(ARCHSTR)/mtest $(PREFIX)/bin/megatest \
           $(PREFIX)/bin/.$(ARCHSTR)/dboard $(PREFIX)/bin/dashboard $(HELPERS) $(PREFIX)/bin/nbfake \
-          $(PREFIX)/bin/.$(ARCHSTR)/mtexec $(PREFIX)/bin/mtexec $(PREFIX)/bin/serialize-env \
 	  $(PREFIX)/bin/nbfind $(PREFIX)/bin/mtrunner $(PREFIX)/bin/viewscreen $(PREFIX)/bin/mt_xterm \
           $(PREFIX)/bin/mt-old-to-new.sh $(PREFIX)/bin/mt-new-to-old.sh \
 	  $(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun \
 	  $(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun $(PREFIX)/bin/mtutil \
 	  $(PREFIX)/bin/tcmt $(PREFIX)/share/db/mt-pg.sql \
           $(PREFIX)/share/js/jquery-3.1.0.slim.min.js \
           $(PREFIX)/bin/.$(ARCHSTR)/lib/libpangox-1.0.so \
           $(PREFIX)/bin/.$(ARCHSTR)/lib/libpangox-1.0.so.0 \
           $(PREFIX)/bin/.$(ARCHSTR)/lib/libxcb-xlib.so.0
+
 #         $(PREFIX)/bin/.$(ARCHSTR)/ndboard
-
+#         $(PREFIX)/bin/.$(ARCHSTR)/mtexec $(PREFIX)/bin/mtexec $(PREFIX)/bin/serialize-env \
 # $(PREFIX)/bin/newdashboard
 
 $(PREFIX)/bin/.$(ARCHSTR) : 
 	mkdir -p $(PREFIX)/bin/.$(ARCHSTR)
 	mkdir -p $(PREFIX)/bin/.$(ARCHSTR)/lib

Index: TODO
==================================================================
--- TODO
+++ TODO
@@ -16,10 +16,13 @@
 #     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 TODO
 ====
 
+23WW07
+. Remove use of *dbstruct-dbs*
+
 WW15
 . fill newview matrix with data, filter pipeline gui elements
 . improve [script], especially indent handling
 
 WW16

Index: api.scm
==================================================================
--- api.scm
+++ api.scm
@@ -18,21 +18,26 @@
 ;;     You should have received a copy of the GNU General Public License
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 ;;
 ;;======================================================================
 
-(use srfi-69 posix)
-
 (declare (unit api))
 (declare (uses rmt))
 (declare (uses db))
 (declare (uses dbmod))
 (declare (uses dbfile))
 (declare (uses tasks))
+(declare (uses tcp-transportmod))
 
 (import dbmod)
 (import dbfile)
+(import tcp-transportmod)
+
+(use srfi-69
+     posix
+     matchable
+     s11n)
 
 ;; allow these queries through without starting a server
 ;;
 (define api:read-only-queries
   '(get-key-val-pairs
@@ -39,10 +44,11 @@
     get-var
     get-keys
     get-key-vals
     test-toplevel-num-items
     get-test-info-by-id
+    get-test-state-status-by-id
     get-steps-info-by-id
     get-data-info-by-id
     test-get-rundir-from-test-id
     get-count-tests-running-for-testname
     get-count-tests-running
@@ -59,12 +65,12 @@
     get-run-info
     get-run-status
     get-run-state
     get-run-stats
     get-run-times
-    get-targets
     get-target
+    get-targets
     ;; register-run
     get-tests-tags
     get-test-times
     get-tests-for-run
     get-tests-for-run-state-status
@@ -141,262 +147,318 @@
     tasks-add
     tasks-set-state-given-param-key
     ))
 
 (define *db-write-mutexes* (make-hash-table))
-
+(define *server-signature* #f)
 ;; These are called by the server on recipt of /api calls
 ;;    - keep it simple, only return the actual result of the call, i.e. no meta info here
 ;;
 ;;    - returns #( flag result )
 ;;
 (define (api:execute-requests dbstruct dat)
   (db:open-no-sync-db) ;; sets *no-sync-db*
-;;   (handle-exceptions
-;;    exn
-;;    (let ((call-chain (get-call-chain)))
-;;      (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn)
-;;      (print-call-chain (current-error-port))
-;;      (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
+  ;;   (handle-exceptions
+  ;;    exn
+  ;;    (let ((call-chain (get-call-chain)))
+  ;;      (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn)
+  ;;      (print-call-chain (current-error-port))
+  ;;      (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
   ;;      (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
   (if (> *api-process-request-count* 200)
       (begin
 	(if (common:low-noise-print 30 "too many threads")
 	    (debug:print 0 *default-log-port* "WARNING: "*api-process-request-count*" threads, potential overload, adding 0.5 sec delay."))
 	(thread-sleep! 0.5) ;; take a nap
 	))
-   (cond
-    ((not (vector? dat))                    ;; it is an error to not receive a vector
-     (vector #f (vector #f "remote must be called with a vector")))
-    #;((> *api-process-request-count* 200) ;; 20)
-     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.")
-     (set! *server-overloaded* #t)
-     (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
-    (else  
-     (let* ((cmd-in            (vector-ref dat 0))
-            (cmd               (if (symbol? cmd-in)
-				   cmd-in
-				   (string->symbol cmd-in)))
-            (params            (vector-ref dat 1))
-	    (run-id            (if (null? params)
-				   0
-				   (car params)))
-	    (write-mutex       (if (hash-table-exists? *db-write-mutexes* run-id)
-				   (hash-table-ref *db-write-mutexes* run-id)
-				   (let* ((newmutex (make-mutex)))
-				     (hash-table-set! *db-write-mutexes* run-id newmutex)
-				     newmutex)))
-            (start-t           (current-milliseconds))
-            (readonly-mode     (dbr:dbstruct-read-only dbstruct))
-            (readonly-command  (member cmd api:read-only-queries))
-            (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))))
-       (if (not readonly-command)
-	   (mutex-lock! write-mutex))
-       (let* ((res    
-               (if writecmd-in-readonly-mode
-                   (conc "attempt to run write command "cmd" on a read-only database")
-                   (case cmd
-                     ;;===============================================
-                     ;; READ/WRITE QUERIES
-                     ;;===============================================
-
-                     ((get-keys-write)                        (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl
-                     
-                     ;; SERVERS
-                     ((start-server)                    (apply server:kind-run params))
-                     ((kill-server)                     (set! *server-run* #f))
-
-                     ;; TESTS
-
-                     ;;((test-set-state-status-by-id)     (apply mt:test-set-state-status-by-id dbstruct params))
-                     ;;BB - commented out above because it was calling below, eventually, incorrectly (dbstruct passed to mt:test-set-state-status-by-id, which previosly did more, but now only passes thru to db:set-state-status-and-roll-up-items.
-                     ((test-set-state-status-by-id)
-
-                      ;; (define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment)
-                      (db:set-state-status-and-roll-up-items
-                       dbstruct
-                       (list-ref params 0) ; run-id
-                       (list-ref params 1) ; test-name
-                       #f                  ; item-path
-                       (list-ref params 2) ; state
-                       (list-ref params 3) ; status
-                       (list-ref params 4) ; comment
-                       ))
-                     
-                     ((delete-test-records)             (apply db:delete-test-records dbstruct params))
-                     ((delete-old-deleted-test-records) (apply db:delete-old-deleted-test-records dbstruct params))
-                     ((test-set-state-status)           (apply db:test-set-state-status dbstruct params))
-                     ((test-set-top-process-pid)        (apply db:test-set-top-process-pid dbstruct params))
-                     ((set-state-status-and-roll-up-items) (apply db:set-state-status-and-roll-up-items dbstruct params))
-                     ((set-state-status-and-roll-up-run) (apply db:set-state-status-and-roll-up-run dbstruct params)) 
-                     ((top-test-set-per-pf-counts)      (apply db:top-test-set-per-pf-counts dbstruct params))
-                     ((test-set-archive-block-id)       (apply db:test-set-archive-block-id dbstruct params))
-
-                     ;; RUNS
-                     ((register-run)                 (apply db:register-run dbstruct params))
-                     ((set-tests-state-status)       (apply db:set-tests-state-status dbstruct params))
-                     ((delete-run)                   (apply db:delete-run dbstruct params))
-                     ((lock/unlock-run)              (apply db:lock/unlock-run dbstruct params))
-                     ((update-run-event_time)        (apply db:update-run-event_time dbstruct params))
-                     ((update-run-stats)             (apply db:update-run-stats dbstruct params))
-                     ((set-var)                      (apply db:set-var dbstruct params))
-                     ((inc-var)                      (apply db:inc-var dbstruct params))
-		     ((dec-var)                      (apply db:dec-var dbstruct params))
-                     ((del-var)                      (apply db:del-var dbstruct params))
-		     ((add-var)                      (apply db:add-var dbstruct params))
-
-                     ;; STEPS
-                     ((teststep-set-status!)         (apply db:teststep-set-status! dbstruct params))
-                     ((delete-steps-for-test!)       (apply db:delete-steps-for-test! dbstruct params))
-                     
-                     ;; TEST DATA
-                     ((test-data-rollup)             (apply db:test-data-rollup dbstruct params))
-                     ((csv->test-data)               (apply db:csv->test-data dbstruct params))
-
-                     ;; MISC
-                     ((sync-inmem->db)               (let ((run-id (car params)))
-                                                       (db:sync-touched dbstruct run-id db:initialize-main-db force-sync: #t)))
-                     ((mark-incomplete)              (apply db:find-and-mark-incomplete dbstruct params))
-                     ((create-all-triggers)          (db:create-all-triggers dbstruct))
-                     ((drop-all-triggers)            (db:drop-all-triggers dbstruct)) 
-
-                     ;; TESTMETA
-                     ((testmeta-add-record)       (apply db:testmeta-add-record dbstruct params))
-                     ((testmeta-update-field)     (apply db:testmeta-update-field dbstruct params))
-                     ((get-tests-tags)            (db:get-tests-tags dbstruct))
-
-                     ;; TASKS
-                     ((tasks-add)                 (apply tasks:add dbstruct params))   
-                     ((tasks-set-state-given-param-key) (apply tasks:set-state-given-param-key dbstruct params))
-                     ((tasks-get-last)            (apply tasks:get-last dbstruct params))
-
-		     ;; NO SYNC DB
-		     ((no-sync-set)               (apply db:no-sync-set         *no-sync-db* params))
-		     ((no-sync-get/default)       (apply db:no-sync-get/default *no-sync-db* params))
-		     ((no-sync-del!)              (apply db:no-sync-del!        *no-sync-db* params))
-		     ((no-sync-get-lock)          (apply db:no-sync-get-lock    *no-sync-db* params))
-		     
-                     ;; ARCHIVES
-                     ;; ((archive-get-allocations)   
-                     ((archive-register-disk)     (apply db:archive-register-disk dbstruct params))
-                     ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params))
-                     ;; ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey))
-
-                     ;;======================================================================
-                     ;; READ ONLY QUERIES
-                     ;;======================================================================
-
-                     ;; KEYS
-                     ((get-key-val-pairs)               (apply db:get-key-val-pairs dbstruct params))
-                     ((get-keys)                        (db:get-keys dbstruct))
-                     ((get-key-vals)                    (apply db:get-key-vals dbstruct params))
-                     ((get-target)                      (apply db:get-target dbstruct params))
-                     ((get-targets)                     (db:get-targets dbstruct))
-
-                     ;; ARCHIVES
-                     ((test-get-archive-block-info)     (apply db:test-get-archive-block-info dbstruct params))
-                     
-                     ;; TESTS
-                     ((test-toplevel-num-items)         (apply db:test-toplevel-num-items dbstruct params))
-                     ((get-test-info-by-id)	       (apply db:get-test-info-by-id dbstruct params))
-                     ((test-get-rundir-from-test-id)    (apply db:test-get-rundir-from-test-id dbstruct params))
-                     ((get-count-tests-running-for-testname) (apply db:get-count-tests-running-for-testname dbstruct params))
-                     ((get-count-tests-running)         (apply db:get-count-tests-running dbstruct params))
-                     ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params))
-                     ;; ((delete-test-step-records)        (apply db:delete-test-step-records dbstruct params))
-                     ;; ((get-previous-test-run-record)    (apply db:get-previous-test-run-record dbstruct params))
-                     ((get-matching-previous-test-run-records)(apply db:get-matching-previous-test-run-records dbstruct params))
-                     ((test-get-logfile-info)           (apply db:test-get-logfile-info dbstruct params))
-                     ((test-get-records-for-index-file)  (apply db:test-get-records-for-index-file dbstruct params))
-                     ((get-testinfo-state-status)       (apply db:get-testinfo-state-status dbstruct params))
-                     ((test-get-top-process-pid)        (apply db:test-get-top-process-pid dbstruct params))
-                     ((test-get-paths-matching-keynames-target-new) (apply db:test-get-paths-matching-keynames-target-new dbstruct params))
-                     ((get-prereqs-not-met)             (apply db:get-prereqs-not-met dbstruct params))
-                     ((get-count-tests-running-for-run-id) (apply db:get-count-tests-running-for-run-id dbstruct params))
-                     ((get-not-completed-cnt)           (apply db:get-not-completed-cnt  dbstruct params)) 
-                     ;; ((synchash-get)                    (apply synchash:server-get dbstruct params))
-                     ((get-raw-run-stats)               (apply db:get-raw-run-stats dbstruct params))
-		     ((get-test-times)                  (apply db:get-test-times dbstruct params))
-
-                     ;; RUNS
-                     ((get-run-info)                 (apply db:get-run-info dbstruct params))
-                     ((get-run-status)               (apply db:get-run-status dbstruct params))
-                     ((get-run-state)                (apply db:get-run-state dbstruct params))
-                     ((set-run-status)               (apply db:set-run-status dbstruct params))
-                     ((set-run-state-status)  			 (apply db:set-run-state-status dbstruct params))
-                     ((update-tesdata-on-repilcate-db) (apply db:update-tesdata-on-repilcate-db  dbstruct params)) 
-                     ((get-tests-for-run)            (apply db:get-tests-for-run dbstruct params))
-                     ((get-tests-for-run-state-status) (apply db:get-tests-for-run-state-status dbstruct params))
-                     ((get-test-id)                  (apply db:get-test-id dbstruct params))
-                     ((get-tests-for-run-mindata)    (apply db:get-tests-for-run-mindata dbstruct params))
-                     ;; ((get-tests-for-runs-mindata)   (apply db:get-tests-for-runs-mindata dbstruct params))
-                     ((get-runs)                     (apply db:get-runs dbstruct params))
-                     ((simple-get-runs)              (apply db:simple-get-runs dbstruct params))
-                     ((get-num-runs)                 (apply db:get-num-runs dbstruct params))
-                     ((get-runs-cnt-by-patt)         (apply db:get-runs-cnt-by-patt dbstruct params))
-                     ((get-all-run-ids)              (db:get-all-run-ids dbstruct))
-                     ((get-prev-run-ids)             (apply db:get-prev-run-ids dbstruct params))
-                     ((get-run-ids-matching-target)  (apply db:get-run-ids-matching-target dbstruct params))
-                     ((get-runs-by-patt)             (apply db:get-runs-by-patt dbstruct params))
-                     ((get-run-name-from-id)         (apply db:get-run-name-from-id dbstruct params))
-                     ((get-main-run-stats)           (apply db:get-main-run-stats dbstruct params))
-                     ((get-var)                      (apply db:get-var dbstruct params))
-                     ((get-run-stats)                (apply db:get-run-stats dbstruct params))
-                     ((get-run-times)                (apply db:get-run-times dbstruct params)) 
-
-                     ;; STEPS
-                     ((get-steps-data)               (apply db:get-steps-data dbstruct params))
-                     ((get-steps-for-test)           (apply db:get-steps-for-test dbstruct params))
-		     ((get-steps-info-by-id)         (apply db:get-steps-info-by-id dbstruct params))
-
-                     ;; TEST DATA
-                     ((read-test-data)               (apply db:read-test-data dbstruct params))
-                     ((read-test-data-varpatt)       (apply db:read-test-data-varpatt dbstruct params))
-                     ((get-data-info-by-id)          (apply db:get-data-info-by-id dbstruct params)) 
-
-                     ;; MISC
-                     ((get-latest-host-load)         (apply db:get-latest-host-load dbstruct params))
-                     ((have-incompletes?)            (apply db:have-incompletes? dbstruct params))
-                     ((login)                        (apply db:login dbstruct params))
-                     ((general-call)                 (let ((stmtname   (car params))
-                                                           (run-id     (cadr params))
-                                                           (realparams (cddr params)))
-                                                       (db:general-call dbstruct run-id stmtname realparams)))
-                     ((sdb-qry)                      (apply sdb:qry params))
-                     ((ping)                         (current-process-id))
-		     ((get-changed-record-ids)       (apply db:get-changed-record-ids dbstruct params))
-		     ((get-run-record-ids) 	   (apply db:get-run-record-ids dbstruct params))	
-                     ;; TESTMETA
-                     ((testmeta-get-record)       (apply db:testmeta-get-record dbstruct params))
-
-                     ;; TASKS 
-                     ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))
-		     (else
-		      (debug:print 0 *default-log-port* "ERROR: bad api call " cmd)
-		      (conc "ERROR: BAD api call " cmd))))))
-	 (if (not readonly-command)
-	     (mutex-unlock! write-mutex))
-	 
-	 ;; save all stats
-	 (let ((delta-t (- (current-milliseconds)
-			   start-t))
-	       (modified-cmd (if (eq? cmd 'general-call)
-				 (string->symbol (conc "general-call-" (car params)))
-				 cmd)))
-	   (hash-table-set! *db-api-call-time* modified-cmd
-			    (cons delta-t (hash-table-ref/default *db-api-call-time* modified-cmd '()))))
-	 (if writecmd-in-readonly-mode
-             (begin
-               #;(common:telemetry-log (conc "api-out:"(->string cmd))
-               payload: `((params . ,params)
-               (ok-res . #t)))
-	       (vector #f res))
-             (begin
-               #;(common:telemetry-log (conc "api-out:"(->string cmd))
-               payload: `((params . ,params)
-               (ok-res . #f)))
-               (vector #t res))))))))
+  (cond
+   ((not (vector? dat))                    ;; it is an error to not receive a vector
+    (vector #f (vector #f "remote must be called with a vector")))
+   #;((> *api-process-request-count* 200) ;; 20)
+   (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.")
+   (set! *server-overloaded* #t)
+   (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
+   (else  
+    (let* ((cmd-in            (vector-ref dat 0))
+           (cmd               (if (symbol? cmd-in)
+				  cmd-in
+				  (string->symbol cmd-in)))
+           (params            (vector-ref dat 1))
+	   (run-id            (if (null? params)
+				  0
+				  (car params)))
+	   (write-mutex       (if (hash-table-exists? *db-write-mutexes* run-id)
+				  (hash-table-ref *db-write-mutexes* run-id)
+				  (let* ((newmutex (make-mutex)))
+				    (hash-table-set! *db-write-mutexes* run-id newmutex)
+				    newmutex)))
+           (start-t           (current-milliseconds))
+           (readonly-mode     (dbr:dbstruct-read-only dbstruct))
+           (readonly-command  (member cmd api:read-only-queries))
+           (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))))
+      (if (not readonly-command)
+	  (mutex-lock! write-mutex))
+      (let* ((tmppath    (dbr:dbstruct-tmppath  dbstruct))
+	     (clean-run-id (cond
+			    ((number? run-id)   run-id)
+			    ((equal? run-id #f) "main")
+			    (else               "other")))
+	     (crumbfile  (dbfile:wait-for-qif tmppath clean-run-id (cons cmd params)))
+	     (res    
+              (if writecmd-in-readonly-mode
+                  (conc "attempt to run write command "cmd" on a read-only database")
+		  (api:dispatch-request dbstruct cmd run-id params))))
+	(delete-file* crumbfile)
+	(if (not readonly-command)
+	    (mutex-unlock! write-mutex))
+	
+	;; save all stats
+	(let ((delta-t (- (current-milliseconds)
+			  start-t))
+	      (modified-cmd (if (eq? cmd 'general-call)
+				(string->symbol (conc "general-call-" (car params)))
+				cmd)))
+	  (hash-table-set! *db-api-call-time* modified-cmd
+			   (cons delta-t (hash-table-ref/default *db-api-call-time* modified-cmd '()))))
+	(if writecmd-in-readonly-mode
+            (begin
+              #;(common:telemetry-log (conc "api-out:"(->string cmd))
+              payload: `((params . ,params)
+              (ok-res . #t)))
+	      (vector #f res))
+            (begin
+              #;(common:telemetry-log (conc "api-out:"(->string cmd))
+              payload: `((params . ,params)
+              (ok-res . #f)))
+              (vector #t res))))))))
+
+;; indat is (cmd run-id params meta)
+;;
+;; WARNING: Do not print anything in the lambda of this function as it
+;;          reads/writes to current in/out port
+;;
+(define (api:tcp-dispatch-request-make-handler dbstruct) ;; cmd run-id params)
+  (assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.")
+  (if (not *server-signature*)
+      (set! *server-signature* (tt:mk-signature *toppath*)))
+  (lambda ()
+    (let* ((indat      (deserialize))
+	   (newcount   (+ *api-process-request-count* 1))
+	   (delay-wait (if (> newcount 10)
+			   (- newcount 10)
+			   0)))
+      (set! *api-process-request-count* newcount)
+      (set! *db-last-access* (current-seconds))
+      (match indat
+	((cmd run-id params meta)
+	 (let* ((status  (cond
+			  ((> newcount 30) 'busy)
+			  ((> newcount 15) 'loaded)
+			  (else 'ok)))
+		(errmsg  (case status
+			   ((busy)   (conc "Server overloaded, "newcount" threads in flight"))
+			   ((loaded) (conc "Server loaded, "newcount" threads in flight"))
+			   (else     #f)))
+		(result  (case status
+			   ((busy)  (- newcount 29))
+			   ((loaded) #f)
+			   (else
+			    (case cmd
+			      ((ping) *server-signature*)
+			      (else
+			       (api:dispatch-request dbstruct cmd run-id params))))))
+		(meta   `((wait . ,delay-wait)))
+		(payload (list status errmsg result meta)))
+	   (set! *api-process-request-count* (- *api-process-request-count* 1))
+	   (serialize payload)))
+	(else
+	 (assert #f "FATAL: failed to deserialize indat "indat))))))
+       
+
+(define (api:dispatch-request dbstruct cmd run-id params)
+  (db:open-no-sync-db)
+  (case cmd
+    ;;===============================================
+    ;; READ/WRITE QUERIES
+    ;;===============================================
+
+    ((get-keys-write)                        (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl
+    
+    ;; SERVERS
+    ((start-server)                    (apply server:kind-run params))
+    ((kill-server)                     (set! *server-run* #f))
+
+    ;; TESTS
+
+    ;;((test-set-state-status-by-id)     (apply mt:test-set-state-status-by-id dbstruct params))
+    ;;BB - commented out above because it was calling below, eventually, incorrectly (dbstruct passed to mt:test-set-state-status-by-id, which previosly did more, but now only passes thru to db:set-state-status-and-roll-up-items.
+    ((test-set-state-status-by-id)
+
+     ;; (define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment)
+     (db:set-state-status-and-roll-up-items
+      dbstruct
+      (list-ref params 0) ; run-id
+      (list-ref params 1) ; test-name
+      #f                  ; item-path
+      (list-ref params 2) ; state
+      (list-ref params 3) ; status
+      (list-ref params 4) ; comment
+      ))
+    
+    ((delete-test-records)             (apply db:delete-test-records dbstruct params))
+    ((delete-old-deleted-test-records) (apply db:delete-old-deleted-test-records dbstruct params))
+    ((test-set-state-status)           (apply db:test-set-state-status dbstruct params))
+    ((test-set-top-process-pid)        (apply db:test-set-top-process-pid dbstruct params))
+    ((set-state-status-and-roll-up-items) (apply db:set-state-status-and-roll-up-items dbstruct params))
+    ((set-state-status-and-roll-up-run) (apply db:set-state-status-and-roll-up-run dbstruct params)) 
+    ((top-test-set-per-pf-counts)      (apply db:top-test-set-per-pf-counts dbstruct params))
+    ((test-set-archive-block-id)       (apply db:test-set-archive-block-id dbstruct params))
+
+    ;; RUNS
+    ((register-run)                 (apply db:register-run dbstruct params))
+    ((set-tests-state-status)       (apply db:set-tests-state-status dbstruct params))
+    ((delete-run)                   (apply db:delete-run dbstruct params))
+    ((lock/unlock-run)              (apply db:lock/unlock-run dbstruct params))
+    ((update-run-event_time)        (apply db:update-run-event_time dbstruct params))
+    ((update-run-stats)             (apply db:update-run-stats dbstruct params))
+    ((set-var)                      (apply db:set-var dbstruct params))
+    ((inc-var)                      (apply db:inc-var dbstruct params))
+    ((dec-var)                      (apply db:dec-var dbstruct params))
+    ((del-var)                      (apply db:del-var dbstruct params))
+    ((add-var)                      (apply db:add-var dbstruct params))
+
+    ;; STEPS
+    ((teststep-set-status!)         (apply db:teststep-set-status! dbstruct params))
+    ((delete-steps-for-test!)       (apply db:delete-steps-for-test! dbstruct params))
+    
+    ;; TEST DATA
+    ((test-data-rollup)             (apply db:test-data-rollup dbstruct params))
+    ((csv->test-data)               (apply db:csv->test-data dbstruct params))
+
+    ;; MISC
+    ((sync-inmem->db)               (let ((run-id (car params)))
+                                      (db:sync-touched dbstruct run-id db:initialize-main-db force-sync: #t)))
+    ((mark-incomplete)              (apply db:find-and-mark-incomplete dbstruct params))
+    ((create-all-triggers)          (db:create-all-triggers dbstruct))
+    ((drop-all-triggers)            (db:drop-all-triggers dbstruct)) 
+
+    ;; TESTMETA
+    ((testmeta-add-record)       (apply db:testmeta-add-record dbstruct params))
+    ((testmeta-update-field)     (apply db:testmeta-update-field dbstruct params))
+    ((get-tests-tags)            (db:get-tests-tags dbstruct))
+
+    ;; TASKS
+    ((tasks-add)                 (apply tasks:add dbstruct params))   
+    ((tasks-set-state-given-param-key) (apply tasks:set-state-given-param-key dbstruct params))
+    ((tasks-get-last)            (apply tasks:get-last dbstruct params))
+
+    ;; NO SYNC DB
+    ((no-sync-set)               (apply db:no-sync-set         *no-sync-db* params))
+    ((no-sync-get/default)       (apply db:no-sync-get/default *no-sync-db* params))
+    ((no-sync-del!)              (apply db:no-sync-del!        *no-sync-db* params))
+    ((no-sync-get-lock)          (apply db:no-sync-get-lock    *no-sync-db* params))
+    
+    ;; ARCHIVES
+    ;; ((archive-get-allocations)   
+    ((archive-register-disk)     (apply db:archive-register-disk dbstruct params))
+    ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params))
+    ;; ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey))
+
+    ;;======================================================================
+    ;; READ ONLY QUERIES
+    ;;======================================================================
+
+    ;; KEYS
+    ((get-key-val-pairs)               (apply db:get-key-val-pairs dbstruct params))
+    ((get-keys)                        (db:get-keys dbstruct))
+    ((get-key-vals)                    (apply db:get-key-vals dbstruct params))
+    ((get-target)                      (apply db:get-target dbstruct params))
+    ((get-targets)                     (db:get-targets dbstruct))
+
+    ;; ARCHIVES
+    ((test-get-archive-block-info)     (apply db:test-get-archive-block-info dbstruct params))
+    
+    ;; TESTS
+    ((test-toplevel-num-items)         (apply db:test-toplevel-num-items dbstruct params))
+    ((get-test-info-by-id)	       (apply db:get-test-info-by-id dbstruct params))
+    ((get-test-state-status-by-id)     (apply db:get-test-state-status-by-id dbstruct params))
+    ((test-get-rundir-from-test-id)    (apply db:test-get-rundir-from-test-id dbstruct params))
+    ((get-count-tests-running-for-testname) (apply db:get-count-tests-running-for-testname dbstruct params))
+    ((get-count-tests-running)         (apply db:get-count-tests-running dbstruct params))
+    ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params))
+    ;; ((delete-test-step-records)        (apply db:delete-test-step-records dbstruct params))
+    ;; ((get-previous-test-run-record)    (apply db:get-previous-test-run-record dbstruct params))
+    ((get-matching-previous-test-run-records)(apply db:get-matching-previous-test-run-records dbstruct params))
+    ((test-get-logfile-info)           (apply db:test-get-logfile-info dbstruct params))
+    ((test-get-records-for-index-file)  (apply db:test-get-records-for-index-file dbstruct params))
+    ((get-testinfo-state-status)       (apply db:get-testinfo-state-status dbstruct params))
+    ((test-get-top-process-pid)        (apply db:test-get-top-process-pid dbstruct params))
+    ((test-get-paths-matching-keynames-target-new) (apply db:test-get-paths-matching-keynames-target-new dbstruct params))
+    ((get-prereqs-not-met)             (apply db:get-prereqs-not-met dbstruct params))
+    ((get-count-tests-running-for-run-id) (apply db:get-count-tests-running-for-run-id dbstruct params))
+    ((get-not-completed-cnt)           (apply db:get-not-completed-cnt  dbstruct params)) 
+    ;; ((synchash-get)                    (apply synchash:server-get dbstruct params))
+    ((get-raw-run-stats)               (apply db:get-raw-run-stats dbstruct params))
+    ((get-test-times)                  (apply db:get-test-times dbstruct params))
+
+    ;; RUNS
+    ((get-run-info)                 (apply db:get-run-info dbstruct params))
+    ((get-run-status)               (apply db:get-run-status dbstruct params))
+    ((get-run-state)                (apply db:get-run-state dbstruct params))
+    ((get-run-state-status)         (apply db:get-run-state-status dbstruct params))
+    ((set-run-status)               (apply db:set-run-status dbstruct params))
+    ((set-run-state-status)  			 (apply db:set-run-state-status dbstruct params))
+    ((update-tesdata-on-repilcate-db) (apply db:update-tesdata-on-repilcate-db  dbstruct params)) 
+    ((get-tests-for-run)            (apply db:get-tests-for-run dbstruct params))
+    ((get-tests-for-run-state-status) (apply db:get-tests-for-run-state-status dbstruct params))
+    ((get-test-id)                  (apply db:get-test-id dbstruct params))
+    ((get-tests-for-run-mindata)    (apply db:get-tests-for-run-mindata dbstruct params))
+    ;; ((get-tests-for-runs-mindata)   (apply db:get-tests-for-runs-mindata dbstruct params))
+    ((get-runs)                     (apply db:get-runs dbstruct params))
+    ((simple-get-runs)              (apply db:simple-get-runs dbstruct params))
+    ((get-num-runs)                 (apply db:get-num-runs dbstruct params))
+    ((get-runs-cnt-by-patt)         (apply db:get-runs-cnt-by-patt dbstruct params))
+    ((get-all-run-ids)              (db:get-all-run-ids dbstruct))
+    ((get-prev-run-ids)             (apply db:get-prev-run-ids dbstruct params))
+    ((get-run-ids-matching-target)  (apply db:get-run-ids-matching-target dbstruct params))
+    ((get-runs-by-patt)             (apply db:get-runs-by-patt dbstruct params))
+    ((get-run-name-from-id)         (apply db:get-run-name-from-id dbstruct params))
+    ((get-main-run-stats)           (apply db:get-main-run-stats dbstruct params))
+    ((get-var)                      (apply db:get-var dbstruct params))
+    ((get-run-stats)                (apply db:get-run-stats dbstruct params))
+    ((get-run-times)                (apply db:get-run-times dbstruct params)) 
+
+    ;; STEPS
+    ((get-steps-data)               (apply db:get-steps-data dbstruct params))
+    ((get-steps-for-test)           (apply db:get-steps-for-test dbstruct params))
+    ((get-steps-info-by-id)         (apply db:get-steps-info-by-id dbstruct params))
+
+    ;; TEST DATA
+    ((read-test-data)               (apply db:read-test-data dbstruct params))
+    ((read-test-data-varpatt)       (apply db:read-test-data-varpatt dbstruct params))
+    ((get-data-info-by-id)          (apply db:get-data-info-by-id dbstruct params)) 
+
+    ;; MISC
+    ((get-latest-host-load)         (apply db:get-latest-host-load dbstruct params))
+    ((have-incompletes?)            (apply db:have-incompletes? dbstruct params))
+    ((login)                        (apply db:login dbstruct params))
+    ((general-call)                 (let ((stmtname   (car params))
+                                          (run-id     (cadr params))
+                                          (realparams (cddr params)))
+                                      (db:general-call dbstruct run-id stmtname realparams)))
+    ((sdb-qry)                      (apply sdb:qry params))
+    ((ping)                         (current-process-id))
+    ((get-changed-record-ids)       (apply db:get-changed-record-ids dbstruct params))
+    ((get-run-record-ids) 	   (apply db:get-run-record-ids dbstruct params))	
+    ;; TESTMETA
+    ((testmeta-get-record)       (apply db:testmeta-get-record dbstruct params))
+
+    ;; TASKS 
+    ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))
+    (else
+     (debug:print 0 *default-log-port* "ERROR: bad api call " cmd)
+     (conc "ERROR: BAD api call " cmd))))
 
 ;; http-server  send-response
 ;;                 api:process-request
 ;;                    db:*
 ;;

Index: archive.scm
==================================================================
--- archive.scm
+++ archive.scm
@@ -21,10 +21,12 @@
 (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 format md5 message-digest srfi-18)
 
 (declare (unit archive))
 (declare (uses db))
 (declare (uses common))
+(declare (uses commonmod))
+(import commonmod)
 
 (include "common_records.scm")
 (include "db_records.scm")
 
 ;;======================================================================

ADDED   artifacts.scm
Index: artifacts.scm
==================================================================
--- /dev/null
+++ artifacts.scm
@@ -0,0 +1,24 @@
+;;======================================================================
+;; Copyright 2019, Matthew Welland.
+;; 
+;; This file is part of Megatest.
+;; 
+;;     Megatest is free software: you can redistribute it and/or modify
+;;     it under the terms of the GNU General Public License as published by
+;;     the Free Software Foundation, either version 3 of the License, or
+;;     (at your option) any later version.
+;; 
+;;     Megatest is distributed in the hope that it will be useful,
+;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;     GNU General Public License for more details.
+;; 
+;;     You should have received a copy of the GNU General Public License
+;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+
+;;======================================================================
+
+(declare (unit artifacts))
+
+(include "artifacts/artifacts.scm")
+

ADDED   artifacts/README
Index: artifacts/README
==================================================================
--- /dev/null
+++ artifacts/README
@@ -0,0 +1,1 @@
+NOTE: keep megatest/artifacts/ in sync with datastore/artifacts

ADDED   artifacts/artifacts.meta
Index: artifacts/artifacts.meta
==================================================================
--- /dev/null
+++ artifacts/artifacts.meta
@@ -0,0 +1,21 @@
+;; -*- scheme -*-
+(
+; Your egg's license:
+(license "BSD")
+
+; Pick one from the list of categories (see below) for your egg and enter it
+; here.
+(category db)
+
+; A list of eggs pkts depends on.  If none, you can omit this declaration
+; altogether. If you are making an egg for chicken 3 and you need to use
+; procedures from the `files' unit, be sure to include the `files' egg in the
+; `needs' section (chicken versions < 3.4.0 don't provide the `files' unit).
+; `depends' is an alias to `needs'.
+;; (needs (autoload "3.0"))
+
+; A list of eggs required for TESTING ONLY.  See the `Tests' section.
+(test-depends test)
+
+(author "Matt Welland")
+(synopsis "A sha1-chain based datastore similar to the data format in fossil scm, consisting of artifacts of single line cards."))

ADDED   artifacts/artifacts.release-info
Index: artifacts/artifacts.release-info
==================================================================
--- /dev/null
+++ artifacts/artifacts.release-info
@@ -0,0 +1,3 @@
+(repo fossil "http://www.kiatoa.com/cgi-bin/fossils/{egg-name}")
+(uri zip "http://www.kiatoa.com/cgi-bin/fossils/{egg-name}/zip/{egg-name}.zip?uuid={egg-release}")
+(release "1.0")

ADDED   artifacts/artifacts.scm
Index: artifacts/artifacts.scm
==================================================================
--- /dev/null
+++ artifacts/artifacts.scm
@@ -0,0 +1,1624 @@
+;; Copyright 2006-2017, Matthew Welland.
+;; 
+;; This file is part of artifacts
+;; 
+;;     Pkts is free software: you can redistribute it and/or modify
+;;     it under the terms of the GNU General Public License as published by
+;;     the Free Software Foundation, either version 3 of the License, or
+;;     (at your option) any later version.
+;; 
+;;     Pkts is distributed in the hope that it will be useful,
+;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;     GNU General Public License for more details.
+;; 
+;;     You should have received a copy of the GNU General Public License
+;;     along with Pkts.  If not, see <http://www.gnu.org/licenses/>.
+;;
+
+;; CARDS:
+;;
+;; A card is a line of text, the first two characters are a letter followed by a
+;;   space. The letter is the card type.
+;;
+;; artifact:
+;;
+;; An artifact is a sorted list of cards with a final card Z that contains the shar1 hash
+;;   of all of the preceding cards.
+;;
+;; AARTIFACT:
+;;
+;;  An alist mapping card types to card data
+;;      '((T . "artifacttype")
+;;        (a . "some content"))
+;;
+;; EARTIFACT:
+;;
+;;  Extended packet using friendly keys. Must use a artifactspec to convert to/from eartifacts
+;;    '((ptype . "artifacttype")
+;;      (adata . "some content))
+;;
+;; DARTIFACT:
+;;
+;; artifacts pulled from the database have this format:
+;;
+;;((aartifact (Z . "7de89c09ac024b3832c93e16cd78d11e2e28733b")     <= this is a the alist
+;;       (t . "v1.63/tip/dev")
+;;       (c . "QUICKPATT")
+;;       (T . "runstart")
+;;       (P . "354eeb67120a921e3e33155ecab1b522ac10b6bd")
+;;       (D . "1488995096.0"))
+;;  (id . 8)
+;;  (group-id . 0)
+;;  (uuid . "7de89c09ac024b3832c93e16cd78d11e2e28733b")
+;;  (parent . "")
+;;  (artifact-type . "runstart")
+;;  (artifact . "D 1488995096.0\nP 354eeb67120a921e3e33155ecab1b522ac10b6bd\nT runstart\nc QUICKPATT\nt v1.63/tip/dev\nZ 7de89c09ac024b3832c93e16cd78d11e2e28733b"))
+;;
+;; artifactspec is alist of alists mapping types and nicekeys to keys
+;;
+;; '((posting . ((title . t)
+;;               (url   . u)
+;;               (blurb . b)))
+;;   (comment . ((comment . c)
+;;               (score   . s))))
+
+;; Reserved cards:
+;;   P      : artifact parent
+;;   R      : reference artifact containing mapping of short string -> sha1sum strings
+;;   T      : artifact type
+;;   D      : current time from (current-time), unless provided
+;;   Z      : shar1 hash of the packet
+
+;; Example usage:
+;;
+;; Create a artifact:
+;;
+;; (use artifacts)
+;; (define-values (uuid artifact)
+;;     (alist->artifact
+;;       '((fruit . "apple") (meat . "beef"))  ;; this is the data to convert
+;;       '((foods (fruit . f) (meat . m)))     ;; this is the artifact spec
+;;       ptype:
+;;       'foods))
+;;
+;; Add to artifact queue:
+;;
+;; (define db (open-queue-db "/tmp/artifacts" "artifacts.db"))
+;; (add-to-queue db artifact uuid 'foods #f 0) ;; no parent and use group_id of 0
+;;
+;; Retrieve the packet from the db and extract a value:
+;;
+;; (alist-ref
+;;    'meat
+;;    (dartifact->alist
+;;         (car (get-dartifacts db #f 0 #f))
+;;        '((foods (fruit . f)
+;;                 (meat . m)))))
+;; => "beef"
+;;
+
+(module artifacts
+(
+;; cards, util and misc
+;; sort-cards
+;; calc-sha1
+;;
+;; low-level constructor procs, exposed only for development/testing, will be removed
+construct-sdat
+construct-artifact     
+card->type/value  
+add-z-card
+
+;; queue database procs
+open-queue-db
+add-to-queue
+create-and-queue
+;; lookup-by-uuid
+lookup-by-id
+get-dartifacts
+get-not-processed-artifacts
+get-related
+find-artifacts
+process-artifacts
+get-descendents
+get-ancestors
+get-artifacts
+;; get-last-descendent
+;; with-queue-db
+;; load-artifacts-to-db
+
+;; procs that operate directly on artifacts, sdat, aartifacts, dartifacts etc.
+artifact->alist    ;; artifact -> aartifact (i.e. alist)
+artifact->sdat     ;; artifact -> '("a aval" "b bval" ...)
+sdat->alist   ;; '("a aval" "b bval"...) -> ((a . "aval")(b . "bval") ...)
+dblst->dartifacts  ;; convert list of tuples from queue db into dartifacts
+dartifact->alist   ;; flatten a dartifact into an alist containing all db fields and the artifact alist
+dartifacts->alists ;; apply dartifact->alist to a list of alists using a artifact-spec
+alist->artifact    ;; returns two values uuid, artifact
+get-value     ;; looks up a value given a key in a dartifact
+flatten-all   ;; merge the list of values from a query which includes a artifact into a flat alist <== really useful!
+check-artifact
+
+;; artifact alists
+write-alist->artifact
+read-artifact->alist
+
+;; archive database
+;; archive-open-db
+;; write-archive-artifacts
+;; archive-artifacts
+;; mark-processed
+
+;; artifactsdb
+artifactdb-conn     ;; useful
+artifactdb-fname
+artifactsdb-open
+artifactsdb-close
+artifactsdb-add-record
+;; temporary
+artifactdb-artifactspec
+
+;; utility procs
+increment-string ;; used to get indexes for strings in ref artifacts
+make-report      ;; make a .dot file
+calc-sha1
+uuid-first-two-letters
+uuid-remaining-letters
+
+;; file and directory utils
+multi-glob
+capture-dir
+file-get-sha1
+check-same
+link-or-copy
+same-partition?
+link-if-same-partition
+archive-copy
+write-to-archive
+artifact-rollup
+read-artifacts-into-hash
+hash-of-artifacts->bundle
+archive-dest
+
+;; pathname-full-filename
+
+;; minimal artifact functions
+minimal-artifact-read
+minimal-artifact->alist
+afact-get-D
+afact-get-Z
+afact-get-T
+afact-get
+afact-get-number/default
+
+
+;; bundles
+write-bundle
+read-bundle
+
+;; new artifacts db
+with-todays-adb
+get-all-artifacts
+refresh-artifacts-db
+
+)
+
+(import (chicken base) scheme (chicken process) (chicken time posix)
+	(chicken io) (chicken file) (chicken pathname)
+        chicken.process-context.posix (chicken string)
+	(chicken time) (chicken sort) (chicken file posix) (chicken condition) srfi-1
+	regex srfi-13 srfi-69 (chicken port) (chicken process-context)
+	crypt sha1 matchable message-digest sqlite3 typed-records
+	directory-utils
+	scsh-process)
+
+;;======================================================================
+;; DATA MANIPULATION UTILS
+;;======================================================================
+
+(define-inline (unescape-data data)
+  (string-translate* data '(("\\n" . "\n") ("\\\\" . "\\"))))
+
+(define-inline (escape-data data)
+  (string-translate* data '(("\n" . "\\n") ("\\" . "\\\\"))))
+
+(define-inline (make-card type data)
+  (conc type " " (escape-data (->string data))))
+
+;; reverse an alist for doing artifactkey -> external key conversions
+;;
+(define-inline (reverse-aspec aspec)
+  (map (lambda (dat)
+	 (cons (cdr dat)(car dat)))
+       aspec))
+
+;; add a card to the list of cards, sdat
+;; if type is #f return only sdat
+;; if data is #f return only sdat
+;;
+(define-inline (add-card sdat type data)
+  (if (and type data)
+      (cons (make-card type data) sdat)
+      sdat))
+
+;;======================================================================
+;; STRING AS FUNKY NUMBER
+;;======================================================================
+
+;; NOTE: PTDZ are removed as they are reserved. NB// the R card is not used in a
+;;       ref, instead the P parent card is used.
+;;       Question: Why does it matter to remove PTDZ?
+;;                 To make the ref easier to use the ref strings will be the keys
+;;                 so we cannot have overlap with any actual keys. But this is a
+;;                 bit silly. What we need to do instead is reject keys of length
+;;                 one where the char is in PTDZ
+;;
+;; This is basically base92
+;;
+(define string-num-chars (string->list "!#$%&'()*+,-./0123456789:;<=>?@ABCEFGHIJKLMNOQRSUVWXY[\\]^_abcdefghijklmnopqrstuvwxyz{|}~"))
+;; "0123456789abcdefghijklmnopqrstuvwxyzABCEFGHIJKLMNOQSUVWXY!#$%&'()*+,-./[]:;<=>?\\^_{}|"))
+
+(define (char-incr inchar)
+  (let* ((carry     #f)
+	 (next-char (let ((rem (member inchar string-num-chars)))
+		      (if (eq? (length rem) 1) ;; we are at the last character in our string-num-chars list
+			  (begin
+			    (set! carry #t)
+			    (car string-num-chars))
+			  (cadr rem)))))
+    (values next-char carry)))
+    
+(define (increment-string str)
+  (if (string-null? str)
+      "0"
+      (let ((strlst (reverse (string->list str)))) ;; need to process the string from the lsd
+	(list->string
+	 (let loop ((hed (car strlst))
+		    (tal (cdr strlst))
+		    (res '()))
+	   (let-values (((newhed carry)(char-incr hed)))
+	     ;; (print "newhed: " newhed " carry: " carry " tal: " tal)
+	     (let ((newres (cons newhed res)))
+	       (if carry ;; we'll have to propagate the carry
+		   (if (null? tal) ;; at the end, tack on "0" (which is really a "1")
+		       (cons (car string-num-chars) newres)
+		       (loop (car tal)(cdr tal) newres))
+		   (append (reverse tal) newres)))))))))
+    
+;;======================================================================
+;; P K T S D B   I N T E R F A C E
+;;
+;; INTEGER, REAL, TEXT
+;;======================================================================
+;;
+;; spec
+;;  ( (tablename1 . (field1name L1 TYPE)
+;;                  (field2name L2 TYPE) ... )
+;;    (tablename2 ... ))
+;;
+;;  Example: (tests (testname n TEXT)
+;;                  (rundir   r TEXT)
+;;                   ... )
+;;
+;; artifact keys are taken from the first letter, if that is not unique
+;; then look at the next letter and so on
+;;
+
+;; simplify frequent need to get one result with default
+;;
+(define (get-one db default qry . params)
+  (apply fold-row
+   car
+   default
+   db
+   qry
+   params))
+
+(define (get-rows db qry . params)
+  (apply fold-row
+   cons
+   db
+   qry
+   params))
+  
+;; use this struct to hold the artifactspec and the db handle
+;;
+(defstruct artifactdb
+  (fname       #f)
+  (artifactsdb-spec #f)
+  (artifactspec     #f)  ;; cache the artifactspec
+  (field-keys  #f)  ;; cache the field->key mapping (field1 . k1) ...
+  (key-fields  #f)  ;; cache the key->field mapping
+  (conn        #f)
+  )
+
+;; WARNING: There is a simplification in the artifactsdb spec w.r.t. artifactspec.
+;;          The field specs are the cdr of the table list - not a full
+;;          list. The extra list level in artifactspec is gratuitous and should
+;;          be removed.
+;;
+(define (artifactsdb-spec->artifactspec tables-spec)
+  (map (lambda (tablespec)
+	 (list (car tablespec)
+	       (map (lambda (field-spec)
+		      (cons (car field-spec)(cadr field-spec)))
+		    (cdr tablespec))))
+       tables-spec))
+
+(define (artifactsdb-open dbfname artifactsdb-spec)
+  (let* ((pdb      (make-artifactdb))
+	 (dbexists (file-exists? dbfname))
+	 (db       (open-database dbfname)))
+    (artifactdb-artifactsdb-spec-set! pdb artifactsdb-spec)
+    (artifactdb-artifactspec-set!     pdb (artifactsdb-spec->artifactspec artifactsdb-spec))
+    (artifactdb-fname-set!       pdb dbfname)
+    (artifactdb-conn-set!        pdb db)
+    (if (not dbexists)
+	(artifactsdb-init pdb))
+    pdb))
+
+(define (artifactsdb-init artifactsdb)
+  (let* ((db          (artifactdb-conn artifactsdb))
+	 (artifactsdb-spec (artifactdb-artifactsdb-spec artifactsdb)))
+    ;; create a table for the artifacts themselves
+    (execute db "CREATE TABLE IF NOT EXISTS artifacts (id INTEGER PRIMARY KEY, zkey TEXT, record_id INTEGER, artifact TEXT);")
+    (for-each
+     (lambda (table)
+       (let* ((table-name (car table))
+	      (fields     (cdr table))
+	      (stmt (conc "CREATE TABLE IF NOT EXISTS "
+			  table-name
+			  " (id INTEGER PRIMARY KEY,"
+			  (string-intersperse
+			   (map (lambda (fieldspec)
+				  (conc (car fieldspec) " "
+					(caddr fieldspec)))
+				fields)
+			   ",")
+			  ");")))
+	 (execute db stmt)))
+     artifactsdb-spec)))
+
+;; create artifact from the data and insert into artifacts table
+;; 
+;; data is assoc list of (field . value) ...
+;; tablename is a symbol matching the table name
+;;
+(define (artifactsdb-add-record artifactsdb tablename data #!optional (parent #f))
+  (let*-values (((zkey artifact) (alist->artifact data (artifactdb-artifactspec artifactsdb) ptype: tablename)))
+    ;; have the data as alist so insert it into appropriate table also
+    (let* ((db        (artifactdb-conn artifactsdb)))
+      ;; TODO: Address collisions
+      (execute db "INSERT INTO artifacts (zkey,artifact,record_id) VALUES (?,?,?);"
+		zkey artifact -1)
+      (let* (;; (artifactid     (artifactsdb-artifactkey->artifactid artifactsdb artifactkey))
+	     (record-id (artifactsdb-insert artifactsdb tablename data)))
+	(execute db "UPDATE artifacts SET record_id=? WHERE zkey=?;"
+		  record-id zkey)
+      ))))
+
+;; 
+(define (artifactsdb-insert artifactsdb tablename data)
+  (let* ((db (artifactdb-conn artifactsdb))
+	 (stmt (conc "INSERT INTO " tablename
+		     " (" (string-intersperse (map conc (map car data)) ",")
+		     ") VALUES ('"
+		     ;; TODO: Add lookup of data type and do not
+		     ;;       wrap integers with quotes
+		     (string-intersperse (map conc (map cdr data)) "','")
+		     "');")))
+    (print "stmt: " stmt)
+    (execute db stmt)
+    ;; lookup the record-id and return it
+    
+    ))
+    
+(define (artifactsdb-close artifactsdb)
+  (finalize! (artifactdb-conn artifactsdb)))
+
+;; (let loop ((s "0")(n 0))(print s)(if (< n 5000)(loop (increment-string s)(+ n 1))))
+
+;;======================================================================
+;; CARDS, MISC and UTIL
+;;======================================================================
+
+;; given string (likely multi-line) "dat" return shar1 hash
+;;
+(define (calc-sha1 instr)
+  (message-digest-string
+   (sha1-primitive)
+   instr))
+
+;; given a single card return its type and value
+;;
+(define (card->type/value card)
+  (let ((ctype (substring card 0 1))
+	(cval  (substring card 2 (string-length card))))
+    (values (string->symbol ctype) cval)))
+
+;;======================================================================
+;; SDAT procs
+;;  sdat is legacy/internal usage. Intention is to remove sdat calls from
+;;  the exposed calls.
+;;======================================================================
+
+;; sort list of cards
+;;
+(define-inline (sort-cards sdat)
+  (sort sdat string<=?))
+
+;; artifact rules
+;; 1. one card per line
+;; 2. at least one card
+;; 3. no blank lines
+
+;; given sdat, a list of cards return uuid, packet (as sdat)
+;;
+(define (add-z-card sdat)
+  (let* ((sorted-sdat (sort-cards sdat))
+	 (dat         (string-intersperse sorted-sdat "\n"))
+	 (uuid        (calc-sha1 dat)))
+    (values
+     uuid
+     (conc
+      dat
+      "\nZ "
+      uuid))))
+
+(define (check-artifact artifact)
+  (handle-exceptions
+      exn
+      #f ;; anything goes wrong - call it a crappy artifact
+    (let* ((sdat (string-split artifact "\n"))
+	   (rdat (reverse sdat)) ;; reversed
+	   (zdat (car rdat))
+	   (Z    (cadr (string-split zdat)))
+	   (cdat (string-intersperse (reverse (cdr rdat)) "\n")))
+      (equal? Z (calc-sha1 cdat)))))
+
+;;======================================================================
+;; AARTIFACTs
+;;======================================================================
+
+;; convert a sdat (list of cards) to an alist
+;;
+(define (sdat->alist sdat)
+  (let loop ((hed (car sdat))
+	     (tal (cdr sdat))
+	     (res '()))
+    (let-values (( (ctype cval)(card->type/value hed) ))
+      ;; if this card is not one of the common ones tack it on to rem
+      (let* ((oldval (alist-ref ctype res))
+	     (newres (cons (cons ctype
+				 (if oldval ;; list or string
+				     (if (list? oldval)
+					 (cons cval oldval)
+					 (cons cval (list oldval)))
+				     cval))
+			   res)))
+	(if (null? tal)
+	    newres
+	    (loop (car tal)(cdr tal) newres))))))
+
+;;((aartifact (Z . "7de89c09ac024b3832c93e16cd78d11e2e28733b")     <= this is a the alist
+;;       (t . "v1.63/tip/dev")
+;;       (c . "QUICKPATT")
+;;       (T . "runstart")
+;;       (P . "354eeb67120a921e3e33155ecab1b522ac10b6bd")
+;;       (D . "1488995096.0"))
+;;  (id . 8)
+;;  (group-id . 0)
+;;  (uuid . "7de89c09ac024b3832c93e16cd78d11e2e28733b")
+;;  (parent . "")
+;;  (artifact-type . "runstart")
+;;  (artifact . "D 1488995096.0\nP 354eeb67120a921e3e33155ecab1b522ac10b6bd\nT runstart\nc QUICKPATT\nt v1.63/tip/dev\nZ 7de89c09ac024b3832c93e16cd78d11e2e28733b"))
+;;
+;; artifactspec is alist of alists mapping types and nicekeys to keys
+;;
+;; '((posting . ((title . t)
+;;               (url   . u)
+;;               (blurb . b)))
+;;   (comment . ((comment . c)
+;;               (score   . s))))
+
+;; DON'T USE? 
+;;
+(define (get-value field dartifact . spec-in)
+  (if (null? spec-in)
+      (alist-ref field dartifact)
+      (let* ((spec  (car spec-in))
+	     (aartifact  (alist-ref 'aartifact dartifact))) ;; get the artifact alist
+	(if (and aartifact spec)
+	    (let* ((ptype (alist-ref 'artifact-type dartifact))
+		   (pspec (alist-ref (string->symbol ptype) spec))) ;; do we have a spec for this type of artifact
+	      (and pspec
+		  (let* ((key (alist-ref field pspec)))
+		    (and key (alist-ref key aartifact)))))
+	    #f))))
+
+;; convert a dartifact to a pure alist given a artifactspec
+;; this flattens out the alist to include the data from
+;; the queue database record
+;;
+(define (dartifact->alist dartifact artifactspec)
+  (let* ((aartifact       (alist-ref 'aartifact dartifact))
+	 (artifact-type   (or (alist-ref 'artifact-type dartifact) ;; artifact-type is from the database field artifact_type
+			 (alist-ref 'T aartifact)))
+	 (artifact-fields (alist-ref (string->symbol artifact-type) artifactspec))
+	 (rev-fields (if artifact-fields
+			 (reverse-aspec artifact-fields)
+			 '())))
+    (append (map (lambda (entry)
+		   (let* ((artifact-key (car entry))
+			  (new-key (or (alist-ref artifact-key rev-fields) artifact-key)))
+		     `(,new-key . ,(cdr entry))))
+		 aartifact)
+	    dartifact)))
+
+;; convert a list of dartifacts into a list of alists using artifact-spec
+;;
+(define (dartifacts->alists dartifacts artifact-spec)
+   (map (lambda (x)
+	  (dartifact->alist x artifact-spec))
+	dartifacts))
+
+;; Generic flattener, make the tuple and artifact into a single flat alist
+;;
+;; qry-result-spec is a list of symbols corresponding to each field
+;;
+(define (flatten-all inlst artifactspec . qry-result-spec)
+  (map
+   (lambda (tuple)
+     (dartifact->alist
+      (apply dblst->dartifacts tuple qry-result-spec)
+      artifactspec))
+   inlst))
+
+;; call like this:
+;;  (construct-sdat 'a "a data" 'S "S data" ...)
+;; returns list of cards
+;;  ( "A a value" "D 12345678900" ...)
+;;
+(define (construct-sdat . alldat)
+  (let ((have-D-card #f)) ;; flag
+    (if (even? (length alldat))
+	(let loop ((type (car alldat))
+		   (data (cadr alldat))
+		   (tail (cddr alldat))
+		   (res  '()))
+	  (if (eq? type 'D)(set! have-D-card #t))
+	  (if (null? tail)
+	      (if have-D-card ;; return the constructed artifact, add a D card if none found
+		  (add-card res type data)
+		  (add-card 
+		   (add-card res 'D (current-seconds))
+		   type data))
+	      (loop (car tail)
+		    (cadr tail)
+		    (cddr tail)
+		    (add-card res type data))))
+	#f))) ;; #f means it failed to create the sdat
+
+(define (construct-artifact . alldat)
+  (add-z-card
+   (apply construct-sdat alldat)))
+
+;;======================================================================
+;; CONVERTERS
+;;======================================================================
+
+(define (artifact->sdat artifact)
+  (map unescape-data (string-split artifact "\n")))
+
+;; given a pure artifact return an alist
+;;
+(define (artifact->alist artifact #!key (artifactspec #f))
+  (let ((sdat (cond
+	       ((string? artifact)  (artifact->sdat artifact))
+	       ((list? artifact)    artifact)
+	       (else #f))))
+    (if artifact
+	(if artifactspec
+	    (dartifact->alist (list (cons 'aartifact (sdat->alist sdat))) artifactspec)
+	    (sdat->alist sdat))
+	#f)))
+
+;; convert an alist to an sdat
+;;  in: '((a . "blah")(b . "foo"))
+;; out: '("a blah" "b foo")
+;;
+(define (alist->sdat adat)
+  (map (lambda (dat)
+	 (conc (car dat) " " (cdr dat)))
+       adat))
+
+;; adat is the incoming alist, aspec is the mapping
+;; from incoming key to the artifact key (usually one
+;; letter to keep data tight) see the artifactspec at the
+;; top of this file
+;;
+;; NOTE: alists can contain multiple instances of the same key (supported fine by artifacts)
+;;       but you (obviously I suppose) cannot use alist-ref to access those entries.
+;;
+(define (alist->artifact adat aspec #!key (ptype #f)(no-d #f))
+  (let* ((artifact-type (or ptype
+			    (alist-ref 'T adat) ;; can provide in the incoming alist
+			    #f))
+	 (artifact-spec (if artifact-type            ;; alist of external-key -> key
+			    (or (alist-ref artifact-type aspec) '())
+			    (if (null? aspec)
+				'()
+				(cdar aspec)))) ;; default to first one if nothing specified
+	 (new-alist (map (lambda (dat)
+			   (let* ((key    (car dat))
+				  (val    (cdr dat))
+				  (newkey (or (alist-ref key artifact-spec)
+					      key)))
+			     (cons newkey (escape-data (conc val))))) ;; convert all incoming data (symbols, numbers etc.) to a string and then escape newlines.
+			 adat))
+	 (new-with-type (if (alist-ref 'T new-alist)
+			    new-alist
+			    (cons `(T . ,artifact-type) new-alist)))
+	 (with-d-card   (if (or no-d ;; no timestamp wanted
+				(alist-ref 'D new-with-type))
+			    new-with-type
+			    (cons `(D . ,(current-seconds))
+				  new-with-type))))
+    (add-z-card
+     (alist->sdat with-d-card))))
+
+;;======================================================================
+;;  D B   Q U E U E   I N T E R F A C E
+;;======================================================================
+
+;; artifacts (
+;;   id SERIAL PRIMARY KEY,
+;;   uuid TEXT NOT NULL,
+;;   parent_uuid TEXT default '',
+;;   artifact_type INTEGER DEFAULT 0,
+;;   group_id INTEGER NOT NULL,
+;;   artifact TEXT NOT NULL
+
+;; schema is list of SQL statements - can be used to extend db with more tables
+;;
+(define (open-queue-db dbpath dbfile #!key (schema '()))
+  (let* ((dbfname  (conc dbpath "/" dbfile))
+	 (dbexists (if (file-exists? dbfname) #t (begin (create-directory dbpath #t) #f)))
+	 (db       (open-database dbfname)))
+    ;; (set-busy-handler! (dbi:db-conn db) (busy-timeout 10000))
+    (if (not dbexists) ;; NOTE: In the archive we allow duplicates and other messiness. 
+	(for-each
+	 (lambda (stmt)
+	   (execute db stmt))
+	 (cons "CREATE TABLE IF NOT EXISTS artifacts
+                          (id           INTEGER PRIMARY KEY,
+                           group_id     INTEGER NOT NULL,
+                           uuid         TEXT NOT NULL,
+                           parent_uuid  TEXT TEXT DEFAULT '',
+                           artifact_type     TEXT NOT NULL,
+                           artifact          TEXT NOT NULL,
+                           processed    INTEGER DEFAULT 0)"
+		   schema))) ;; 0=not processed, 1=processed, 2... for expansion
+    db))
+
+(define (add-to-queue db artifact uuid artifact-type parent-uuid group-id)
+  (execute db "INSERT INTO artifacts (uuid,parent_uuid,artifact_type,artifact,group_id)
+                   VALUES(?,?,?,?,?);" ;; $1,$2,$3,$4,$5);"
+	    uuid
+	    (if parent-uuid parent-uuid "");; use null string as placeholder for no parent uuid.
+	    (if artifact-type (conc artifact-type) "") 
+	    artifact
+	    group-id))
+
+;; given all needed parameters create a artifact and store it in the queue
+;;  procs is an alist that maps artifact-type to a function that takes a list of artifact params
+;;  in data and returns the uuid and artifact
+;;
+(define (create-and-queue conn procs artifact-type parent-uuid group-id data)
+  (let ((proc (alist-ref artifact-type procs)))
+    (if proc
+	(let-values (( (uuid artifact) (proc data) ))
+	  (add-to-queue conn artifact uuid artifact-type parent-uuid group-id)
+	  uuid)
+	#f)))
+
+;; given uuid get artifact, if group-id is specified use it (reduces probablity of
+;;     being messed up by a uuid collision)
+;;
+(define (lookup-by-uuid db artifact-uuid group-id)
+  (if group-id
+      (get-one db "SELECT artifact FROM artifacts WHERE group_id=? AND uuid=?;" group-id artifact-uuid)
+      (get-one db "SELECT artifact FROM artifacts WHERE uuid=?;" artifact-uuid)))
+      
+;; find a packet by its id
+;;
+(define (lookup-by-id db id)
+  (get-one db "SELECT artifact FROM artifacts WHERE id=?;" id))
+
+
+;;======================================================================
+;;  P R O C E S S   P K T S
+;;======================================================================
+
+;; given a list of field values pulled from the queue db generate a list
+;; of dartifact's
+;;
+(define (dblst->dartifacts lst . altmap)
+  (let* ((maplst (if (null? altmap)
+		     '(id group-id uuid parent artifact-type artifact processed)
+		     altmap))
+	 (res (map cons maplst lst))) ;; produces list of pairs, i.e an alist
+    (cons `(aartifact . ,(artifact->alist (alist-ref 'artifact res)))
+	  res)))
+
+;; NB// ptypes is a list of symbols, '() or #f find all types
+;;
+(define (get-dartifacts db ptypes group-id parent-uuid #!key (uuid #f))
+  (let* ((ptype-qry (if (and ptypes
+			     (not (null? ptypes)))
+			(conc " IN ('" (string-intersperse (map conc ptypes) "','") "')")
+			(conc " LIKE '%' ")))
+	 (rows      (get-rows
+		     db
+		     (conc
+		      "SELECT id,group_id,uuid,parent_uuid,artifact_type,artifact,processed FROM artifacts
+                         WHERE artifact_type " ptype-qry " AND group_id=?
+                         AND processed=0 "
+			 (if parent-uuid (conc "AND parent_uuid='" parent-uuid "' ") "")
+			 (if uuid        (conc "AND        uuid='"        uuid "' ") "")
+			 "ORDER BY id DESC;")
+		     group-id)))
+    (map dblst->dartifacts (map vector->list rows))))
+
+;; get N artifacts not yet processed for group-id
+;;
+(define (get-not-processed-artifacts db group-id artifact-type limit offset)
+  (map dblst->dartifacts
+       (map vector->list
+	    (get-rows
+	     db
+	     "SELECT id,group_id,uuid,parent_uuid,artifact_type,artifact,processed FROM artifacts
+                WHERE artifact_type = ? AND group_id = ? AND processed=0
+                LIMIT ? OFFSET ?;"
+	     (conc artifact-type) ;; convert symbols to string
+	     group-id
+	     limit
+	     offset
+	     ))))
+
+;; given a uuid, get not processed child artifacts 
+;;
+(define (get-related db group-id uuid)
+  (map dblst->dartifacts
+       (get-rows
+	db
+	"SELECT id,group_id,uuid,parent_uuid,artifact_type,artifact,processed FROM artifacts
+           WHERE parent_uuid=? AND group_id=? AND processed=0;"
+	uuid group-id)))
+
+;; generic artifact processor
+;;
+;; find all packets in group-id of type in ptypes and apply proc to artifactdat
+;;
+(define (process-artifacts conn group-id ptypes parent-uuid proc)
+  (let* ((artifacts (get-dartifacts conn ptypes group-id parent-uuid)))
+    (map proc artifacts)))
+
+;; criteria is an alist ((k . valpatt) ...)
+;;   - valpatt is a regex
+;;   - ptypes is a list of types (symbols expected)
+;;   match-type: 'any or 'all
+;;
+(define (find-artifacts db ptypes criteria #!key (processed #f)(match-type 'any)(artifact-spec #f)) ;; processed=#f, don't use, else use
+  (let* ((artifacts (get-dartifacts db ptypes 0 #f))
+	 (match-rules (lambda (artifactdat) ;; returns a list of matching rules
+			(filter (lambda (c)
+				  ;; (print "c: " c)
+				  (let* ((ctype (car c)) ;; card type
+					 (rx    (cdr c)) ;; card pattern
+					 ;; (t     (alist-ref 'artifact-type artifactdat))
+					 (artifact   (alist-ref 'artifact artifactdat))
+					 (aartifact  (artifact->alist artifact))
+					 (cdat  (alist-ref ctype aartifact)))
+				    ;; (print "cdat: " cdat) ;; " aartifact: " aartifact)
+				    (if cdat
+					(string-match rx cdat)
+					#f)))
+				criteria)))
+	 (res         (filter (lambda (artifactdat)
+				(if (null? criteria) ;; looking for all artifacts
+				    #t
+				    (case match-type
+				      ((any)(not (null? (match-rules artifactdat))))
+				      ((all)(eq? (length (match-rules artifactdat))(length criteria)))
+				      (else
+				       (print "ERROR: bad match type " match-type ", expecting any or all.")))))
+			      artifacts)))
+    (if artifact-spec
+	(dartifacts->alists res artifact-spec)
+	res)))
+
+;; get descendents of parent-uuid
+;;
+;; NOTE: Should be doing something like the following:
+;;
+;; given a uuid, get not processed child artifacts 
+;; processed:
+;;    #f => get all
+;;     0 => get not processed
+;;     1 => get processed
+;;
+(define (get-ancestors db group-id uuid #!key (processed #f))
+  (map dblst->dartifacts
+       (map vector->list
+	    (get-rows
+	     db
+	     (conc
+	      "SELECT id,group_id,uuid,parent_uuid,artifact_type,artifact,processed 
+                FROM artifacts
+                 WHERE uuid IN 
+                     (WITH RECURSIVE
+                       tree(uuid,parent_uuid)
+                        AS
+                        (
+                           SELECT uuid, parent_uuid
+                           FROM artifacts
+                           WHERE uuid = ?
+                           UNION ALL
+                           SELECT t.uuid, t.parent_uuid
+                           FROM artifacts t
+                           JOIN tree ON t.uuid = tree.parent_uuid
+                        )
+	              SELECT uuid FROM tree)
+	    AND group_id=?" (if processed (conc " AND processed=" processed) "") ";")
+	     uuid group-id))))
+
+;; Untested
+;;
+(define (get-descendents db group-id uuid #!key (processed #f))
+  (map dblst->dartifacts
+       (map vector->list
+	    (get-rows
+	     db
+	     (conc
+	      "SELECT id,group_id,uuid,parent_uuid,artifact_type,artifact,processed 
+                FROM artifacts
+                 WHERE uuid IN 
+                     (WITH RECURSIVE
+                       tree(uuid,parent_uuid)
+                        AS
+                        (
+                           SELECT uuid, parent_uuid
+                           FROM artifacts
+                           WHERE uuid = ?
+                           UNION ALL
+                           SELECT t.uuid, t.parent_uuid
+                           FROM artifacts t
+                           JOIN tree ON t.parent_uuid = tree.uuid
+                        )
+	              SELECT uuid FROM tree)
+	    AND group_id=?" (if processed (conc " AND processed=" processed) "") ";")
+	     uuid group-id))))
+
+;; look up descendents based on given info unless passed in a list via inlst
+;;
+;; (define (get-last-descendent db group-id uuid #!key (processed #f)(inlst #f))
+;;   (let ((descendents (or inlst (get-descendents db group-id uuid processed: processed))))
+;;     (if (null? descendents)
+;; 	#f
+;; 	(last descendents))))
+
+;;======================================================================
+;;  A R C H I V E S - always to a sqlite3 db 
+;;======================================================================
+
+;; open an archive db
+;; path: archive-dir/<year>/month.db
+;;
+#;(define (archive-open-db archive-dir)
+  (let* ((curr-time (seconds->local-time (current-seconds)))
+	 (dbpath    (conc archive-dir "/" (time->string curr-time "%Y")))
+	 (dbfile    (conc dbpath "/" (time->string curr-time "%m") ".db"))
+	 (dbexists (if (file-exists? dbfile) #t (begin (create-directory dbpath #t) #f))))
+    (let ((db (open-database dbfile)))
+      ;; (set-busy-handler! db (busy-timeout 10000))
+      (if (not dbexists) ;; NOTE: In the archive we allow duplicates and other messiness. 
+	  (execute db "CREATE TABLE IF NOT EXISTS artifacts
+                          (id           INTEGER,
+                           group_id     INTEGER,
+                           uuid         TEXT,
+                           parent_uuid  TEXT,
+                           artifact_type     TEXT,
+                           artifact          TEXT,
+                           processed    INTEGER DEFAULT 0)"))
+      db)))
+
+;; turn on transactions! otherwise this will be painfully slow
+;;
+#;(define (write-archive-artifacts src-db db artifact-ids)
+  (let ((artifacts (get-rows
+	       src-db
+	       (conc "SELECT id,group_id,uuid,parent_uuid,artifact_type,artifact FROM artifacts WHERE id IN ("
+		     (string-intersperse (map conc artifact-ids) ",") ")"))))
+    ;; (dbi:with-transaction
+    ;;  db
+     (lambda ()
+       (for-each
+	(lambda (artifact)
+	  (apply execute  db "INSERT INTO artifacts (id,group_id,uuid,parent_uuid,artifact_type,artifact)
+                               VALUES (?,?,?,?,?,?)"
+		 artifact))
+	artifacts)))) ;; )
+
+;; given a list of uuids and lists of uuids move all to
+;; the sqlite3 db for the current archive period
+;;
+#;(define (archive-artifacts conn artifact-ids archive-dir)
+  (let ((db (archive-open-db archive-dir)))
+    (write-archive-artifacts conn db artifact-ids)
+    (finalize! db))
+  ;; (pg:with-transaction
+  ;;  conn
+  ;; (lambda ()
+     (for-each
+      (lambda (id)
+	(get-one
+	 conn
+	 "DELETE FROM artifacts WHERE id=?" id))
+      artifact-ids)) ;; ))
+
+;; given a list of ids mark all as processed
+;;
+(define (mark-processed conn artifact-ids)
+  ;; (pg:with-transaction
+   ;; conn
+   ;; (lambda ()
+     (for-each
+      (lambda (id)
+	(get-one
+	 conn
+	 "UPDATE artifacts SET processed=1 WHERE id=?;" id))
+      artifact-ids)) ;; x))
+
+;; a generic artifact getter, gets from the artifacts db
+;;
+(define (get-artifacts conn ptypes)
+  (let* ((ptypes-str    (if (null? ptypes)
+			    ""
+			    (conc " WHERE artifact_type IN ('" (string-intersperse ptypes ",") "') ")))
+	 (qry-str       (conc "SELECT id,group_id,uuid,parent_uuid,artifact_type,artifact,processed FROM artifacts" ptypes-str)))
+    (map vector->list (get-rows conn qry-str))))
+
+;; make a report of the artifacts in the db
+;; ptypes of '() gets all artifacts
+;; display-fields
+;;
+(define (make-report dest conn artifactspec display-fields . ptypes)
+  (let* (;; (conn          (dbi:db-conn (s:db)))
+	 (all-rows      (get-artifacts conn ptypes))
+	 (all-artifacts      (flatten-all
+			 all-rows
+			 artifactspec
+			 'id 'group-id 'uuid 'parent 'artifact-type 'artifact 'processed))
+	 (by-uuid       (let ((ht (make-hash-table)))
+			  (for-each
+			   (lambda (artifact)
+			     (let ((uuid (alist-ref 'uuid artifact)))
+			       (hash-table-set! ht uuid artifact)))
+			   all-artifacts)
+			  ht))
+	 (by-parent     (let ((ht (make-hash-table)))
+			  (for-each
+			   (lambda (artifact)
+			     (let ((parent (alist-ref 'parent artifact)))
+			       (hash-table-set! ht parent (cons artifact (hash-table-ref/default ht parent '())))))
+			   all-artifacts)
+			    ht))
+	 (oup           (if dest (open-output-file dest) (current-output-port))))
+    
+    (with-output-to-port
+	oup
+      (lambda ()
+	(print "digraph megatest_state_status {
+  // ranksep=0.05
+  rankdir=LR;
+  node [shape=\"box\"];
+")
+	;; first all the names
+	(for-each
+	 (lambda (artifact)
+	   (let* ((uuid        (alist-ref 'uuid artifact))
+		  (shortuuid   (substring uuid 0 4))
+		  (type        (alist-ref 'artifact-type artifact))
+		  (processed   (alist-ref 'processed artifact)))
+	     
+	     (print "\"" uuid "\" [label=\"" shortuuid ", ("
+		    type ", "
+		    (if processed "processed" "not processed") ")")
+	     (for-each
+	      (lambda (key-field)
+		(let ((val (alist-ref key-field artifact)))
+		  (if val
+		      (print key-field "=" val))))
+	      display-fields)
+	     (print "\" ];")))
+	 all-artifacts)
+	;; now for parent-child relationships
+	(for-each
+	 (lambda (artifact)
+	   (let ((uuid   (alist-ref 'uuid artifact))
+		 (parent (alist-ref 'parent artifact)))
+	     (if (not (equal? parent ""))
+		 (print "\"" parent "\" -> \"" uuid"\";"))))
+	 all-artifacts)
+
+	(print "}")
+	))
+    (if dest
+	(begin
+	  (close-output-port oup)
+	  (system "dot -Tpdf out.dot -o out.pdf")))
+    
+    ))
+
+;;======================================================================
+;; Read ref artifacts into a vector < laststr hash table > 
+;;======================================================================
+
+
+
+;;======================================================================
+;; Read/write packets to files (convience functions)
+;;======================================================================
+
+;; write alist to a artifact file
+;;
+(define (write-alist->artifact targdir dat #!key (artifactspec '())(ptype #f))
+  (let-values (((uuid artifact)(alist->artifact dat artifactspec ptype: ptype)))
+    (with-output-to-file (conc targdir "/" uuid ".artifact")
+      (lambda ()
+	(print artifact)))
+    uuid)) ;; return the uuid
+
+;; read artifact into alist
+;;
+(define (read-artifact->alist artifact-file #!key (artifactspec #f))
+  (artifact->alist (with-input-from-file
+		  artifact-file
+		read-string)
+	      artifactspec: artifactspec))
+
+;;======================================================================
+;; File utils, stuff useful for file management
+;;======================================================================
+
+(define (file-get-sha1 fname)
+  (let* ((sha1-res (run/strings (sha1sum ,fname))))
+    (car (string-split (car sha1-res)))))
+
+(define (link-or-copy srcf destf)
+  (or (handle-exceptions
+       exn
+       #f
+       (file-link srcf destf))
+      (if (file-exists? destf)
+	  (print "NOTE: destination already exists, skipping copy.")
+	  (copy-file srcf destf))))
+
+;; (define (files-diff file1 file2)
+;;   (let* ((diff-res (with-input-from-port
+;; 		       (run/port (diff "-q" ,file1 ,file2))
+;; 		     (lambda ()
+;; 		       (let* ((res (read-line)))
+;; 			 (read-lines)
+;; 			 res)))))
+;;     (car (string-split sha1-res))))
+;; 
+
+
+(define (check-same file1 file2)
+  (cond
+   ((not (and (file-exists? file1)(file-exists? file2))) #f)
+   ((not (equal? (file-size file1)(file-size file2))) #f)
+   (else
+    (let-values (((status run-ok process-id)
+		  (run (diff "-q" ,file1 ,file2))))
+      status))))
+
+(define *pcache* (make-hash-table))
+(define (get-device dir)
+  (let ((indat (or (hash-table-ref/default *pcache* dir #f)
+		   (let* ((inp (open-input-pipe (conc "df --output=source \""dir"\"")))
+			  (res (read-lines inp)))
+		     (close-input-port inp)
+		     (hash-table-set! *pcache* dir res)
+		     res))))
+    (cadr indat)))
+
+(define (same-partition? dir1 dir2)
+  (equal? (get-device dir1)(get-device dir2)))
+
+(define (link-if-same-partition file1 file2)
+  (let* ((dir1 (pathname-directory file1))
+	 (dir2 (pathname-directory file2))
+	 (f1   (pathname-file file1))
+	 (f2   (pathname-file file2)))
+    (if (same-partition? dir1 dir2)
+	(let* ((tmpname (conc "."f2"-"(current-seconds))))
+	  ;; this steps needs to be executed as actual user    
+	  (move-file file2 (conc dir1 "/" tmpname))
+	  (file-link file1 file2)
+	  (delete-file (conc dir1 "/" tmpname))))))
+
+(define (uuid-first-two-letters sha1sum)
+  (substring sha1sum 0 2))
+
+(define (uuid-remaining-letters sha1sum)
+  (let ((slen (string-length sha1sum)))
+    (substring sha1sum 2 slen)))
+
+(define (archive-dest destd sha1sum)
+  (let* ((subdir         (uuid-first-two-letters sha1sum)) ;; (substring sha1sum 0 2))
+	 ;; (slen           (string-length sha1sum))
+	 (rem            sha1sum #;(uuid-remaining-letters sha1sum)) ;; (substring sha1sum 3 slen))
+	 (full-dest-dir  (conc destd"/"subdir))
+	 (full-dest-file (conc full-dest-dir"/"rem)))
+    (if (not (directory-exists? full-dest-dir))
+	(create-directory full-dest-dir #t))
+    full-dest-file))
+
+(define (write-to-archive data destd #!optional (nextnum #f))
+  (let* ((sha1sum    (calc-sha1 data))
+	 (full-dest  (conc (archive-dest destd sha1sum)
+			   (if nextnum (conc "."nextnum) ""))))
+    (if (file-exists? full-dest)
+	(if (equal? (string-intersperse (with-input-from-file full-dest read-lines) "\n")
+		    data)
+	    (begin
+	      ;; (print "INFO: data already exists in "full-dest" and is identical")
+	      sha1sum)
+	    (let ((nextnum (if nextnum (+ nextnum 1) 0)))
+	      (print "WARN: data already exists in "full-dest" but is different! Trying again...")
+	      (write-to-archive data destd nextnum)))
+	(begin
+	  (with-output-to-file
+	      full-dest
+	    (lambda ()
+	      (print data)))
+	  sha1sum)))) ;; BUG? Does print munge data?
+
+;; copy srcf with sha1sum aabc... to aa/bc...
+;;
+(define (archive-copy srcf destd sha1sum)
+  (let* ((full-dest-file (archive-dest destd sha1sum)))
+    (let loop ((trynum 0))
+      (let ((dest-name (if (> trynum 0)
+			   (conc full-dest-file"-"trynum)
+			   full-dest-file)))
+	(cond
+	 ((not (file-exists? srcf)) #f) ;; this should be an error?
+	 ((and (file-exists? srcf)
+	       (file-exists? dest-name))
+	  (if (check-same srcf dest-name)
+	      (link-if-same-partition dest-name srcf)
+	      (loop (+ trynum 1)))) ;; collisions are rare, this protects against them
+	 ((not (file-exists? dest-name))
+	  (link-or-copy srcf dest-name))
+	 (else #f))))))
+
+;; multi-glob
+(define (multi-glob globstrs inpath)
+  ;; (print "multi-glob: "globstrs", "inpath)
+  (if (equal? inpath "")
+      globstrs
+      (let* ((parts     (string-split inpath "/" #t))
+	     (nextpart  (car parts))
+	     (remaining (string-intersperse (cdr parts) "/")))
+	(if (and (equal? nextpart "") ;; this must be a leading / meaning root directory
+	         (null? globstrs))
+	    (multi-glob '("/") remaining)
+	    (begin
+	      ;; (print "nextpart="nextpart", remaining="remaining)
+	      (apply append
+              (map (lambda (gstr)
+                      (let* ((pathstr  (conc gstr"/"nextpart))
+		   	     (pathstrs (glob pathstr)))
+		          ;; (print "pathstr="pathstr)
+		          (multi-glob pathstrs remaining)))
+	               globstrs)))))))
+	     
+
+;; perm[/user:group]:
+;;   DDD - octal perm (future expansion)
+;;   -   - use umask/defacto perms (i.e. don't actively do anything)
+;;   x   - mark as executable
+;;
+;; Cards:
+;;   file:      f perm fname
+;;   directory: d perm fname artifactid
+;;   link:      l perm lname destpath
+;;
+;;   NOTE: cards are kept as (C . "value")
+;;
+;; given a directory path, ignore list and artifact store (hash-table):
+;;  1. create sha1 tree at dest (e.g. aa/b3a7 ...)
+;;  2. create artifact for each dir
+;;    - cards for all files
+;;    - cards for files that are symlinks or executables
+;;  3. return (artifactid . artifact)
+;;
+;; NOTES:
+;;   Use destdir of #f to not create sha1 tree
+;;   Hard links will be used if srcdir and destdir appear to be same partion
+;;
+;; (alist->artifact adat aspec #!key (ptype #f))
+;;
+;;
+;; (load "../../artifacts/artifacts.scm")(import big-chicken srfi-69 artifacts)(define dirdat (make-hash-table))
+;; (capture-dir ".." ".." "/tmp/junk" '() dirdat)
+;;
+;; [procedure] (file-type FILE [LINK [ERROR]])
+;; Returns the file-type for FILE, which should be a filename, a file-descriptor or a port object. If LINK is given and true, symbolic-links are not followed:
+;; 
+;;  regular-file
+;;  directory
+;;  fifo
+;;  socket
+;;  symbolic-link
+;;  character-device
+;;  block-device
+;; Note that not all types are supported on every platform. If ERROR is given and false, then file-type returns #f if the file does not exist; otherwise, it signals an error.
+;; 
+;; 
+(define (capture-dir curr-dir src-dir dest-dir ignore-list artifacts all-seen)
+  (let* ((dir-dat (directory-fold
+		   (lambda (fname res) ;; res is a list of artifact cards
+		     (let* ((fullname   (conc curr-dir"/"fname)))
+		       ;; (print "INFO: processing "fullname)
+		       (if (hash-table-ref/default all-seen fullname #f) ;; something circular going on
+			   (begin
+			     (print "WARNING: possible circular link(s) "fullname)
+			     res)
+			   (let* ((ftype (file-type fullname #t #f)))
+			     (hash-table-set! all-seen fullname ftype)
+			     (cons
+			      (case ftype ;; get the card
+			       ((directory) ;; (directory? fullname)
+				(let* ((new-curr-dir (conc curr-dir"/"fname))
+				       (new-src-dir  (conc src-dir"/"fname)))
+				  (let* ((dir-dat (capture-dir new-curr-dir new-src-dir
+							       dest-dir ignore-list artifacts all-seen))
+					 (a-id (car dir-dat))
+					 (artf (cdr dir-dat)))
+				    (hash-table-set! artifacts a-id artf)
+				    (cons 'd (conc "- "a-id" "fname))))) ;; the card
+			       ((symbolic-link) ;; (symbolic-link? fullname)
+				(let ((ldest (read-symbolic-link fullname)))
+				  (cons 'l (conc "- "fname"/"ldest)))) ;; delimit link name from dest with /
+			       ((regular-file) ;; must be a file
+				(let* ((start      (current-seconds))
+				       (sha1sum    (file-get-sha1 fullname))
+				       (perms      (if (file-executable? fullname) "x" "-")))
+				  (let ((runtime (- (current-seconds) start)))
+				    (if (> runtime 1)
+					(print "INFO: file "fullname" took "runtime" seconds to calculate sha1.")))
+				  (if dest-dir
+				      (archive-copy fullname dest-dir sha1sum))
+				  (cons 'f (conc perms " "sha1sum" "fname))))
+			       (else
+				(print "WARNING: file "fullname" of type "ftype" is NOT supported and will converted to empty file.")
+				(let* ((sha1sum (write-to-archive "" dest-dir)))
+				  (cons 'f (conc "- "sha1sum" "fname)))))
+			      res)))))
+		       '() src-dir #:dotfiles? #t))) ;; => (values srcdir_artifact sub_artifacts_list)
+    ;; (print "dir-dat: " dir-dat)
+    (let-values (((a-id artf)
+		  (alist->artifact dir-dat '() ptype: 'd no-d: #t)))
+      (hash-table-set! artifacts a-id artf)
+      (cons a-id artf))))
+
+;; maybe move this into artifacts?
+;;
+;; currently moves *.artifact into a bundle and moves the artifacts into attic
+;; future: move artifacts under 1 meg in size into bundle up to 10 meg in size
+;;
+(define (artifact-rollup bundle-dir) ;; cfg storepath)
+  ;; (let* ((bundle-dir (calc-bundle-dir cfg storepath)))
+    (let* ((bundles   (glob (conc bundle-dir"/*.bundle")))
+	   (artifacts (glob (conc bundle-dir"/*.artifact"))))
+      (if (> (length artifacts) 30) ;; rollup only if > 30 artifacts
+	  ;; if we have unbundled artifacts, bundle them
+	  (let* ((ht     (read-artifacts-into-hash #f artifacts: artifacts))
+		 (bundle (hash-of-artifacts->bundle ht)))
+	    (write-bundle bundle bundle-dir)
+	    (create-directory (conc bundle-dir"/attic") #t)
+	    (for-each
+	     (lambda (full-fname)
+	       (let* ((fname   (pathname-strip-directory full-fname))
+		      (newname (conc bundle-dir"/attic/"fname)))
+		 (move-file full-fname newname #t)))
+	     artifacts)
+	    (conc "bundled "(length artifacts)))
+	  "not enough artifacts to bundle")))
+
+;; if destfile is a directory then calculate the sha1sum of the bundle and store it
+;; by <sha1sum>.bundle
+;;
+;; incoming dat is pure text (bundle already sorted and appended:
+;;
+(define (write-bundle bdl-data destdir)
+  (let* ((bdl-uuid  (calc-sha1 bdl-data)))
+    (with-output-to-file
+	(conc destdir"/"bdl-uuid".bundle")
+      (lambda ()
+	(print bdl-data)))))
+
+;; minimal (and hopefully fast) artifact reader
+;; TODO: Add check of shar sum.
+;;
+(define (minimal-artifact-read fname)
+  (let* ((indat (with-input-from-file fname read-lines)))
+    (if (null? indat)
+	(values #f (conc "did not find an artifact in "fname))
+	(let* ((zcard (last indat))
+	       (cardk (substring zcard 0 1))
+	       (cardv (substring zcard 2 (string-length zcard))))
+	  (if (equal? cardk "Z")
+	      (values cardv (string-intersperse indat "\n"))
+	      (values #f (conc fname" is not a valid artifact")))))))
+
+;; read artifacts from directory into hash
+;; NOTE: support for max-count not implemented yet
+;;
+(define (read-artifacts-into-hash dir #!key (artifacts #f) (max-count #f)(ht #f))
+  (let* ((artifacts (or artifacts
+			(glob (conc dir"/*.artifact"))))
+	 (ht        (or ht (make-hash-table))))
+    (for-each
+     (lambda (fname)
+       (let-values (((uuid afct)
+		     (minimal-artifact-read fname)))
+	 (hash-table-set! ht uuid afct)))
+     artifacts)
+    ht))
+
+;; ht is:
+;;   uuid => artifact text
+;; use write-bundle to put result into a bundle file
+;;
+(define (hash-of-artifacts->bundle ht)    
+  (fold (lambda (k res)
+	  (let* ((v (hash-table-ref ht k)))
+	    (if res
+		(conc res"\n"v)
+		v)))
+	#f
+	(sort (hash-table-keys ht) string<=?)))
+
+;; minimal artifact to alist
+;;
+(define (minimal-artifact->alist afact)
+  (let* ((lines   (string-split afact "\n")))
+    (map (lambda (a)
+	   (let* ((key (string->symbol (substring a 0 1)))
+		  (sl  (string-length a))
+		  (val (if (> sl 2)
+			   (substring a 2 sl)
+			   "")))
+	     (cons key val)))
+	 lines)))
+
+;; some accessors for common cards
+(define (afact-get-D afact)
+  (let ((dval (alist-ref 'D afact)))
+    (if dval
+	(string->number dval)
+	#f)))
+
+(define (afact-get-T afact) ;; get the artifact type as a symbol
+  (let ((val (alist-ref 'T afact)))
+    (if val
+	(string->symbol val)
+	val)))
+
+(define (afact-get-Z afact)
+  (alist-ref 'Z afact))
+
+(define (afact-get afact key default)
+  (or (alist-ref key afact)
+      default))
+
+(define (afact-get-number/default afact key default)
+  (let ((val (alist-ref key afact)))
+    (if val
+	(or (string->number val) default) ;; seems wrong
+	default)))
+
+;; bundles are never big and reading into memory for processing is fine
+;;
+(define (read-bundle srcfile #!optional (mode 'uuid-raw))
+  (let* ((indat (with-input-from-file srcfile read-lines)))
+    (let loop ((tail indat)
+	       (dat '())  ;; artifact being extracted
+	       (res '())) ;; list of artifacts
+      (if (null? tail)
+	  (reverse res) ;; last dat should be empty list
+	  (let* ((curr-line (car tail)))
+	    (let-values (((ctype cdata)
+			  (card->type/value curr-line)))
+	      (let* ((is-z-card (eq? 'Z ctype))
+		     (new-dat   (cons (case mode
+					((uuid-raw) curr-line)
+					(else       (cons ctype cdata)))
+				      dat)))
+		(if is-z-card
+		    (loop (cdr tail) ;; done with this artifact
+			  '()
+			  (cons (case mode
+				  ((uuid-raw) (cons cdata (string-intersperse (reverse new-dat) "\n")))
+				  (else       (reverse new-dat)))
+				res))
+		    (loop (cdr tail)
+			  new-dat
+			  res)))))))))
+
+
+;; find all .bundle and .artifacts files in bundle-dir
+;; and inport them into sqlite handle adb
+;; 
+(define (refresh-artifacts-db adb bundle-dir)
+  (let* ((bundles   (glob (conc bundle-dir"/*.bundle")))
+	 (artifacts (glob (conc bundle-dir"/*.artifact")))
+	 (uuids     (get-all-uuids adb 'hash)))
+    (with-transaction
+     adb
+     (lambda ()
+       (for-each
+	(lambda (bundle-file)
+	  ;; (print "Importing artifacts from "bundle-file)
+	  (let* ((bdat (read-bundle bundle-file 'uuid-raw))
+		 (count 0)
+		 (inc   (lambda ()(set! count (+ count 1)))))
+	    (for-each
+	     (lambda (adat)
+	       (match
+		adat
+		((zval . artifact)
+		 (if (not (hash-table-exists? uuids zval))
+		     (begin
+		       ;; (print "INFO: importing new artifact "zval" from bundle "bundle-file)
+		       (inc)
+		       (execute adb "INSERT INTO artifacts (uuid, artifact) VALUES (?,?);"
+				zval artifact)
+		       (hash-table-set! uuids zval #t))))
+		(else
+		 (print "ERROR: Bad artifact data "adat))))
+	     bdat)
+	    (print "INFO: imported "count" artifacts from "bundle-file)))
+	bundles)
+       (for-each
+	(lambda (artifact-file)
+	  ;; (print "Importing artifact from "artifact-file)
+	  (let-values (((uuid artifact) (minimal-artifact-read artifact-file)))
+	    (if uuid
+		(if (not (hash-table-exists? uuids uuid))
+		    (begin
+		      ;; (print "INFO: importing new artifact "uuid" from "artifact-file)
+		      (execute adb "INSERT INTO artifacts (uuid, artifact) VALUES (?,?);"
+			       uuid artifact)
+		      (hash-table-set! uuids uuid #t)))
+		(print "Bad artifact in "artifact-file))))
+	artifacts)))))
+
+;;======================================================================
+;;  Artifacts db cache
+;;======================================================================
+
+;; artifacts
+;;   id SERIAL PRIMARY KEY,
+;;   uuid TEXT NOT NULL,
+;;   artifact TEXT NOT NULL
+;;
+;; parents
+;;   id INTEGER REFERENCES artids.id,  -- 
+;;   parent_id  REFERENCES artids.id
+;;
+;; schema is list of SQL statements - can be used to extend db with more tables
+;;
+(define (open-artifacts-db dbpath dbfile #!key (schema '()))
+  (let* ((dbfname  (conc dbpath "/" dbfile))
+	 (dbexists (if (file-exists? dbfname) #t (begin (create-directory dbpath #t) #f)))
+	 (adb      (open-database dbfname)))
+    (set-busy-handler! adb (make-busy-timeout 10000))
+    (execute adb "PRAGMA synchronous = 0;")
+    (if (not dbexists)
+	(with-transaction
+	 adb
+	 (lambda ()
+	   (for-each
+	    (lambda (stmt)
+	      (execute adb stmt))
+	    (append `("CREATE TABLE IF NOT EXISTS artifacts
+                          (id           INTEGER PRIMARY KEY,
+                           uuid         TEXT NOT NULL,
+                           artifact     TEXT NOT NULL)"
+		      
+		      "CREATE TABLE IF NOT EXISTS parents
+                          (id INTEGER REFERENCES artifacts(id) NOT NULL,
+                           parent_id INTEGER REFERENCES artifacts(id) NOT NULL)")
+		    schema)))))
+    adb))
+
+(define (generate-year-month-name #!optional (seconds #f))
+  (let* ((curr-time (seconds->local-time (or seconds (current-seconds)))))
+    (time->string curr-time "%Y%m")))
+
+;; I don't like this function. TODO: remove the
+;; mode and option to return ht. Use instead the
+;; get-all-artifacts below
+;;
+(define (get-all-uuids adb #!optional (mode #f))
+  (let* ((res (fold-row
+	       (lambda (res uuid)
+		 (cons uuid res))
+	       '()
+	       adb
+	       "SELECT uuid FROM artifacts;")))
+    (case mode
+      ((hash)
+       (let* ((ht (make-hash-table)))
+	 (for-each
+	  (lambda (uuid)
+	    (hash-table-set! ht uuid #t))
+	  res)
+	 ht))
+      (else res))))
+
+;; returns raw artifacts (i.e. NOT alists but instead plain text)
+(define (get-all-artifacts adb)
+  (let* ((ht  (make-hash-table)))
+    (for-each-row
+     (lambda (id uuid artifact)
+       (hash-table-set! ht uuid `(,id ,uuid ,artifact)))
+     adb
+     "SELECT id,uuid,artifact FROM artifacts;")
+    ht))
+
+;; given a bundle-dir copy or create to /tmp and open
+;; the YYMM.db file and hand the handle to the given proc
+;; NOTE: we operate in /tmp/ to accomodate users on NFS
+;; where slamming Unix locks at an NFS filer can cause
+;; locking fails. Eventually this /tmp behavior will be
+;; configurable.
+;;
+(define (with-todays-adb bundle-dir proc)
+  (let* ((dbname   (conc (generate-year-month-name) ".db"))
+	 (destname (conc bundle-dir"/"dbname))
+	 (tmparea  (conc "/tmp/"(current-user-name)"-"(calc-sha1 bundle-dir)))
+	 (tmpname  (conc tmparea"/"dbname))
+	 (lockfile (conc destname".update-in-progress")))
+    ;; (print "with-todays-adb, bundle-dir: "bundle-dir", dbname: "dbname", destname: "destname",\n    tmparea: " tmparea", lockfile: "lockfile)
+    (if (not (file-exists? tmparea))(create-directory tmparea #t))
+    (let loop ((count 0))
+      (if (file-exists? lockfile)
+	  (if (< count 30) ;; aproximately 30 seconds
+	      (begin
+		(sleep 1)
+		(loop (+ 1 count)))
+	      (print "ERROR: "lockfile" exists, proceeding anyway"))
+	  (if (file-exists? destname)
+	      (begin
+		(copy-file destname tmpname #t)
+		(copy-file destname lockfile #t)))))
+    (let* ((adb  (open-artifacts-db tmparea dbname))
+	   (res  (proc adb)))
+      (finalize! adb)
+      (copy-file tmpname destname #t)
+      (delete-file* lockfile)
+      res)))
+
+) ;; module artifacts
+
+;; ATTIC
+

ADDED   artifacts/artifacts.setup
Index: artifacts/artifacts.setup
==================================================================
--- /dev/null
+++ artifacts/artifacts.setup
@@ -0,0 +1,11 @@
+;; Copyright 2007-2017, Matthew Welland.
+;;
+;;  This program is made available under the GNU GPL version 2.0 or
+;;  greater. See the accompanying file COPYING for details.
+;;
+;;  This program is distributed WITHOUT ANY WARRANTY; without even the
+;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+;;  PURPOSE.
+
+;;;; pkts.setup
+(standard-extension 'pkts "1.0")

ADDED   artifacts/artifactsrec.scm
Index: artifacts/artifactsrec.scm
==================================================================
--- /dev/null
+++ artifacts/artifactsrec.scm
@@ -0,0 +1,196 @@
+(define-syntax define-record-type
+  (syntax-rules ()
+    ((define-record-type type
+       (constructor constructor-tag ...)
+       predicate
+       (field-tag accessor . more) ...)
+     (begin
+       (define type
+         (make-record-type 'type '(field-tag ...)))
+       (define constructor
+         (record-constructor type '(constructor-tag ...)))
+       (define predicate
+         (record-predicate type))
+       (define-record-field type field-tag accessor . more)
+       ...))))
+
+; An auxilliary macro for define field accessors and modifiers.
+; This is needed only because modifiers are optional.
+
+(define-syntax define-record-field
+  (syntax-rules ()
+    ((define-record-field type field-tag accessor)
+     (define accessor (record-accessor type 'field-tag)))
+    ((define-record-field type field-tag accessor modifier)
+     (begin
+       (define accessor (record-accessor type 'field-tag))
+       (define modifier (record-modifier type 'field-tag))))))
+
+; Record types
+
+; We define the following procedures:
+; 
+; (make-record-type <type-name <field-names>)    -> <record-type>
+; (record-constructor <record-type<field-names>) -> <constructor>
+; (record-predicate <record-type>)               -> <predicate>
+; (record-accessor <record-type <field-name>)    -> <accessor>
+; (record-modifier <record-type <field-name>)    -> <modifier>
+;   where
+; (<constructor> <initial-value> ...)         -> <record>
+; (<predicate> <value>)                       -> <boolean>
+; (<accessor> <record>)                       -> <value>
+; (<modifier> <record> <value>)         -> <unspecific>
+
+; Record types are implemented using vector-like records.  The first
+; slot of each record contains the record's type, which is itself a
+; record.
+
+(define (record-type record)
+  (record-ref record 0))
+
+;----------------
+; Record types are themselves records, so we first define the type for
+; them.  Except for problems with circularities, this could be defined as:
+;  (define-record-type :record-type
+;    (make-record-type name field-tags)
+;    record-type?
+;    (name record-type-name)
+;    (field-tags record-type-field-tags))
+; As it is, we need to define everything by hand.
+
+(define :record-type (make-record 3))
+(record-set! :record-type 0 :record-type)	; Its type is itself.
+(record-set! :record-type 1 ':record-type)
+(record-set! :record-type 2 '(name field-tags))
+
+; Now that :record-type exists we can define a procedure for making more
+; record types.
+
+(define (make-record-type name field-tags)
+  (let ((new (make-record 3)))
+    (record-set! new 0 :record-type)
+    (record-set! new 1 name)
+    (record-set! new 2 field-tags)
+    new))
+
+; Accessors for record types.
+
+(define (record-type-name record-type)
+  (record-ref record-type 1))
+
+(define (record-type-field-tags record-type)
+  (record-ref record-type 2))
+
+;----------------
+; A utility for getting the offset of a field within a record.
+
+(define (field-index type tag)
+  (let loop ((i 1) (tags (record-type-field-tags type)))
+    (cond ((null? tags)
+           (error "record type has no such field" type tag))
+          ((eq? tag (car tags))
+           i)
+          (else
+           (loop (+ i 1) (cdr tags))))))
+
+;----------------
+; Now we are ready to define RECORD-CONSTRUCTOR and the rest of the
+; procedures used by the macro expansion of DEFINE-RECORD-TYPE.
+
+(define (record-constructor type tags)
+  (let ((size (length (record-type-field-tags type)))
+        (arg-count (length tags))
+        (indexes (map (lambda (tag)
+                        (field-index type tag))
+                      tags)))
+    (lambda args
+      (if (= (length args)
+             arg-count)
+          (let ((new (make-record (+ size 1))))
+            (record-set! new 0 type)
+            (for-each (lambda (arg i)
+			(record-set! new i arg))
+                      args
+                      indexes)
+            new)
+          (error "wrong number of arguments to constructor" type args)))))
+
+(define (record-predicate type)
+  (lambda (thing)
+    (and (record? thing)
+         (eq? (record-type thing)
+              type))))
+
+(define (record-accessor type tag)
+  (let ((index (field-index type tag)))
+    (lambda (thing)
+      (if (and (record? thing)
+               (eq? (record-type thing)
+                    type))
+          (record-ref thing index)
+          (error "accessor applied to bad value" type tag thing)))))
+
+(define (record-modifier type tag)
+  (let ((index (field-index type tag)))
+    (lambda (thing value)
+      (if (and (record? thing)
+               (eq? (record-type thing)
+                    type))
+          (record-set! thing index value)
+          (error "modifier applied to bad value" type tag thing)))))
+
+Records
+
+; This implements a record abstraction that is identical to vectors,
+; except that they are not vectors (VECTOR? returns false when given a
+; record and RECORD? returns false when given a vector).  The following
+; procedures are provided:
+;   (record? <value>)                -> <boolean>
+;   (make-record <size>)             -> <record>
+;   (record-ref <record> <index>)    -> <value>
+;   (record-set! <record> <index> <value>) -> <unspecific>
+;
+; These can implemented in R5RS Scheme as vectors with a distinguishing
+; value at index zero, providing VECTOR? is redefined to be a procedure
+; that returns false if its argument contains the distinguishing record
+; value.  EVAL is also redefined to use the new value of VECTOR?.
+
+; Define the marker and redefine VECTOR? and EVAL.
+
+(define record-marker (list 'record-marker))
+
+(define real-vector? vector?)
+
+(define (vector? x)
+  (and (real-vector? x)
+       (or (= 0 (vector-length x))
+	   (not (eq? (vector-ref x 0)
+		record-marker)))))
+
+; This won't work if ENV is the interaction environment and someone has
+; redefined LAMBDA there.
+
+(define eval
+  (let ((real-eval eval))
+    (lambda (exp env)
+      ((real-eval `(lambda (vector?) ,exp))
+       vector?))))
+
+; Definitions of the record procedures.
+
+(define (record? x)
+  (and (real-vector? x)
+       (< 0 (vector-length x))
+       (eq? (vector-ref x 0)
+            record-marker)))
+
+(define (make-record size)
+  (let ((new (make-vector (+ size 1))))
+    (vector-set! new 0 record-marker)
+    new))
+
+(define (record-ref record index)
+  (vector-ref record (+ index 1)))
+
+(define (record-set! record index value)
+  (vector-set! record (+ index 1) value))

ADDED   artifacts/tests/run.scm
Index: artifacts/tests/run.scm
==================================================================
--- /dev/null
+++ artifacts/tests/run.scm
@@ -0,0 +1,139 @@
+(use test)
+
+;; (use (prefix pkts pkts:))
+(use pkts (prefix dbi dbi:))
+;; (use trace)(trace sdat->alist pkt->alist)
+
+(if (file-exists? "queue.db")(delete-file "queue.db"))
+
+(test-begin "pkts and pkt archives")
+
+;;======================================================================
+;; Basic pkt creation, parsing and conversion routines
+;;======================================================================
+
+(test-begin "basic packets")
+(test #f '(A "This is a packet") (let-values (((t v)
+					       (card->type/value "A This is a packet")))
+				   (list t v)))
+(test #f "A A\nZ 664449e7299e0065a3e25c138ccef2df13ba291e"
+      (let-values (((uuid res)
+		    (add-z-card '("A A"))))
+	res))
+(test #f '("CC C++" "D 1486332719.0" "a A" "b C")(sort (construct-sdat 'b "C" 'a "A" 'CC "C++" 'D 1486332719.0)
+						       string<=?))
+(define pkt-example #f)
+(test #f "CC C++\nD 1486332719.0\na A\nb C\nZ 263eb3b6193de7fe65b1ded5bcda513e8b4d6b84"
+      (let-values (((uuid res)
+		    (construct-pkt 'b "C" 'a "A" 'CC "C++" 'D 1486332719.0)))
+	(set! pkt-example (cons uuid res))
+	res))
+(test-end "basic packets")
+
+;;======================================================================
+;; Sqlite and postgresql based queue of pkts
+;;======================================================================
+
+(test-begin "pkt queue")
+(define db #f)
+(test #f 'sqlite3 (let ((dbh (open-queue-db "." "queue.db")))
+		    (set! db dbh)
+		    (dbi:db-dbtype dbh)))
+(test #f (cdr pkt-example)
+      (begin
+	(add-to-queue db (cdr pkt-example)(car pkt-example) 'basic #f 0)
+	(lookup-by-uuid db (car pkt-example) 0)))
+(test #f (cdr pkt-example)
+      (lookup-by-id db 1))
+(test #f 1 (length (find-pkts db '(basic) '())))
+
+(test-end "pkt queue")
+
+
+;;======================================================================
+;; Process groups of pkts
+;;======================================================================
+
+(test-begin "lists of packets")
+(test #f '((apkt . #f) (id . 1) (group-id . 2) (uuid . 3) (parent . 4) (pkt-type . 5)) ;; ((id . 1) (group-id . 2) (uuid . 3) (parent . 4) (pkt-type . 5))
+      (dblst->dpkts '(1 2 3 4 5)))
+(test #f '(((apkt (Z . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (b . "C") (a . "A") (D . "1486332719.0") (C . " C++")) (id . 1) (group-id . 0) (uuid . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (parent . "") (pkt-type . "basic") (pkt . "CC C++\nD 1486332719.0\na A\nb C\nZ 263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (processed . 0)))
+      ;; '(((apkt (Z . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (b . "C") (a . "A") (D . "1486332719.0") (C . " C++")) (id . 1) (group-id . 0) (uuid . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (parent . "") (pkt-type . "basic") (pkt . "CC C++\nD 1486332719.0\na A\nb C\nZ 263eb3b6193de7fe65b1ded5bcda513e8b4d6b84")))
+      ;; '(((id . 1) (group-id . 0) (uuid . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (parent . "") (pkt-type . "basic") (pkt . "CC C++\nD 1486332719.0\na A\nb C\nZ 263eb3b6193de7fe65b1ded5bcda513e8b4d6b84")))
+      (get-dpkts db '(basic) 0 #f))
+(test #f '(((apkt (Z . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (b . "C") (a . "A") (D . "1486332719.0") (C . " C++")) (id . 1) (group-id . 0) (uuid . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (parent . "") (pkt-type . "basic") (pkt . "CC C++\nD 1486332719.0\na A\nb C\nZ 263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (processed . 0)))
+      ;; '(((apkt (Z . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (b . "C") (a . "A") (D . "1486332719.0") (C . " C++")) (id . 1) (group-id . 0) (uuid . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (parent . "") (pkt-type . "basic") (pkt . "CC C++\nD 1486332719.0\na A\nb C\nZ 263eb3b6193de7fe65b1ded5bcda513e8b4d6b84")))
+      ;; '(((id . 1) (group-id . 0) (uuid . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (parent . "") (pkt-type . "basic") (pkt . "CC C++\nD 1486332719.0\na A\nb C\nZ 263eb3b6193de7fe65b1ded5bcda513e8b4d6b84")))
+      (get-not-processed-pkts db 0 'basic 1000 0))
+(test-end "lists of packets")
+
+(test-begin "pkts as alists")
+(define pktspec '((posting . ((title . t)   ;; NOTE: same as ((posting (title . t)(url . u)(blub . b)) ... 
+			      (url   . u)
+			      (blurb . b)))
+		  (comment . ((comment . c)
+			      (score   . s)))
+		  (basic   . ((b-field . b)
+			      (a-field . a)))))
+(define pktlst (find-pkts db '(basic) '()))
+(define dpkt (car pktlst))
+(test #f "A" (get-value 'a-field dpkt pktspec))
+
+(test #f "C" (alist-ref 'b-field (dpkt->alist dpkt pktspec)))
+
+(define basic-spec '((nada (foo . b)(bar . f))(basic (foo . f)(bar . b))))
+(define test-pkt   '((foo . "fooval")(bar . "barval")))
+(let*-values (((u p)  (alist->pkt test-pkt basic-spec ptype: 'basic))
+		((apkt) (pkt->alist p))
+		((bpkt) (pkt->alist p pktspec: basic-spec)))
+    (test #f "fooval" (alist-ref 'f apkt))
+    (test #f "fooval" (alist-ref 'foo bpkt))
+    (test #f #f       (alist-ref 'f   bpkt)))
+
+(test-end "pkts as alists")
+
+(test-begin "descendents and ancestors")
+
+(define (get-uuid pkt)(alist-ref 'uuid pkt))
+
+;; add a child to 263e
+(let-values (((uuid pkt)
+	      (construct-pkt 'x "X" 'y "Y" 'P "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84"
+			     'D "1486332719.0")))
+  (add-to-queue db pkt uuid 'basic "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84" 0))
+
+(test #f '("263eb3b6193de7fe65b1ded5bcda513e8b4d6b84" "818fe30988c9673441b8f203972a8bda6af682f8")
+      (map (lambda (x)(alist-ref 'uuid x))
+	   (get-descendents
+	    db 0
+	    "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84")))
+
+(test #f '("263eb3b6193de7fe65b1ded5bcda513e8b4d6b84" "818fe30988c9673441b8f203972a8bda6af682f8")
+      (map (lambda (x)(alist-ref 'uuid x))
+	   (get-ancestors
+	    db 0
+	    "818fe30988c9673441b8f203972a8bda6af682f8")))
+
+(test-end "descendents and ancestors")
+
+(test-end "pkts and pkt archives")
+
+(test-begin "pktsdb")
+
+(define spec '((tests (testname n TEXT)
+		      (testpath p TEXT)
+		      (duration d INTEGER))))
+;; (define pktsdb (make-pktdb))
+;; (pktdb-pktsdb-spec-set! pktsdb spec)
+
+(define pktsdb #f)
+
+(test #f #t (dbi:database? (let ((pdb (pktsdb-open "test.db" spec)))
+			     (set! pktsdb pdb)
+			     (pktdb-conn pdb))))
+;; (pp (pktdb-pktspec pktsdb))
+(test #f #t (pktsdb-add-record pktsdb 'tests '((testname . "test1"))))
+
+(pktsdb-close pktsdb)
+
+(test-end "pktsdb")

Index: client.scm
==================================================================
--- client.scm
+++ client.scm
@@ -27,10 +27,19 @@
 (declare (unit client))
 
 (declare (uses common))
 (declare (uses db))
 (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running.
+(declare (uses commonmod))
+(import commonmod)
+
+(module client
+*
+
+)
+
+(import client)
 
 (include "common_records.scm")
 (include "db_records.scm")
 
 ;; client:get-signature
@@ -44,13 +53,10 @@
 #;(define (client:logout serverdat)
   (let ((ok (and (socket? serverdat)
 		 (cdb:logout serverdat *toppath* (client:get-signature)))))
     ok))
 
-(define (client:setup areapath #!key (remaining-tries 100) (failed-connects 0))
-  (client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects))
-
 ;; Do all the connection work, look up the transport type and set up the
 ;; connection if required.
 ;;
 ;; There are two scenarios. 
 ;;   1. We are a test manager and we received *transport-type* and *runremote* via cmdline
@@ -60,17 +66,27 @@
 ;; client:setup
 ;;
 ;; lookup_server, need to remove *runremote* stuff
 ;;
  
-(define (client:setup-http areapath #!key (remaining-tries 100) (failed-connects 0)(area-dat #f))
+;;(define (http-transport:server-dat-make-url runremote)
+(define (client:get-url runremote)
+  (if (and (remote-iface runremote)
+	   (remote-port  runremote))
+      (conc "http://" 
+	    (remote-iface runremote)
+	    ":"
+	    (remote-port  runremote))
+      #f))
+
+(define (client:setup areapath runremote #!key (remaining-tries 100) (failed-connects 0)(area-dat #f))
   (mutex-lock! *rmt-mutex*)
-  (let ((res (client:setup-http-baby areapath remaining-tries: remaining-tries failed-connects: failed-connects area-dat: area-dat)))
+  (let ((res (client:setup-http areapath runremote remaining-tries: remaining-tries failed-connects: failed-connects area-dat: area-dat)))
     (mutex-unlock! *rmt-mutex*)
     res))
 
-(define (client:setup-http-baby areapath #!key (remaining-tries 100) (failed-connects 0)(area-dat #f))
+(define (client:setup-http areapath runremote #!key (remaining-tries 100) (failed-connects 0))
   (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries)
   (server:start-and-wait areapath)
   (if (<= remaining-tries 0)
       (begin
 	(debug:print-error 0 *default-log-port* "failed to start or connect to server")
@@ -77,52 +93,72 @@
 	(exit 1))
       ;;
       ;; Alternatively here, we can get the list of candidate servers and work our way
       ;; through them searching for a good one.
       ;;
-      (let* ((server-dat (server:choose-server areapath 'best))
-	     (runremote  (or area-dat *runremote*)))
+      (let* ((server-dat (server:choose-server areapath 'best))) ;; list host port start-time server-id pid
+;;	     (runremote  (or area-dat *runremote*)))
 	(if (not server-dat) ;; no server found
-	    (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))
+	    (begin
+	      (if (< remaining-tries 99)(thread-sleep! 1)) ;; obviously it needs time
+	      (client:setup-http areapath runremote remaining-tries: (- remaining-tries 1)))
 	    (match server-dat
 	      ((host port start-time server-id pid)
 	       (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
-	       (if (and (not area-dat)
-			(not *runremote*))
-                   (begin       
-		     (set! *runremote* (make-remote))
-                     (let* ((server-info (remote-server-info *runremote*))) 
+	       (if (not runremote)
+                   (begin
+		     ;; Here we are creating a runremote where there was none or it was clobbered with #f
+		     ;;
+		     (set! runremote (make-and-init-remote))
+                     (let* ((server-info (server:check-if-running areapath)))
+		       (remote-server-info-set! runremote server-info)
                        (if server-info
                            (begin
-                             (remote-server-url-set! *runremote* (server:record->url server-info))
-                             (remote-server-id-set! *runremote* (server:record->id server-info)))))))
+                             (remote-server-url-set! runremote (server:record->url server-info))
+                             (remote-server-id-set! runremote (server:record->id server-info)))))))
+	       ;; at this point we have a runremote
 	       (if (and host port server-id)
-		   (let* ((start-res (http-transport:client-connect host port server-id))
-			  (ping-res  (rmt:login-no-auto-client-setup start-res)))
-		     (if (and start-res
-			      ping-res)
-			 (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago
-			   (if runremote
-			       (begin
-				 (remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res)
-				 (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res))
-				 start-res)
-			       (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))))
+		   (let* ((nada     (client:connect host port server-id runremote))
+			  (ping-res (rmt:login-no-auto-client-setup runremote)))
+		     (if ping-res
+			 (if runremote
+			     (begin
+			       (debug:print-info 2 *default-log-port* "connected to " (client:get-url runremote))
+			       runremote)
+			     (client:setup-http areapath runremote remaining-tries: (- remaining-tries 1)))
 			 (begin    ;; login failed but have a server record, clean out the record and try again
-			   (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid.  Fixes part of Randy;s ticket 1405717332
-			   (case *transport-type* 
-			     ((http)(http-transport:close-connections)))
-                           (if *runremote* 
-			       (remote-conndat-set! runremote #f)  ;; (hash-table-delete! runremote run-id)
-                               )
+			   (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... ping-res=" ping-res ", server-dat=" server-dat) ;; had runid.  Fixes part of Randy;s ticket 1405717332
+			   (http-transport:close-connections runremote)
 			   (thread-sleep! 1)
-			   (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))
+			   (client:setup-http areapath runremote remaining-tries: (- remaining-tries 1))
 			   )))
 		   (begin    ;; no server registered
 		     ;; (server:kind-run areapath)
 		     (server:start-and-wait areapath)
 		     (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries)
 		     (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
-		     (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)))))
+		     (client:setup-http areapath runremote remaining-tries: (- remaining-tries 1)))))
 	      (else
 	       (debug:print 0 *default-log-port* "ERROR: malformed server-dat="server-dat)))))))
 
+;;
+;; connect - stored in remote-condat
+;;
+;; (define (http-transport:client-connect iface port server-id runremote)
+(define (client:connect iface port server-id runremote-in)
+  (let* ((runremote (or runremote-in
+			(make-runremote))))
+    (debug:print-info 2 *default-log-port* "Connecting to server at "iface":"port", id "server-id)
+    (let* ((api-url      (conc "http://" iface ":" port "/api"))
+	   (api-uri      (uri-reference (conc "http://" iface ":" port "/api")))
+	   (api-req      (make-request method: 'POST uri: api-uri)))
+      ;;	 (server-dat   (vector iface port api-uri api-url api-req (current-seconds) server-id)))
+      (remote-iface-set!   runremote iface)
+      (remote-port-set!    runremote port)
+      (remote-server-id-set! runremote server-id)
+      (remote-connect-time-set! runremote (current-seconds))
+      (remote-last-access-set! runremote (current-seconds))
+      (remote-api-url-set! runremote api-url)
+      (remote-api-uri-set! runremote api-uri)
+      (remote-api-req-set! runremote api-req)
+      runremote)))
+

Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -24,10 +24,12 @@
      matchable regex posix (srfi 18) extras ;; tcp 
      (prefix nanomsg nmsg:)
      (prefix sqlite3 sqlite3:)
      pkts (prefix dbi dbi:)
      )
+(use posix-extras pathname-expand files)
+
 
 (declare (unit common))
 (declare (uses commonmod))
 (import commonmod)
 
@@ -210,12 +212,10 @@
 
 ;; Miscellaneous
 (define *triggers-mutex*     (make-mutex))     ;; block overlapping processing of triggers
 (define *numcpus-cache* (make-hash-table))
 
-(use posix-extras pathname-expand files)
-
 ;; this plugs a hole in posix-extras in recent chicken versions > 4.9)
 (let-values (( (chicken-release-number chicken-major-version)
                (apply values
                       (map string->number
                            (take
@@ -249,33 +249,10 @@
 (define (common:get-sync-lock-filepath)
   (let* ((tmp-area     (common:get-db-tmp-area))
          (lockfile     (conc tmp-area "/megatest.db.sync-lock")))
     lockfile))
 
-;;======================================================================
-;; when called from a wrapper I need sometimes to find the calling
-;; wrapper, this is for dashboard to find the correct megatest.
-;;
-(define (common:find-local-megatest #!optional (progname "megatest"))
-  (let ((res (filter file-exists?
-		     (map (lambda (updir)
-			    (let* ((lm  (car (argv)))
-				   (dir (pathname-directory lm))
-				   (exe (pathname-strip-directory lm)))
-			      (conc (if dir (conc dir "/") "")
-				    (case (string->symbol exe)
-				      ((dboard)    (conc updir progname))
-				      ((mtest)     (conc updir progname))
-				      ((dashboard) progname)
-				      (else exe)))))
-			  '("../../" "../")))))
-    (if (null? res)
-	(begin
-	  (debug:print 0 *default-log-port* "Failed to find this executable! Using what can be found on the path")
-	  progname)
-	(car res))))
-
 (define *common:logpro-exit-code->status-sym-alist*
   '( ( 0 . pass )
      ( 1 . fail )
      ( 2 . warn )
      ( 3 . check )
@@ -315,26 +292,39 @@
     (else "FAIL")))
 
 (define (common:logpro-exit-code->test-status exit-code)
   (status-sym->string (common:logpro-exit-code->status-sym exit-code)))
 
+;; 
 (defstruct remote
+
+  ;; transport to be used
+  ;; http              - use http-transport
+  ;; http-read-cached  - use http-transport for writes but in-mem cached for reads
+  (rmode            'http)
   (hh-dat            (let ((res (or (server:choose-server *toppath* 'homehost)
 				    (cons #f #f))))
 		       (assert (pair? res)(conc "FATAL: hh-dat should be a pair, got "res))
 		       res))
   (server-url        #f) ;; (server:check-if-running *toppath*) #f))
   (server-id         #f)
-  (server-info       (if *toppath* (server:check-if-running *toppath*) #f))
+  (server-info       #f) ;; (if *toppath* (server:check-if-running *toppath*) #f))
   (last-server-check 0)  ;; last time we checked to see if the server was alive
-  (connect-time      (current-seconds))
-  (conndat           #f)
-  (transport         *transport-type*)
+  (connect-time      (current-seconds)) ;; when we first connected
+  (last-access       (current-seconds)) ;; last time we talked to server
+  ;; (conndat           #f) ;; iface port api-uri api-url api-req seconds server-id
   (server-timeout    (server:expiration-timeout))
   (force-server      #f)
   (ro-mode           #f)  
-  (ro-mode-checked   #f)) ;; flag that indicates we have checked for ro-mode
+  (ro-mode-checked   #f) ;; flag that indicates we have checked for ro-mode
+
+  ;; conndat stuff
+  (iface             #f) ;; TODO: Consolidate this data with server-url and server-info above
+  (port              #f)
+  (api-url           #f)
+  (api-uri           #f)
+  (api-req           #f))
 
 ;; launching and hosts
 (defstruct host
   (reachable    #f)
   (last-update  0)
@@ -408,27 +398,48 @@
 
 (define (common:version-changed?)
   (not (equal? (common:get-last-run-version)
                (common:version-signature))))
 
+
+;; From 1.70 to 1.80, db's are compatible.
+
 (define (common:api-changed?)
-  (not (equal? (substring (->string megatest-version) 0 4)
-               (substring (conc (common:get-last-run-version)) 0 4))))
+  (let* (
+    (megatest-major-version (substring (->string megatest-version) 0 4))
+    (run-major-version (substring (conc (common:get-last-run-version)) 0 4))
+   )
+   (and (not (equal? megatest-major-version "1.80"))
+     (not (equal? megatest-major-version megatest-run-version)))
+  )
+)
 
 ;;======================================================================
 ;; Move me elsewhere ...
 ;; RADT => Why do we meed the version check here, this is called only if version misma
 ;;
 (define (common:cleanup-db dbstruct #!key (full #f))
-  (apply db:multi-db-sync 
-   dbstruct
-   'schema
-   'killservers
-   'adj-target
-   'new2old
-   '(dejunk)
-  )
+  (case (rmt:transport-mode)
+    ((http)
+     (apply db:multi-db-sync 
+	    dbstruct
+	    'schema
+	    'killservers
+	    'adj-target
+	    'new2old
+	    '(dejunk)
+	    ))
+    ((tcp nfs)
+     (debug:print 0 *default-log-port* "WARNING: cleanup-db NOT implemented yet for tcp and nfs.")
+     #;(apply db:multi-db-sync 
+	    dbstruct
+	    'schema
+	    'killservers
+	    'adj-target
+	    'new2old
+	    '(dejunk)
+	    )))
   (if (common:api-changed?)
       (common:set-last-run-version)))
 
 (define (common:snapshot-file filepath #!key (subdir  ".") )
   (if (file-exists? filepath)
@@ -520,11 +531,11 @@
 (define (common:rotate-logs)
   (let* ((all-files (make-hash-table))
 	 (stats     (make-hash-table))
 	 (inc-stat  (lambda (key)
 		      (hash-table-set! stats key (+ (hash-table-ref/default stats key 0) 1))))
-	(max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age
+	(max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "600")))) ;; name -> age
     (if (not (directory-exists? "logs"))(create-directory "logs"))
     (directory-fold 
      (lambda (file rem)
        (handle-exceptions
 	exn
@@ -599,14 +610,15 @@
 ;;======================================================================
 ;; Force a megatest cleanup-db if version is changed and skip-version-check not specified
 ;; Do NOT check if not on homehost!
 ;;
 (define (common:exit-on-version-changed)
-  (if (common:on-homehost?)
+  (if (and *toppath*              ;; do nothing if *toppath* not yet provided
+	   (common:on-homehost?))
       (if (common:api-changed?)
 	  (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config"))
-                 (dbfile  (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.db"))
+                 (dbfile  (conc (get-environment-variable "MT_RUN_AREA_HOME") ".megatest/main.db"))
                  (read-only (not (file-write-access? dbfile)))
                  (dbstruct (db:setup #t))) ;; (db:setup-db *dbstruct-dbs* *toppath* #f))) ;;  #t)))
 	    (debug:print 0 *default-log-port*
 			 "WARNING: Version mismatch!\n"
 			 "   expected: " (common:version-signature) "\n"
@@ -626,14 +638,14 @@
                (common:cleanup-db dbstruct)))
              ((not (common:file-exists? mtconf))
               (debug:print 0 *default-log-port* "   megatest.config does not exist in this area.  Cannot proceed with megatest version migration.")
               (exit 1))
              ((not (common:file-exists? dbfile))
-              (debug:print 0 *default-log-port* "   megatest.db does not exist in this area.  Cannot proceed with megatest version migration.")
+              (debug:print 0 *default-log-port* "   .megatest/main.db does not exist in this area.  Cannot proceed with megatest version migration.")
               (exit 1))
              ((not (eq? (current-user-id)(file-owner mtconf)))
-              (debug:print 0 *default-log-port* "   You do not own megatest.db in this area.  Cannot proceed with megatest version migration.")
+              (debug:print 0 *default-log-port* "   You do not own .megatest/main.db in this area.  Cannot proceed with megatest version migration.")
               (exit 1))
              (read-only
               (debug:print 0 *default-log-port* "   You have read-only access to this area.  Cannot proceed with megatest version migration.")
               (exit 1))
              (else
@@ -710,22 +722,21 @@
 	""))))
 
 (define (common:alist-ref/default key alist default)
   (or (alist-ref key alist) default))
 
-(define (common:low-noise-print waitval . keys)
-  (let* ((key      (string-intersperse (map conc keys) "-" ))
-	 (lasttime (hash-table-ref/default *common:denoise* key 0))
-	 (currtime (current-seconds)))
-    (if (> (- currtime lasttime) waitval)
-	(begin
-	  (hash-table-set! *common:denoise* key currtime)
-	  #t)
-	#f)))
-
-(define (common:get-megatest-exe)
-  (or (getenv "MT_MEGATEST") "megatest"))
+;; moved into commonmod
+;;
+;; (define (common:low-noise-print waitval . keys)
+;;   (let* ((key      (string-intersperse (map conc keys) "-" ))
+;; 	 (lasttime (hash-table-ref/default *common:denoise* key 0))
+;; 	 (currtime (current-seconds)))
+;;     (if (> (- currtime lasttime) waitval)
+;; 	(begin
+;; 	  (hash-table-set! *common:denoise* key currtime)
+;; 	  #t)
+;; 	#f)))
 
 (define (common:read-encoded-string instr)
   (handle-exceptions
    exn
    (handle-exceptions
@@ -946,20 +957,21 @@
 	  (handle-exceptions
 	      exn
 	      (begin
 		(debug:print-error 0 *default-log-port* "Couldn't create path to " *db-cache-path* ", exn=" exn)
 		(exit 1))
-	      (let* ((tsname (common:get-testsuite-name))
+	      (let* ((toppath (common:real-path *toppath*))
+		     (tsname (common:get-testsuite-name))
 		     (dbpath (common:get-create-writeable-dir
 			      (list (conc "/tmp/" (current-user-name)
 					  "/megatest_localdb/"
 					  tsname "/"
-					  (string-translate *toppath* "/" "."))
+					  (string-translate toppath "/" "."))
 				    (conc "/tmp/" (current-process-id) ;; just in case we have an issue with the dir by own user name
 					  "/megatest_localdb/"
 					  tsname
-					  (string-translate *toppath* "/" "."))
+					  (string-translate toppath "/" "."))
 				    ))))
 		(set! *db-cache-path* dbpath)
 		;; ensure megatest area has .megatest
 		(let ((dbarea (conc *toppath* "/.megatest")))
 		  (if (not (file-exists? dbarea))
@@ -972,23 +984,18 @@
 	  #f)))
 
 (define (common:get-area-path-signature)
   (message-digest-string (md5-primitive) *toppath*))
 
-(define (common:get-signature str)
-  (message-digest-string (md5-primitive) str))
-
 ;;======================================================================
 ;; E X I T   H A N D L I N G
 ;;======================================================================
 
 (define (common:run-sync?)
-    (and (common:on-homehost?)
-	 (args:get-arg "-server")))
-
-(define (common:human-time)
-  (time->string (seconds->local-time (current-seconds)) "%Y-%m-%d %H:%M:%S"))
+  (and *toppath*               ;; gate if called before *toppath* is set
+       (common:on-homehost?)
+       (args:get-arg "-server")))
 
 
 (define (std-signal-handler signum)
   ;; (signal-mask! signum)
   (set! *time-to-exit* #t) 
@@ -1048,26 +1055,10 @@
 (define (common:get-disks #!key (configf #f))
   (hash-table-ref/default 
    (or configf (read-config "megatest.config" #f #t))
    "disks" '("none" "")))
 
-;;======================================================================
-;; return first command that exists, else #f
-;;
-(define (common:which cmds)
-  (if (null? cmds)
-      #f
-      (let loop ((hed (car cmds))
-		 (tal (cdr cmds)))
-	(let ((res (with-input-from-pipe (conc "which " hed) read-line)))
-	  (if (and (string? res)
-		   (common:file-exists? res))
-	      res
-	      (if (null? tal)
-		  #f
-		  (loop (car tal)(cdr tal))))))))
-  
 (define (common:get-install-area)
   (let ((exe-path (car (argv))))
     (if (common:file-exists? exe-path)
 	(handle-exceptions
 	 exn
@@ -1345,11 +1336,11 @@
 			  (else
 			   (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.")
 			   #t)))) ;; default to requiring server
     (if force-result
 	(begin
-	  (debug:print-info 0 *default-log-port* "forcing use of server, force setting is \"" force-setting "\".")
+	  (debug:print-info 0 *default-log-port* "ATTENTION! Forcing use of server, force setting is \"" force-setting "\".")
 	  #t)
 	#f)))
 
 ;;======================================================================
 ;; M I S C   L I S T S
@@ -1598,10 +1589,30 @@
 	path) ;; just give up
     (with-input-from-pipe
 	(conc "/bin/readlink -f " path)
       (lambda ()
 	(read-line)))))
+
+;; for reasons I don't understand multiple calls to real-path in parallel threads
+;; must be protected by mutexes
+;;
+(define (common:real-path inpath)
+  ;; (process:cmd-run-with-stderr->list "readlink" "-f" inpath)) ;; cmd . params)
+  ;; (let-values 
+  ;;  (((inp oup pid) (process "readlink" (list "-f" inpath))))
+  ;;  (with-input-from-port inp
+  ;;    (let loop ((inl (read-line))
+  ;;       	(res #f))
+  ;;      (print "inl=" inl)
+  ;;      (if (eof-object? inl)
+  ;;          (begin
+  ;;            (close-input-port inp)
+  ;;            (close-output-port oup)
+  ;;            ;; (process-wait pid)
+  ;;            res)
+  ;;          (loop (read-line) inl))))))
+  (with-input-from-pipe (conc "readlink -f " inpath) read-line))
 
 ;;======================================================================
 ;; returns *effective load* (not normalized)
 ;;
 (define (common:get-intercept onemin fivemin)
@@ -1989,16 +2000,27 @@
 		    (host-last-used-set! rec curr-time)
 		    new-best)
 		  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))
 
 (define (common:wait-for-homehost-load maxnormload msg)
-  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
-                     #f
-                     (server:choose-server *toppath* 'homehost)))
-         (hh     (if hh-dat (car hh-dat) #f)))
-    (common:wait-for-normalized-load maxnormload msg hh)))
-
+  (let loop ((start-time (current-seconds))) ;; we saw some instances of this being called before *toppath* was set. This might be an early setup race. This delay should help but it is impossible to test...
+    (if (not *toppath*)
+	(begin
+	  (debug:print 0 *default-log-port* "ERROR: common:wait-for-homehost-load called before *toppath* set.")
+	  (thread-sleep! 30)
+	  (if (< (- (current-seconds) start-time) 300)
+	      (loop start-time)))))
+  (case (rmt:transport-mode)
+    ((http)
+     (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
+			#f
+			(server:choose-server *toppath* 'homehost)))
+            (hh     (if hh-dat (car hh-dat) #f)))
+       (common:wait-for-normalized-load maxnormload msg hh)))
+    (else
+     (common:wait-for-normalized-load maxnormload msg (get-host-name)))))
+    
 (define (common:get-num-cpus remote-host)
   (let* ((actual-host (or remote-host (get-host-name))))
     ;; hosts had better not be changing the number of cpus too often!
     (or (hash-table-ref/default *numcpus-cache* actual-host #f)
 	(let* ((numcpus (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600)))
@@ -2218,30 +2240,10 @@
 	 (uname #f))
     (if (null? (car uname-res))
 	"unknown"
 	(caar uname-res))))
 
-;; for reasons I don't understand multiple calls to real-path in parallel threads
-;; must be protected by mutexes
-;;
-(define (common:real-path inpath)
-  ;; (process:cmd-run-with-stderr->list "readlink" "-f" inpath)) ;; cmd . params)
-  ;; (let-values 
-  ;;  (((inp oup pid) (process "readlink" (list "-f" inpath))))
-  ;;  (with-input-from-port inp
-  ;;    (let loop ((inl (read-line))
-  ;;       	(res #f))
-  ;;      (print "inl=" inl)
-  ;;      (if (eof-object? inl)
-  ;;          (begin
-  ;;            (close-input-port inp)
-  ;;            (close-output-port oup)
-  ;;            ;; (process-wait pid)
-  ;;            res)
-  ;;          (loop (read-line) inl))))))
-  (with-input-from-pipe (conc "readlink -f " inpath) read-line))
-
 ;;======================================================================
 ;; D I S K   S P A C E 
 ;;======================================================================
 
 (define (common:get-disk-space-used fpath)
@@ -2619,291 +2621,10 @@
     (cond
      (with-vars     (common:without-vars  fullcmd))
      (with-orig-env (common:with-orig-env fullcmd))
      (else          (common:without-vars  fullcmd "MT_.*")))))
 		  
-;;======================================================================
-;; T I M E   A N D   D A T E
-;;======================================================================
-
-;;======================================================================
-;; Convert strings like "5s 2h 3m" => 60x60x2 + 3x60 + 5
-(define (common:hms-string->seconds tstr)
-  (let ((parts     (string-split-fields "\\w+" tstr))
-	(time-secs 0)
-	;; s=seconds, m=minutes, h=hours, d=days, M=months, y=years, w=weeks
-	(trx       (regexp "(\\d+)([smhdMyw])")))
-    (for-each (lambda (part)
-		(let ((match  (string-match trx part)))
-		  (if match
-		      (let ((val (string->number (cadr match)))
-			    (unt (caddr match)))
-			(if val 
-			    (set! time-secs (+ time-secs (* val
-							    (case (string->symbol unt)
-							      ((s) 1)
-							      ((m) 60) ;; minutes
-							      ((h) 3600)
-							      ((d) 86400)
-							      ((w) 604800)
-							      ((M) 2628000) ;; aproximately one month
-							      ((y) 31536000)
-							      (else #f))))))))))
-	      parts)
-    time-secs))
-		       
-(define (seconds->hr-min-sec secs)
-  (let* ((hrs (quotient secs 3600))
-	 (min (quotient (- secs (* hrs 3600)) 60))
-	 (sec (- secs (* hrs 3600)(* min 60))))
-    (conc (if (> hrs 0)(conc hrs "hr ") "")
-	  (if (> min 0)(conc min "m ")  "")
-	  sec "s")))
-
-(define (seconds->time-string sec)
-  (time->string 
-   (seconds->local-time sec) "%H:%M:%S"))
-
-(define (seconds->work-week/day-time sec)
-  (time->string
-   (seconds->local-time sec) "ww%V.%u %H:%M"))
-
-(define (seconds->work-week/day sec)
-  (time->string
-   (seconds->local-time sec) "ww%V.%u"))
-
-(define (seconds->year-work-week/day sec)
-  (time->string
-   (seconds->local-time sec) "%yww%V.%w"))
-
-(define (seconds->year-work-week/day-time sec)
-  (time->string
-   (seconds->local-time sec) "%Yww%V.%w %H:%M"))
-
-(define (seconds->year-week/day-time sec)
-  (time->string
-   (seconds->local-time sec) "%Yw%V.%w %H:%M"))
-
-(define (seconds->quarter sec)
-  (case (string->number
-	 (time->string 
-	  (seconds->local-time sec)
-	  "%m"))
-    ((1 2 3) 1)
-    ((4 5 6) 2)
-    ((7 8 9) 3)
-    ((10 11 12) 4)
-    (else #f)))
-
-;;======================================================================
-;; basic ISO8601 format (e.g. "2017-02-28 06:02:54") date time => Unix epoch
-;;
-(define (common:date-time->seconds datetime)
-  (local-time->seconds (string->time datetime "%Y-%m-%d %H:%M:%S")))
-
-;;======================================================================
-;; given span of seconds tstart to tend
-;; find start time to mark and mark delta
-;;
-(define (common:find-start-mark-and-mark-delta tstart tend)
-  (let* ((deltat   (- (max tend (+ tend 10)) tstart)) ;; can't handle runs of less than 4 seconds. Pad it to 10 seconds ...
-	 (result   #f)
-	 (min      60)
-	 (hr       (* 60 60))
-	 (day      (* 24 hr))
-	 (yr       (* 365 day)) ;; year
-	 (mo       (/ yr 12))
-	 (wk       (* day 7)))
-    (for-each
-     (lambda (max-blks)
-       (for-each
-	(lambda (span) ;; 5 2 1
-	  (if (not result)
-	      (for-each 
-	       (lambda (timeunit timesym) ;; year month day hr min sec
-		 (if (not result)
-		     (let* ((time-blk (* span timeunit))
-			    (num-blks (quotient deltat time-blk)))
-		       (if (and (> num-blks 4)(< num-blks max-blks))
-			   (let ((first (* (quotient tstart time-blk) time-blk)))
-			     (set! result (list span timeunit time-blk first timesym))
-			     )))))
-	       (list yr mo wk day hr min 1)
-	       '(     y  mo w  d   h  m   s))))
-	(list 8 6 5 2 1)))
-     '(5 10 15 20 30 40 50 500))
-    (if values
-	(apply values result)
-	(values 0 day 1 0 'd))))
-
-;;======================================================================
-;; given x y lim return the cron expansion
-;;
-(define (common:expand-cron-slash x y lim)
-  (let loop ((curr x)
-	     (res  `()))
-    (if (< curr lim)
-	(loop (+ curr y) (cons curr res))
-	(reverse res))))
-
-;;======================================================================
-;; expand a complex cron string to a list of cron strings
-;;
-;;  x/y   => x, x+y, x+2y, x+3y while x+Ny<max_for_field
-;;  a,b,c => a, b ,c
-;;
-;;   NOTE: with flatten a lot of the crud below can be factored down.
-;;
-(define (common:cron-expand cron-str)
-  (if (list? cron-str)
-      (flatten
-       (fold (lambda (x res)
-	       (if (list? x)
-		   (let ((newres (map common:cron-expand x)))
-		     (append x newres))
-		   (cons x res)))
-	     '()
-	     cron-str)) ;; (map common:cron-expand cron-str))
-      (let ((cron-items (string-split cron-str))
-	    (slash-rx   (regexp "(\\d+)/(\\d+)"))
-	    (comma-rx   (regexp ".*,.*"))
-	    (max-vals   '((min        . 60)
-			  (hour       . 24)
-			  (dayofmonth . 28) ;;; BUG!!!! This will be a bug for some combinations
-			  (month      . 12)
-			  (dayofweek  . 7))))
-	(if (< (length cron-items) 5) ;; bad spec
-	    cron-str ;; `(,cron-str)              ;; just return the string, something downstream will fix it
-	    (let loop ((hed  (car cron-items))
-		       (tal  (cdr cron-items))
-		       (type 'min)
-		       (type-tal '(hour dayofmonth month dayofweek))
-		       (res  '()))
-	      (regex-case
-		  hed
-		(slash-rx ( _ base incr ) (let* ((basen          (string->number base))
-						 (incrn          (string->number incr))
-						 (expanded-vals  (common:expand-cron-slash basen incrn (alist-ref type max-vals)))
-						 (new-list-crons (fold (lambda (x myres)
-									 (cons (conc (if (null? res)
-											 ""
-											 (conc (string-intersperse res " ") " "))
-										     x " " (string-intersperse tal " "))
-									       myres))
-								       '() expanded-vals)))
-					    ;; (print "new-list-crons: " new-list-crons)
-					    ;; (fold (lambda (x res)
-					    ;; 	    (if (list? x)
-					    ;; 		(let ((newres (map common:cron-expand x)))
-					    ;; 		  (append x newres))
-					    ;; 		(cons x res)))
-					    ;; 	  '()
-					    (flatten (map common:cron-expand new-list-crons))))
-		;;					    (map common:cron-expand (map common:cron-expand new-list-crons))))
-		(else (if (null? tal)
-			  cron-str
-			  (loop (car tal)(cdr tal)(car type-tal)(cdr type-tal)(append res (list hed)))))))))))
-
-;;======================================================================
-;; given a cron string and the last time event was processed return #t to run or #f to not run
-;;
-;;  min    hour   dayofmonth month  dayofweek
-;; 0-59    0-23   1-31       1-12   0-6          ### NOTE: dayofweek does not include 7
-;;
-;;  #t => yes, run the job
-;;  #f => no, do not run the job
-;;
-(define (common:cron-event cron-str now-seconds-in last-done) ;; ref-seconds = #f is NOW.
-  (let* ((cron-items     (map string->number (string-split cron-str)))
-	 (now-seconds    (or now-seconds-in (current-seconds)))
-	 (now-time       (seconds->local-time now-seconds))
-	 (last-done-time (seconds->local-time last-done))
-	 (all-times      (make-hash-table)))
-    ;; (print "cron-items: " cron-items "(length cron-items): " (length cron-items))
-    (if (not (eq? (length cron-items) 5)) ;; don't even try to figure out junk strings
-	#f
-	(match-let (((     cmin chour cdayofmonth cmonth    cdayofweek)
-		     cron-items)
-		    ;; 0     1    2        3         4    5      6
-		    ((nsec nmin nhour ndayofmonth nmonth nyr ndayofweek n7 n8 n9)
-		     (vector->list now-time))
-		    ((lsec lmin lhour ldayofmonth lmonth lyr ldayofweek l7 l8 l9)
-		     (vector->list last-done-time)))
-	  ;; create all possible time slots
-	  ;; remove invalid slots due to (for example) day of week
-	  ;; get the start and end entries for the ref-seconds (current) time
-	  ;; if last-done > ref-seconds => this is an ERROR!
-	  ;; does the last-done time fall in the legit region?
-	  ;;    yes => #f  do not run again this command
-	  ;;    no  => #t  ok to run the command
-	  (for-each ;; month
-	   (lambda (month)
-	     (for-each ;; dayofmonth
-	      (lambda (dom)
-		(for-each
-		 (lambda (hr) ;; hour
-		   (for-each
-		    (lambda (minute) ;; minute
-		      (let ((copy-now (apply vector (vector->list now-time))))
-			(vector-set! copy-now 0 0) ;; force seconds to zero
-			(vector-set! copy-now 1 minute)
-			(vector-set! copy-now 2 hr)
-			(vector-set! copy-now 3 dom)  ;; dom is already corrected for zero referenced
-			(vector-set! copy-now 4 month)
-			(let* ((copy-now-secs (local-time->seconds copy-now))
-			       (new-copy      (seconds->local-time copy-now-secs))) ;; remake the time vector
-			  (if (or (not cdayofweek)
-				  (equal? (vector-ref new-copy 6)
-					  cdayofweek)) ;; if the day is specified and a match OR if the day is NOT specified
-			      (if (or (not cdayofmonth)
-				      (equal? (vector-ref new-copy 3)
-					      (+ 1 cdayofmonth))) ;; if the month is specified and a match OR if the month is NOT specified
-				  (hash-table-set! all-times copy-now-secs new-copy))))))
-		    (if cmin
-			`(,cmin)  ;; if given cmin, have to use it
-			(list (- nmin 1) nmin (+ nmin 1))))) ;; minute
-		 (if chour
-		     `(,chour)
-		     (list (- nhour 1) nhour (+ nhour 1))))) ;; hour
-	      (if cdayofmonth
-		  `(,cdayofmonth)
-		  (list (- ndayofmonth 1) ndayofmonth (+ ndayofmonth 1)))))
-	   (if cmonth
-	       `(,cmonth)
-	       (list (- nmonth 1) nmonth (+ nmonth 1))))
-	  (let ((before #f)
-		(is-in  #f))
-	    (for-each
-	     (lambda (moment)
-	       (if (and before
-			(<= before now-seconds)
-			(>= moment now-seconds))
-		   (begin
-		     ;; (print)
-		     ;; (print "Before: " (time->string (seconds->local-time before)))
-		     ;; (print "Now:    " (time->string (seconds->local-time now-seconds)))
-		     ;; (print "After:  " (time->string (seconds->local-time moment)))
-		     ;; (print "Last:   " (time->string (seconds->local-time last-done)))
-		     (if (<  last-done before)
-			 (set! is-in before))
-		     ))
-	       (set! before moment))
-	     (sort (hash-table-keys all-times) <))
-	    is-in)))))
-
-(define (common:extended-cron  cron-str now-seconds-in last-done)
-  (let ((expanded-cron (common:cron-expand cron-str)))
-    (if (string? expanded-cron)
-	(common:cron-event expanded-cron now-seconds-in last-done)
-	(let loop ((hed (car expanded-cron))
-		   (tal (cdr expanded-cron)))
-	  (if (common:cron-event hed now-seconds-in last-done)
-	      #t
-	      (if (null? tal)
-		  #f
-		  (loop (car tal)(cdr tal))))))))
-
 ;;======================================================================
 ;; C O L O R S
 ;;======================================================================
       
 (define (common:name->iup-color name)

Index: commonmod.scm
==================================================================
--- commonmod.scm
+++ commonmod.scm
@@ -17,21 +17,39 @@
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;======================================================================
 
 (declare (unit commonmod))
+;; (declare (uses debugprint))
 
 (use srfi-69)
 
 (module commonmod
 	*
 
-(import scheme chicken data-structures extras files)
-(import (prefix sqlite3 sqlite3:)
-	posix typed-records srfi-18 srfi-69
-	md5 message-digest
-	regex srfi-1)
+(import scheme
+	chicken
+
+	(prefix sqlite3 sqlite3:)
+	data-structures
+	extras
+	files
+	matchable
+	md5
+	message-digest
+	pathname-expand
+	posix
+	posix-extras
+	regex
+	regex-case
+	srfi-1
+	srfi-18
+	srfi-69
+	typed-records
+
+	;; debugprint
+	)
 
 ;;======================================================================
 ;; CONTENTS
 ;;
 ;;  config file utils
@@ -136,10 +154,54 @@
 
 ;;======================================================================
 ;; misc conversion, data manipulation functions
 ;;======================================================================
 
+;;======================================================================
+;; return first command that exists, else #f
+;;
+(define (common:which cmds)
+  (if (null? cmds)
+      #f
+      (let loop ((hed (car cmds))
+		 (tal (cdr cmds)))
+	(let ((res (with-input-from-pipe (conc "which " hed) read-line)))
+	  (if (and (string? res)
+		   (file-exists? res))
+	      res
+	      (if (null? tal)
+		  #f
+		  (loop (car tal)(cdr tal))))))))
+  
+(define (common:get-megatest-exe)
+  (let* ((mtexe (or (get-environment-variable "MT_MEGATEST")
+		    (common:which '("megatest"))
+		    "megatest")))
+    (if (file-exists? mtexe)
+	(realpath mtexe)
+	mtexe)))
+
+(define (common:get-megatest-exe-dir)
+  (let* ((mtexe (common:get-megatest-exe)))
+    (pathname-directory mtexe)))
+
+;; more generic and comprehensive version of get-megatest-exe
+;;
+(define (common:get-mtexe)
+  (let* ((mtpathdir  (common:get-megatest-exe-dir)))
+    (or (common:get-megatest-exe)
+	(if mtpathdir
+	    (conc mtpathdir"/megatest")
+	    #f)
+	"megatest")))
+
+(define (common:get-megatest-exe-path)
+  (let* ((mtpathdir (common:get-megatest-exe-dir)))
+    (conc mtpathdir":"(get-environment-variable "PATH") ":.")))
+
+(define (realpath x) (resolve-pathname  (pathname-expand (or x "/dev/null")) ))
+
 ;; if it looks like a number -> convert it to a number, else return it
 ;;
 (define (lazy-convert inval)
   (let* ((as-num (if (string? inval)(string->number inval) #f)))
     (or as-num inval)))
@@ -161,10 +223,40 @@
 	     (filter (lambda (x)
 		       (not (string-match "^\\s*" x)))
 		     val-list))
 	'())))
 
+(define (get-cpu-load)
+  (let* ((load-info (with-input-from-file "/proc/loadavg" read-lines)))
+    (map string->number (string-split load-info))))
+
+(define *current-host-cores* #f)
+
+(define (get-current-host-cores)
+  (or *current-host-cores*
+      (let ((cpu-info (with-input-from-file "/proc/cpuinfo" read-lines)))
+	(let loop ((lines cpu-info))
+	  (if (null? lines)
+	      1 ;; gotta be at least one!
+	      (let* ((inl (car lines))
+		     (tail (cdr lines))
+		     (parts (string-split inl)))
+		(match parts
+		  (("cpu" "cores" ":" num) (string->number num))
+		  (else (loop tail)))))))))
+
+(define (number-of-processes-running processname)
+  (with-input-from-pipe
+   (conc "ps -def | egrep \""processname"\" |wc -l")
+   (lambda ()
+     (string->number (read-line)))))
+
+;; get the normalized (i.e. load / numcpus) for *this* host
+;;
+(define (get-normalized-cpu-load)
+  (/ (get-cpu-load)(get-current-host-cores)))
+
 ;;======================================================================
 ;; testsuite and area utilites
 ;;======================================================================
 
 (define (get-testsuite-name toppath configdat)
@@ -208,18 +300,344 @@
   (let ((adat (get-section cfgdat "areas")))
     (map (lambda (entry)
 	   `(,(car entry) . 
 	     ,(val->alist (cadr entry))))
 	 adat)))
+
+;;======================================================================
+;; time utils
+;;======================================================================
+
+(define (common:human-time)
+  (time->string (seconds->local-time (current-seconds)) "%Y-%m-%d %H:%M:%S"))
+
+;;======================================================================
+;; T I M E   A N D   D A T E
+;;======================================================================
+
+;;======================================================================
+;; Convert strings like "5s 2h 3m" => 60x60x2 + 3x60 + 5
+(define (common:hms-string->seconds tstr)
+  (let ((parts     (string-split-fields "\\w+" tstr))
+	(time-secs 0)
+	;; s=seconds, m=minutes, h=hours, d=days, M=months, y=years, w=weeks
+	(trx       (regexp "(\\d+)([smhdMyw])")))
+    (for-each (lambda (part)
+		(let ((match  (string-match trx part)))
+		  (if match
+		      (let ((val (string->number (cadr match)))
+			    (unt (caddr match)))
+			(if val 
+			    (set! time-secs (+ time-secs (* val
+							    (case (string->symbol unt)
+							      ((s) 1)
+							      ((m) 60) ;; minutes
+							      ((h) 3600)
+							      ((d) 86400)
+							      ((w) 604800)
+							      ((M) 2628000) ;; aproximately one month
+							      ((y) 31536000)
+							      (else #f))))))))))
+	      parts)
+    time-secs))
+		       
+(define (seconds->hr-min-sec secs)
+  (let* ((hrs (quotient secs 3600))
+	 (min (quotient (- secs (* hrs 3600)) 60))
+	 (sec (- secs (* hrs 3600)(* min 60))))
+    (conc (if (> hrs 0)(conc hrs "hr ") "")
+	  (if (> min 0)(conc min "m ")  "")
+	  sec "s")))
+
+(define (seconds->time-string sec)
+  (time->string 
+   (seconds->local-time sec) "%H:%M:%S"))
+
+(define (seconds->work-week/day-time sec)
+  (time->string
+   (seconds->local-time sec) "ww%V.%u %H:%M"))
+
+(define (seconds->work-week/day sec)
+  (time->string
+   (seconds->local-time sec) "ww%V.%u"))
+
+(define (seconds->year-work-week/day sec)
+  (time->string
+   (seconds->local-time sec) "%yww%V.%w"))
+
+(define (seconds->year-work-week/day-time sec)
+  (time->string
+   (seconds->local-time sec) "%Yww%V.%w %H:%M"))
+
+(define (seconds->year-week/day-time sec)
+  (time->string
+   (seconds->local-time sec) "%Yw%V.%w %H:%M"))
+
+(define (seconds->quarter sec)
+  (case (string->number
+	 (time->string 
+	  (seconds->local-time sec)
+	  "%m"))
+    ((1 2 3) 1)
+    ((4 5 6) 2)
+    ((7 8 9) 3)
+    ((10 11 12) 4)
+    (else #f)))
+
+;;======================================================================
+;; basic ISO8601 format (e.g. "2017-02-28 06:02:54") date time => Unix epoch
+;;
+(define (common:date-time->seconds datetime)
+  (local-time->seconds (string->time datetime "%Y-%m-%d %H:%M:%S")))
+
+;;======================================================================
+;; given span of seconds tstart to tend
+;; find start time to mark and mark delta
+;;
+(define (common:find-start-mark-and-mark-delta tstart tend)
+  (let* ((deltat   (- (max tend (+ tend 10)) tstart)) ;; can't handle runs of less than 4 seconds. Pad it to 10 seconds ...
+	 (result   #f)
+	 (min      60)
+	 (hr       (* 60 60))
+	 (day      (* 24 hr))
+	 (yr       (* 365 day)) ;; year
+	 (mo       (/ yr 12))
+	 (wk       (* day 7)))
+    (for-each
+     (lambda (max-blks)
+       (for-each
+	(lambda (span) ;; 5 2 1
+	  (if (not result)
+	      (for-each 
+	       (lambda (timeunit timesym) ;; year month day hr min sec
+		 (if (not result)
+		     (let* ((time-blk (* span timeunit))
+			    (num-blks (quotient deltat time-blk)))
+		       (if (and (> num-blks 4)(< num-blks max-blks))
+			   (let ((first (* (quotient tstart time-blk) time-blk)))
+			     (set! result (list span timeunit time-blk first timesym))
+			     )))))
+	       (list yr mo wk day hr min 1)
+	       '(     y  mo w  d   h  m   s))))
+	(list 8 6 5 2 1)))
+     '(5 10 15 20 30 40 50 500))
+    (if values
+	(apply values result)
+	(values 0 day 1 0 'd))))
+
+;;======================================================================
+;; given x y lim return the cron expansion
+;;
+(define (common:expand-cron-slash x y lim)
+  (let loop ((curr x)
+	     (res  `()))
+    (if (< curr lim)
+	(loop (+ curr y) (cons curr res))
+	(reverse res))))
+
+;;======================================================================
+;; expand a complex cron string to a list of cron strings
+;;
+;;  x/y   => x, x+y, x+2y, x+3y while x+Ny<max_for_field
+;;  a,b,c => a, b ,c
+;;
+;;   NOTE: with flatten a lot of the crud below can be factored down.
+;;
+(define (common:cron-expand cron-str)
+  (if (list? cron-str)
+      (flatten
+       (fold (lambda (x res)
+	       (if (list? x)
+		   (let ((newres (map common:cron-expand x)))
+		     (append x newres))
+		   (cons x res)))
+	     '()
+	     cron-str)) ;; (map common:cron-expand cron-str))
+      (let ((cron-items (string-split cron-str))
+	    (slash-rx   (regexp "(\\d+)/(\\d+)"))
+	    (comma-rx   (regexp ".*,.*"))
+	    (max-vals   '((min        . 60)
+			  (hour       . 24)
+			  (dayofmonth . 28) ;;; BUG!!!! This will be a bug for some combinations
+			  (month      . 12)
+			  (dayofweek  . 7))))
+	(if (< (length cron-items) 5) ;; bad spec
+	    cron-str ;; `(,cron-str)              ;; just return the string, something downstream will fix it
+	    (let loop ((hed  (car cron-items))
+		       (tal  (cdr cron-items))
+		       (type 'min)
+		       (type-tal '(hour dayofmonth month dayofweek))
+		       (res  '()))
+	      (regex-case
+		  hed
+		(slash-rx ( _ base incr ) (let* ((basen          (string->number base))
+						 (incrn          (string->number incr))
+						 (expanded-vals  (common:expand-cron-slash basen incrn (alist-ref type max-vals)))
+						 (new-list-crons (fold (lambda (x myres)
+									 (cons (conc (if (null? res)
+											 ""
+											 (conc (string-intersperse res " ") " "))
+										     x " " (string-intersperse tal " "))
+									       myres))
+								       '() expanded-vals)))
+					    ;; (print "new-list-crons: " new-list-crons)
+					    ;; (fold (lambda (x res)
+					    ;; 	    (if (list? x)
+					    ;; 		(let ((newres (map common:cron-expand x)))
+					    ;; 		  (append x newres))
+					    ;; 		(cons x res)))
+					    ;; 	  '()
+					    (flatten (map common:cron-expand new-list-crons))))
+		;;					    (map common:cron-expand (map common:cron-expand new-list-crons))))
+		(else (if (null? tal)
+			  cron-str
+			  (loop (car tal)(cdr tal)(car type-tal)(cdr type-tal)(append res (list hed)))))))))))
+
+;;======================================================================
+;; given a cron string and the last time event was processed return #t to run or #f to not run
+;;
+;;  min    hour   dayofmonth month  dayofweek
+;; 0-59    0-23   1-31       1-12   0-6          ### NOTE: dayofweek does not include 7
+;;
+;;  #t => yes, run the job
+;;  #f => no, do not run the job
+;;
+(define (common:cron-event cron-str now-seconds-in last-done) ;; ref-seconds = #f is NOW.
+  (let* ((cron-items     (map string->number (string-split cron-str)))
+	 (now-seconds    (or now-seconds-in (current-seconds)))
+	 (now-time       (seconds->local-time now-seconds))
+	 (last-done-time (seconds->local-time last-done))
+	 (all-times      (make-hash-table)))
+    ;; (print "cron-items: " cron-items "(length cron-items): " (length cron-items))
+    (if (not (eq? (length cron-items) 5)) ;; don't even try to figure out junk strings
+	#f
+	(match-let (((     cmin chour cdayofmonth cmonth    cdayofweek)
+		     cron-items)
+		    ;; 0     1    2        3         4    5      6
+		    ((nsec nmin nhour ndayofmonth nmonth nyr ndayofweek n7 n8 n9)
+		     (vector->list now-time))
+		    ((lsec lmin lhour ldayofmonth lmonth lyr ldayofweek l7 l8 l9)
+		     (vector->list last-done-time)))
+	  ;; create all possible time slots
+	  ;; remove invalid slots due to (for example) day of week
+	  ;; get the start and end entries for the ref-seconds (current) time
+	  ;; if last-done > ref-seconds => this is an ERROR!
+	  ;; does the last-done time fall in the legit region?
+	  ;;    yes => #f  do not run again this command
+	  ;;    no  => #t  ok to run the command
+	  (for-each ;; month
+	   (lambda (month)
+	     (for-each ;; dayofmonth
+	      (lambda (dom)
+		(for-each
+		 (lambda (hr) ;; hour
+		   (for-each
+		    (lambda (minute) ;; minute
+		      (let ((copy-now (apply vector (vector->list now-time))))
+			(vector-set! copy-now 0 0) ;; force seconds to zero
+			(vector-set! copy-now 1 minute)
+			(vector-set! copy-now 2 hr)
+			(vector-set! copy-now 3 dom)  ;; dom is already corrected for zero referenced
+			(vector-set! copy-now 4 month)
+			(let* ((copy-now-secs (local-time->seconds copy-now))
+			       (new-copy      (seconds->local-time copy-now-secs))) ;; remake the time vector
+			  (if (or (not cdayofweek)
+				  (equal? (vector-ref new-copy 6)
+					  cdayofweek)) ;; if the day is specified and a match OR if the day is NOT specified
+			      (if (or (not cdayofmonth)
+				      (equal? (vector-ref new-copy 3)
+					      (+ 1 cdayofmonth))) ;; if the month is specified and a match OR if the month is NOT specified
+				  (hash-table-set! all-times copy-now-secs new-copy))))))
+		    (if cmin
+			`(,cmin)  ;; if given cmin, have to use it
+			(list (- nmin 1) nmin (+ nmin 1))))) ;; minute
+		 (if chour
+		     `(,chour)
+		     (list (- nhour 1) nhour (+ nhour 1))))) ;; hour
+	      (if cdayofmonth
+		  `(,cdayofmonth)
+		  (list (- ndayofmonth 1) ndayofmonth (+ ndayofmonth 1)))))
+	   (if cmonth
+	       `(,cmonth)
+	       (list (- nmonth 1) nmonth (+ nmonth 1))))
+	  (let ((before #f)
+		(is-in  #f))
+	    (for-each
+	     (lambda (moment)
+	       (if (and before
+			(<= before now-seconds)
+			(>= moment now-seconds))
+		   (begin
+		     ;; (print)
+		     ;; (print "Before: " (time->string (seconds->local-time before)))
+		     ;; (print "Now:    " (time->string (seconds->local-time now-seconds)))
+		     ;; (print "After:  " (time->string (seconds->local-time moment)))
+		     ;; (print "Last:   " (time->string (seconds->local-time last-done)))
+		     (if (<  last-done before)
+			 (set! is-in before))
+		     ))
+	       (set! before moment))
+	     (sort (hash-table-keys all-times) <))
+	    is-in)))))
+
+(define (common:extended-cron  cron-str now-seconds-in last-done)
+  (let ((expanded-cron (common:cron-expand cron-str)))
+    (if (string? expanded-cron)
+	(common:cron-event expanded-cron now-seconds-in last-done)
+	(let loop ((hed (car expanded-cron))
+		   (tal (cdr expanded-cron)))
+	  (if (common:cron-event hed now-seconds-in last-done)
+	      #t
+	      (if (null? tal)
+		  #f
+		  (loop (car tal)(cdr tal))))))))
+
+
 
 ;;======================================================================
 ;; misc stuff
 ;;======================================================================
 
-;; (define (debug:print . params) #f)
-;; (define (debug:print-info . params) #f)
-;; 
-;; (define (set-functions dbgp dbgpinfo)
-;;   (set! debug:print dbgp)
-;;   (set! debug:print-info dbgpinfo))
+(define (common:get-signature str)
+  (message-digest-string (md5-primitive) str))
+
+;;======================================================================
+;; hash of hashs
+;;======================================================================
+
+(define (db:hoh-set! dat key1 key2 val)
+  (let* ((subhash (hash-table-ref/default dat key1 #f)))
+    (if subhash
+	(hash-table-set! subhash key2 val)
+	(begin
+	  (hash-table-set! dat key1 (make-hash-table))
+	  (db:hoh-set! dat key1 key2 val)))))
+
+(define (db:hoh-get dat key1 key2)
+  (let* ((subhash (hash-table-ref/default dat key1 #f)))
+    (and subhash
+	 (hash-table-ref/default subhash key2 #f))))
+
+;;======================================================================
+;; when called from a wrapper I need sometimes to find the calling
+;; wrapper, this is for dashboard to find the correct megatest.
+;;
+(define (common:find-local-megatest #!optional (progname "megatest"))
+  (let ((res (filter file-exists?
+		     (map (lambda (updir)
+			    (let* ((lm  (car (argv)))
+				   (dir (pathname-directory lm))
+				   (exe (pathname-strip-directory lm)))
+			      (conc (if dir (conc dir "/") "")
+				    (case (string->symbol exe)
+				      ((dboard)    (conc updir progname))
+				      ((mtest)     (conc updir progname))
+				      ((dashboard) progname)
+				      (else exe)))))
+			  '("../../" "../")))))
+    (if (null? res)
+	(begin
+	  ;; (debug:print 0 *default-log-port* "Failed to find this executable! Using what can be found on the path")
+	  progname)
+	(car res))))
 
 )

Index: configf.scm
==================================================================
--- configf.scm
+++ configf.scm
@@ -25,10 +25,13 @@
 (use regex regex-case matchable) ;;  directory-utils)
 (declare (unit configf))
 (declare (uses process))
 (declare (uses env))
 (declare (uses keys))
+(declare (uses commonmod))
+(declare (uses commonmod.import))
+(import commonmod)
 
 (include "common_records.scm")
 
 ;; return list (path fullpath configname)
 (define (find-config configname #!key (toppath #f))
@@ -97,10 +100,12 @@
 (define configf:var-expand-regex (regexp "^(.*)#\\{(scheme|system|shell|getenv|get|runconfigs-get|rget|scm|sh|rp|gv|g|mtrah)\\s+([^\\}\\{]*)\\}(.*)"))
 
 (define (configf:system ht cmd)
   (system cmd)
   )
+
+(define configf:imports "(import commonmod)")
 
 (define (configf:process-line l ht allow-system #!key (linenum #f))
   (let loop ((res l))
     (if (string? res)
 	(let ((matchdat (string-search configf:var-expand-regex res)))
@@ -111,11 +116,11 @@
 		     (poststr (list-ref matchdat 4))
 		     (result  #f)
 		     (start-time (current-seconds))
 		     (cmdsym  (string->symbol cmdtype))
 		     (fullcmd (case cmdsym
-				((scheme scm) (conc "(lambda (ht)" cmd ")"))
+				((scheme scm) (conc "(lambda (ht)(begin " configf:imports cmd "))"))
 				((system)     (conc "(lambda (ht)(configf:system ht \"" cmd "\"))"))
 				((shell sh)   (conc "(lambda (ht)(string-translate (shell \""  cmd "\") \"\n\" \" \"))"))
 				((realpath rp)(conc "(lambda (ht)(common:nice-path \"" cmd "\"))"))
 				((getenv gv)  (conc "(lambda (ht)(get-environment-variable \"" cmd "\"))"))
 				((mtrah)      (conc "(lambda (ht)"

Index: dashboard-context-menu.scm
==================================================================
--- dashboard-context-menu.scm
+++ dashboard-context-menu.scm
@@ -32,10 +32,11 @@
 (use srfi-1 posix regex regex-case srfi-69)
 (use (prefix sqlite3 sqlite3:))
 
 (declare (unit dashboard-context-menu))
 (declare (uses common))
+(declare (uses commonmod))
 (declare (uses db))
 (declare (uses gutils))
 (declare (uses rmt))
 (declare (uses ezsteps))
 ;; (declare (uses sdb))
@@ -43,10 +44,12 @@
 (declare (uses subrun))
 
 (include "common_records.scm")
 (include "db_records.scm")
 (include "run_records.scm")
+
+(import commonmod)
 
 (define (dboard:launch-testpanel run-id test-id)
   (let* ((dboardexe (common:find-local-megatest "dashboard"))
          (cmd (conc dboardexe
                     " -test " run-id "," test-id

Index: dashboard-guimonitor.scm
==================================================================
--- dashboard-guimonitor.scm
+++ dashboard-guimonitor.scm
@@ -34,10 +34,12 @@
 (declare (unit dashboard-guimonitor))
 (declare (uses common))
 (declare (uses keys))
 (declare (uses db))
 (declare (uses tasks))
+(declare (uses commonmod))
+(import commonmod)
 
 (include "common_records.scm")
 (include "db_records.scm")
 (include "run_records.scm")
 (include "task_records.scm")

Index: dashboard-tests.scm
==================================================================
--- dashboard-tests.scm
+++ dashboard-tests.scm
@@ -31,17 +31,20 @@
 (use srfi-1 posix regex regex-case srfi-69)
 (use (prefix sqlite3 sqlite3:))
 
 (declare (unit dashboard-tests))
 (declare (uses common))
+(declare (uses commonmod))
 (declare (uses db))
 (declare (uses gutils))
 (declare (uses rmt))
 (declare (uses ezsteps))
 ;; (declare (uses sdb))
 ;; (declare (uses filedb))
 (declare (uses subrun))
+
+(import commonmod)
 
 (include "common_records.scm")
 (include "db_records.scm")
 (include "run_records.scm")
 
@@ -459,12 +462,11 @@
 ;;======================================================================
 ;;
 ;;======================================================================
 (define (dashboard-tests:examine-test run-id test-id) ;; run-id run-key origtest)
   (let* ((db-path       (db:dbfile-path)) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/db/" run-id ".db"))
-	 (dbstruct      #f) ;; NOT ACTUALLY USED (db:setup)) ;; (make-dbr:dbstruct path:  (db:dbfile-path #f) ;; (configf:lookup *configdat* "setup" "linktree") 
-			    ;;		   local: #t))
+	 (dbstruct      #f) ;; NOT USED
 	 (testdat        (rmt:get-test-info-by-id run-id test-id)) ;; (db:get-test-info-by-id dbstruct run-id test-id))
 	 (db-mod-time   0) ;; (file-modification-time db-path))
 	 (last-update   0) ;; (current-seconds))
 	 (request-update #t))
     (if (not testdat)

ADDED   dashboard-transport-mode.scm.template
Index: dashboard-transport-mode.scm.template
==================================================================
--- /dev/null
+++ dashboard-transport-mode.scm.template
@@ -0,0 +1,3 @@
+;; 'http or 'tcp
+(rmt:transport-mode 'nfs)
+;; (rmt:transport-mode 'http)

Index: dashboard.scm
==================================================================
--- dashboard.scm
+++ dashboard.scm
@@ -44,20 +44,31 @@
 (declare (uses dcommon))
 (declare (uses dashboard-context-menu))
 (declare (uses vg))
 (declare (uses subrun))
 (declare (uses mt))
-(declare (uses dbfile))        
+(declare (uses dbmod))
+;; (declare (uses dbmemmod))
+(declare (uses dbfile))
+(declare (uses commonmod))
+(declare (uses commonmod.import))
+(import commonmod)
+
+(import dbmod dbfile)
 
 (include "common_records.scm")
 (include "db_records.scm")
 (include "run_records.scm")
 (include "task_records.scm")
 (include "megatest-version.scm")
 (include "megatest-fossil-hash.scm")
 (include "vg_records.scm")
 
+;; set some parameters here - these need to be put in something that can be loaded from other
+;; executables such as dashboard and mtutil
+;;
+(include "dashboard-transport-mode.scm")
 (dbfile:db-init-proc db:initialize-main-db)
 
 (define help (conc 
 	      "Megatest Dashboard, documentation at http://www.kiatoa.com/fossils/megatest version " megatest-version 
               " license GPL, Copyright (C) Matt Welland 2012-2017

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -22,10 +22,22 @@
 ;; Database access
 ;;======================================================================
 
 ;; dbstruct vector containing all the relevant dbs like main.db, megatest.db, run.db etc
 
+(declare (unit db))
+(declare (uses common))
+(declare (uses debugprint))
+(declare (uses dbmod))
+(declare (uses dbfile))
+(declare (uses keys))
+(declare (uses ods))
+(declare (uses client))
+(declare (uses mt))
+(declare (uses commonmod))
+(import commonmod)
+
 (use (srfi 18)
      extras
      tcp
      stack
      (prefix sqlite3 sqlite3:)
@@ -44,28 +56,19 @@
      z3
      typed-records
      matchable
      files)
 
-(declare (unit db))
-(declare (uses common))
-(declare (uses dbmod))
-;; (declare (uses debugprint))
-(declare (uses dbfile))
-(declare (uses keys))
-(declare (uses ods))
-(declare (uses client))
-(declare (uses mt))
-
 (include "common_records.scm")
 (include "db_records.scm")
 (include "key_records.scm")
 (include "run_records.scm")
 
 (define *number-of-writes* 0)
 (define *number-non-write-queries* 0)
 
+(import debugprint)
 (import dbmod)
 (import dbfile)
 
 ;; record for keeping state,status and count for doing roll-ups in
 ;; iterated tests
@@ -74,10 +77,17 @@
   (state #f)
   (status #f)
   (count  0)) 
 
 
+(define (db:with-db dbstruct run-id r/w proc . params)
+  (case (rmt:transport-mode)
+    ((http)(dbfile:with-db dbstruct run-id r/w proc params))
+    ((tcp) (dbmod:with-db dbstruct run-id r/w proc params))
+    ((nfs) (dbmod:with-db dbstruct run-id r/w proc params))
+    (else (assert #f "FATAL: db:with-db called with non-existant transport mode"))))
+
 ;;======================================================================
 ;; hash of hashs
 ;;======================================================================
 
 
@@ -92,19 +102,10 @@
 (define (db:hoh-get dat key1 key2)
   (let* ((subhash (hash-table-ref/default dat key1 #f)))
     (and subhash
 	 (hash-table-ref/default subhash key2 #f))))
 
-(define (db:get-cache-stmth dbdat run-id db stmt)
-  (let* (;; (dbdat       (dbfile:get-dbdat dbstruct run-id))
-	 (stmt-cache  (dbr:dbdat-stmt-cache dbdat))
-	 (stmth       (db:hoh-get stmt-cache db stmt)))
-    (or stmth
-	(let* ((newstmth (sqlite3:prepare db stmt)))
-	  (db:hoh-set! stmt-cache db stmt newstmth)
-	  newstmth))))
-
 ;;======================================================================
 ;; SQLITE3 HELPERS
 ;;======================================================================
 
 (define (db:general-sqlite-error-dump exn stmt . params)
@@ -133,10 +134,83 @@
   (assert *toppath* "FATAL: db:setup called before launch:setup has been run.")
   (let* ((tmpdir (common:get-db-tmp-area)))
     (if (not *dbstruct-dbs*)
 	(dbfile:setup do-sync *toppath* tmpdir)
 	*dbstruct-dbs*)))
+
+;; moved from dbfile
+;;
+;; ADD run-id SUPPORT
+;;
+(define (db:create-all-triggers dbstruct)
+  (db:with-db
+   dbstruct #f #f
+   (lambda (dbdat db)
+     (db:create-triggers db))))
+
+(define (db:create-triggers db)
+    (for-each (lambda (key)
+              (sqlite3:execute db (cadr key)))
+          db:trigger-list))
+
+(define (db:drop-all-triggers dbstruct)
+  (db:with-db
+   dbstruct #f #f
+   (lambda (dbdat db)
+     (db:drop-triggers db))))
+
+(define (db:have-incompletes? dbstruct run-id ovr-deadtime)
+  (let* ((incompleted '())
+	 (oldlaunched '())
+	 (toplevels   '())
+	 ;; (deadtime-str (configf:lookup *configdat* "setup" "deadtime"))
+	 (deadtime  (or ovr-deadtime 72000))) ;; twenty hours
+    (db:with-db
+     dbstruct run-id #f
+     (lambda (dbdat db)
+       
+       ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
+       ;;
+       ;; HOWEVER: this code in run:test seems to work fine
+       ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
+       ;;                     (db:test-get-run_duration testdat)))
+       ;;                    600) 
+       ;; (db:delay-if-busy dbdat)
+       (sqlite3:for-each-row 
+        (lambda (test-id run-dir uname testname item-path)
+          (if (and (equal? uname "n/a")
+                   (equal? item-path "")) ;; this is a toplevel test
+              ;; what to do with toplevel? call rollup?
+              (begin
+                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels)))
+                ;; (print-info "Found old toplevel test in RUNNING state, test-id=" test-id))
+              (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))
+        (db:get-cache-stmth dbdat db
+        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');")
+        run-id deadtime)
+
+       ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
+       ;;
+       ;; (db:delay-if-busy dbdat)
+       (sqlite3:for-each-row
+        (lambda (test-id run-dir uname testname item-path)
+          (if (and (equal? uname "n/a")
+                   (equal? item-path "")) ;; this is a toplevel test
+              ;; what to do with toplevel? call rollup?
+              (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+              (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))
+        (db:get-cache-stmth dbdat db
+        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');")
+        run-id)
+       
+       ;; (print-info "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")
+       (if (and (null? incompleted)
+                (null? oldlaunched)
+                (null? toplevels))
+           #f
+           #t)))))
+
 
 ;; looks up subdb and returns it, if not found then set up
 ;; and then return it.
 ;;
 #;(define (db:get-db dbstruct run-id)
@@ -359,13 +433,15 @@
 (define (db:cache-for-read-only source target #!key (use-last-update #f))
   (if (and (hash-table-ref/default *global-db-store* target #f)
 	   (>= (file-modification-time target)(file-modification-time source)))
       (hash-table-ref *global-db-store* target)
       (let* ((toppath   (launch:setup))
-	     (targ-db-last-mod (if (common:file-exists? target)
-				   (file-modification-time target)
-				   0))
+	     (targ-db-last-mod (db:get-sqlite3-mod-time target))
+;;	      (if (common:file-exists? target)
+;; BUG: This needs to include wal mode stuff .shm etc.
+;;				   (file-modification-time target)
+;;				   0))
 	     (cache-db  (or (hash-table-ref/default *global-db-store* target #f)
 			    (db:open-megatest-db path: target)))
 	     (source-db (db:open-megatest-db path: source))
 	     (curr-time (current-seconds))
 	     (res      '())
@@ -405,35 +481,54 @@
 ;; 			  use-last-update: #t)))
 ;; 	  (thread-start! th1)
 ;; 	  (apply proc cache-db params)
 ;; 	  ))))
 
-
-
-
+(define (db:get-sqlite3-mod-time fname)
+  (let* ((wal-file (conc fname "-wal"))
+	 (shm-file (conc fname "-shm"))
+	 (get-mtime (lambda (f)
+		      (if (and (file-exists? f)
+			       (file-read-access? f))
+			  (file-modification-time f)
+			  0))))
+    (max (get-mtime fname)
+	 (get-mtime wal-file)
+	 (get-mtime shm-file))))
+	 
 (define (db:all-db-sync dbstruct)
   (let* ((dbdat (db:open-db dbstruct #f db:initialize-main-db))
 	 (data-synced       0) ;; count of changed records
     (tmp-area       (common:get-db-tmp-area))
     (dbfiles        (glob (conc tmp-area"/.megatest/*.db")))
     (sync-durations (make-hash-table))
     (no-sync-db        (db:open-no-sync-db)))
     (for-each
-     (lambda (file)
+     (lambda (file) ;; tmp db file
        (debug:print-info 3 *default-log-port* "file: " file)
-       (let* ((fname       (conc (pathname-file file) ".db"))
-	      (fulln       (conc *toppath*"/.megatest/"fname))
-	      (time1       (if (file-exists? file)
-			       (file-modification-time file)
-			       (begin
-				 (debug:print-info 2 *default-log-port* "Sync - I do not see file "file)
-			   1)))
-	      (time2       (if (file-exists? fulln)
-			       (file-modification-time fulln)
-			       (begin
-				 (debug:print-info 2 *default-log-port* "Sync - I do not see file "fulln)
-				 0)))
+       (let* ((fname       (conc (pathname-file file) ".db")) ;; fname is tmp db file
+              (wal-file (conc fname "-wal"))
+              (shm-file (conc fname "-shm"))
+	      (fulln       (conc *toppath*"/.megatest/"fname)) ;; fulln is nfs db name
+              (wal-time     (if (file-exists? wal-file)             
+			       (file-modification-time wal-file)
+                               0))
+              (shm-time     (if (file-exists? shm-file)             
+			       (file-modification-time shm-file)
+                               0))
+	      (time1        (db:get-sqlite3-mod-time file))
+;;	       (if (file-exists? file)              ;; time1 is the max itime of the tmp db, -wal and -shm files.
+;;			       (max (file-modification-time file) wal-time shm-time)
+;;			       (begin
+;;				 (debug:print-info 2 *default-log-port* "Sync - I do not see file "file)
+;;			   1)))
+	      (time2       (db:get-sqlite3-mod-time fulln))
+;;	      (if (file-exists? fulln)             ;; time2 is nfs file time
+;;			       (file-modification-time fulln)
+;;			       (begin
+;;				 (debug:print-info 2 *default-log-port* "Sync - I do not see file "fulln)
+;;				 0)))
 	      (changed      (> (- time1 time2) (+ (random 5) 1)))  ;; it has been at some few seconds since last synced
 	      (changed10    (> (- time1 time2) 10)) ;; it has been at least ten seconds since sync'd
 	      (jfile-exists (file-exists? (conc file"-journal"))) ;; i.e. are we busy?
 	      (do-cp        (cond
 			     ((not (file-exists? fulln)) ;; shouldn't happen, but this might recover
@@ -456,13 +551,30 @@
 	       (hash-table-set! sync-durations (conc fname".db")
 				(- (current-milliseconds) start-time)))
 	     (debug:print-info 3 *default-log-port* "skipping sync. " file " is up to date")
          )))
      dbfiles)
-    (if dbdat (dbfile:add-dbdat dbstruct #f dbdat)))
+    ;; WHY does the dbdat need to be added back?
+    (if dbdat (dbfile:add-dbdat dbstruct #f dbdat))
+    )
   #t)
 
+(define (db:kill-servers)
+  (let* ((servers (server:choose-server *toppath* 'all-valid))) ;; (server:get-list *toppath*))
+    (for-each
+     (lambda (server)
+       (handle-exceptions
+           exn
+         (begin 
+           (debug:print-info 0 *default-log-port*  "Unable to get host and/or port from " server ", exn=" exn)     
+           #f)
+	 (match-let (((mod-time host port start-time server-id pid) server))
+		    (if (and host pid)
+			(tasks:kill-server host pid)))))
+     servers)
+    (delete-file* (common:get-sync-lock-filepath))))
+
 ;; options:
 ;;
 ;;  'killservers  - kills all servers
 ;;  'dejunk       - removes junk records
 ;;  'adj-testids  - move test-ids into correct ranges
@@ -473,117 +585,97 @@
 ;;  run-ids: '(1 2 3 ...) or #f (for all)
 ;;
 (define (db:multi-db-sync dbstruct . options)
   (let* (;; (dbdat       (db:open-db dbstruct #f dbfile:db-init-proc))
 	 (data-synced 0) ;; count of changed records
-    (tmp-area       (common:get-db-tmp-area))
-    (old2new (member 'old2new options))
-    (dejunk (member 'dejunk options))
-    (killservers (member 'killservers options))
-    (servers (server:choose-server *toppath* 'all-valid)) ;; (server:get-list *toppath*))
-    (src-area (if old2new *toppath* tmp-area))
-    (dest-area (if old2new tmp-area *toppath*))
-    (dbfiles        (if old2new (glob (conc *toppath* "/.megatest/*.db")) (glob (conc tmp-area "/.megatest/*.db"))))
-    (keys (db:get-keys dbstruct))
-    (sync-durations (make-hash-table)))
-
-
-    (if killservers
-      (begin
-       	  (for-each
-	   (lambda (server)
-             (handle-exceptions
-             exn
-             (begin 
-               (debug:print-info 0 *default-log-port*  "Unable to get host and/or port from " server ", exn=" exn)     
-               #f)
-	     (match-let (((mod-time host port start-time server-id pid) server))
-	       (if (and host pid)
-		   (tasks:kill-server host pid)))))
-	   servers)
-          (delete-file* (common:get-sync-lock-filepath))
-      )
-    )
-    (for-each
-     (lambda (srcfile)
-       (debug:print-info 3 *default-log-port* "file: " srcfile)
-       (let* ((fname (conc (pathname-file srcfile) ".db"))
-              (basename (pathname-file srcfile))
-              (run-id (if (string= basename "main") #f (string->number basename)))
-	      (destfile (conc dest-area "/.megatest/" fname))
-              (dest-directory  (conc dest-area "/.megatest/"))
-              (dummy (debug:print-info 2 *default-log-port* "destfile = " destfile))
-              (dummy2 (debug:print-info 2 *default-log-port* "dejunk = " dejunk))
-	      (time1 (file-modification-time srcfile))
-              (time2 (if (file-exists? destfile)
-                         (begin
-                            (debug:print-info 2 *default-log-port* "destfile " destfile " exists")
-			    (file-modification-time destfile)
-                         )
-			 (begin
-			   (debug:print-info 0 *default-log-port* "Sync - I do not see file " destfile)
-			   0)))
-	      (changed ( < (- time2 time1) 6.0)) ;; dest db not updated within last 6 seconds
-
-      (do-cp (cond
-		      ((not (file-exists? destfile)) ;; shouldn't happen, but this might recover
-		       (debug:print-info 2 *default-log-port* "File " destfile " not found. Copying "srcfile" to "destfile)
-                       (system (conc "/bin/mkdir -p " dest-directory))
-                       (system (conc "/bin/cp " srcfile " " destfile))
-		       #t)
-		      (changed ;; (and changed
-		       ;; (> (- (current-seconds) time1) 3)) ;; if file is changed and three seconds have passed.
-		       #t)
-		      ((and changed *time-to-exit*) ;; last sync
-		       #t)
-		      (else
-		       #f))))
-          (if (or dejunk do-cp)
-	     (let* (
-                    (start-time (current-milliseconds))
-
-                    (subdb (or (dbfile:get-subdb dbstruct run-id) (dbfile:init-subdb dbstruct run-id dbfile:db-init-proc)))
-                    (mtdb      (dbr:subdb-mtdbdat subdb))
-                    (tmpdb     (dbfile:open-db dbstruct run-id dbfile:db-init-proc))
-
-                    )
-	       (debug:print-info 2 *default-log-port* "delta syncing file: " srcfile ", time diff: " (- time1 time2) " seconds")
-
-               (if old2new
-                 (begin
-                   (if dejunk (db:clean-up run-id mtdb))
-		   (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) #f mtdb tmpdb)
-                 )
-                 (begin
-                   (if dejunk (db:clean-up run-id tmpdb))
-		   (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) #f tmpdb mtdb)
-                 )
-               )
-	       (hash-table-set! sync-durations (conc srcfile ".db") (- (current-milliseconds) start-time)))
-	     (debug:print-info 2 *default-log-port* "skipping delta sync. " srcfile " is up to date")
-          )
-       )
-     )
-     dbfiles
-    )
-    data-synced
-  )
-)
+	 (tmp-area       (common:get-db-tmp-area))
+	 (old2new (member 'old2new options))
+	 (dejunk (member 'dejunk options))
+	 (killservers (member 'killservers options))
+	 (src-area (if old2new *toppath* tmp-area))
+	 (dest-area (if old2new tmp-area *toppath*))
+	 (dbfiles        (if old2new (glob (conc *toppath* "/.megatest/*.db")) (glob (conc tmp-area "/.megatest/*.db"))))
+	 (keys (db:get-keys dbstruct))
+	 (sync-durations (make-hash-table)))
+
+    ;; kill servers
+    (if killservers (db:kill-servers))
+    
+    (if (not dbfiles)
+	(debug:print-error 0 *default-log-port* "no dbfiles found in " (conc *toppath* "/.megatest"))
+	(for-each
+	 (lambda (srcfile)
+	   (debug:print-info 3 *default-log-port* "file: " srcfile)
+	   (let* ((fname    (conc (pathname-file srcfile) ".db"))
+		  (basename (pathname-file srcfile))
+		  (run-id   (if (string= basename "main") #f (string->number basename)))
+		  (destfile (conc dest-area "/.megatest/" fname))
+		  (dest-directory  (conc dest-area "/.megatest/"))
+		  (time1    (file-modification-time srcfile))
+		  (time2    (if (file-exists? destfile)
+				(begin
+				  (debug:print-info 2 *default-log-port* "destfile " destfile " exists")
+				  (file-modification-time destfile))
+				(begin
+				  (debug:print-info 0 *default-log-port* "Sync - I do not see file " destfile)
+				  0)))
+		  (changed   ( < (- time2 time1) 6.0)) ;; dest db not updated within last 6 seconds
+
+		  (do-cp     (cond
+			      ((not (file-exists? destfile)) ;; shouldn't happen, but this might recover
+			       (debug:print-info 2 *default-log-port* "File " destfile " not found. Copying "srcfile" to "destfile)
+			       ;; TODO: Need to fix this for WAL mod. Can't just copy.
+			       (system (conc "/bin/mkdir -p " dest-directory))
+			       (system (conc "/bin/cp " srcfile " " destfile))
+			       #t)
+			      (changed ;; (and changed
+			       #t)
+			      ((and changed *time-to-exit*) ;; last sync
+			       #t)
+			      (else
+			       #f))))
+             (if (or dejunk do-cp)
+		 (let* ((start-time (current-milliseconds))
+			;; subdb is misnamed - should be dbdat (I think...)
+			(subdb    (dbfile:open-db dbstruct run-id dbfile:db-init-proc))
+			;;        (or (dbfile:get-subdb dbstruct run-id)
+			;;            (dbfile:init-subdb dbstruct run-id dbfile:db-init-proc)))
+			(mtdb      (dbr:subdb-mtdbdat subdb))
+			;;
+			;; BUG: -mrw- I think this next line is wrong. run-id should be the path to .megatest/<runid>.db
+			;; 
+			(tmpdb     (dbfile:open-db dbstruct run-id dbfile:db-init-proc)))
+		   
+		   (debug:print-info 2 *default-log-port* "delta syncing file: " srcfile ", time diff: " (- time1 time2) " seconds")
+		   (if old2new
+                       (begin
+			 (if dejunk (db:clean-up run-id mtdb))
+			 (db:sync-tables (db:sync-all-tables-list
+					  dbstruct
+					  (db:get-keys dbstruct))
+					 #f mtdb tmpdb))
+                       (begin
+			 (if dejunk (db:clean-up run-id tmpdb))
+			 (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) #f tmpdb mtdb)))
+		   (hash-table-set! sync-durations (conc srcfile ".db") (- (current-milliseconds) start-time)))
+		 (debug:print-info 2 *default-log-port* "skipping delta sync. " srcfile " is up to date"))))
+	 dbfiles))
+    data-synced))
 
 ;; Sync all changed db's
 ;;
 (define (db:tmp->megatest.db-sync dbstruct run-id last-update)
   (let* ((subdbs (hash-table-values (dbr:dbstruct-subdbs dbstruct)))
 	 (res    '()))
     (for-each
      (lambda (subdb)
-       (let* ((dbname (db:run-id->dbname run-id))
-	      (mtdb   (dbr:subdb-mtdb subdb))
+       (let* ((mtdb   (dbr:subdb-mtdb subdb))
 	      (tmpdb  (db:get-subdb dbstruct run-id))
 	      (refndb (dbr:subdb-refndb subdb))
 	      (newres (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) last-update tmpdb refndb mtdb)))
 	 ;; (stack-push! (dbr:subdb-dbstack subdb) tmpdb)
+	 ;; BUG: verify this is really needed
 	 (dbfile:add-dbdat dbstruct run-id tmpdb)
 	 (set! res (cons newres res))))
      subdbs)
     res))
 
@@ -840,19 +932,21 @@
                               test_id      INTEGER,
                               update_time  TIMESTAMP,
                               cpuload      INTEGER DEFAULT -1,
                               diskfree     INTEGER DEFAULT -1,
                               diskusage    INTGER DEFAULT -1,
-                              run_duration INTEGER DEFAULT 0);")
+                              run_duration INTEGER DEFAULT 0,
+ last_update INTEGER DEFAULT (strftime('%s','now')));")
 	(sqlite3:execute db "CREATE TABLE IF NOT EXISTS archives (
                               id           INTEGER PRIMARY KEY,
                               test_id      INTEGER,
                               state        TEXT DEFAULT 'new',
                               status       TEXT DEFAULT 'n/a',
                               archive_type TEXT DEFAULT 'bup',
                               du           INTEGER,
-                              archive_path TEXT);")))
+                              archive_path TEXT,
+ last_update INTEGER DEFAULT (strftime('%s','now')));")))
         (db:create-triggers db)    
      db)) ;; )
 
 ;;======================================================================
 ;; A R C H I V E S
@@ -886,10 +980,11 @@
 	  "SELECT d.id,d.archive_area_name,disk_path,last_df,last_df_time FROM archive_disks AS d
              INNER JOIN archive_blocks AS b ON d.id=b.archive_disk_id
              WHERE b.id IN (" (string-intersperse (map conc res) ",") ") AND
          last_df > ?;")
 	 dneeded))
+    ;; BUG: Verfify this is really needed
     (dbfile:add-dbdat dbstruct #f dbdat)
     blocks))
     
 ;; returns id of the record, register a disk allocated to archiving and record it's last known
 ;; available space
@@ -990,64 +1085,64 @@
 
 ;;======================================================================
 ;; M A I N T E N A N C E
 ;;======================================================================
 
-(define (db:have-incompletes? dbstruct run-id ovr-deadtime)
-  (let* ((incompleted '())
-	 (oldlaunched '())
-	 (toplevels   '())
-	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime"))
-	 (deadtime     (if (and deadtime-str
-				(string->number deadtime-str))
-			   (string->number deadtime-str)
-			   72000))) ;; twenty hours
-    (db:with-db
-     dbstruct run-id #f
-     (lambda (dbdat db)
-       (if (number? ovr-deadtime)(set! deadtime ovr-deadtime))
-       
-       ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
-       ;;
-       ;; HOWEVER: this code in run:test seems to work fine
-       ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
-       ;;                     (db:test-get-run_duration testdat)))
-       ;;                    600) 
-       ;; (db:delay-if-busy dbdat)
-       (sqlite3:for-each-row 
-        (lambda (test-id run-dir uname testname item-path)
-          (if (and (equal? uname "n/a")
-                   (equal? item-path "")) ;; this is a toplevel test
-              ;; what to do with toplevel? call rollup?
-              (begin
-                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
-                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
-              (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))
-        db
-        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');"
-        run-id deadtime)
-
-       ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
-       ;;
-       ;; (db:delay-if-busy dbdat)
-       (sqlite3:for-each-row
-        (lambda (test-id run-dir uname testname item-path)
-          (if (and (equal? uname "n/a")
-                   (equal? item-path "")) ;; this is a toplevel test
-              ;; what to do with toplevel? call rollup?
-              (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
-              (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))
-        db
-        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');"
-        run-id)
-       
-       (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")
-       (if (and (null? incompleted)
-                (null? oldlaunched)
-                (null? toplevels))
-           #f
-           #t)))))
+;; (define (db:have-incompletes? dbstruct run-id ovr-deadtime)
+;;   (let* ((incompleted '())
+;; 	 (oldlaunched '())
+;; 	 (toplevels   '())
+;; 	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime"))
+;; 	 (deadtime     (if (and deadtime-str
+;; 				(string->number deadtime-str))
+;; 			   (string->number deadtime-str)
+;; 			   72000))) ;; twenty hours
+;;     (db:with-db
+;;      dbstruct run-id #f
+;;      (lambda (dbdat db)
+;;        (if (number? ovr-deadtime)(set! deadtime ovr-deadtime))
+;;        
+;;        ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
+;;        ;;
+;;        ;; HOWEVER: this code in run:test seems to work fine
+;;        ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
+;;        ;;                     (db:test-get-run_duration testdat)))
+;;        ;;                    600) 
+;;        ;; (db:delay-if-busy dbdat)
+;;        (sqlite3:for-each-row 
+;;         (lambda (test-id run-dir uname testname item-path)
+;;           (if (and (equal? uname "n/a")
+;;                    (equal? item-path "")) ;; this is a toplevel test
+;;               ;; what to do with toplevel? call rollup?
+;;               (begin
+;;                 (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+;;                 (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+;;               (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))
+;;         (db:get-cache-stmth dbdat db
+;;         "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');")
+;;         run-id deadtime)
+;; 
+;;        ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
+;;        ;;
+;;        ;; (db:delay-if-busy dbdat)
+;;        (sqlite3:for-each-row
+;;         (lambda (test-id run-dir uname testname item-path)
+;;           (if (and (equal? uname "n/a")
+;;                    (equal? item-path "")) ;; this is a toplevel test
+;;               ;; what to do with toplevel? call rollup?
+;;               (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+;;               (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))
+;;         (db:get-cache-stmth dbdat db
+;;         "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');")
+;;         run-id)
+;;        
+;;        (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")
+;;        (if (and (null? incompleted)
+;;                 (null? oldlaunched)
+;;                 (null? toplevels))
+;;            #f
+;;            #t)))))
 
 (define (db:get-status-from-final-status-file run-dir)
   (let ((infile (conc run-dir "/.final-status")))
     ;; first verify we are able to write the output file
     (if (not (file-read-access? infile))
@@ -1086,21 +1181,21 @@
 
     (db:with-db 
      dbstruct run-id #f
      (lambda (dbdat db)
        (let* ((stmth1 (db:get-cache-stmth
-		       dbdat run-id db
+		       dbdat db
 		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                            WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                           AND state IN ('RUNNING');"))
 	      (stmth2 (db:get-cache-stmth
-		       dbdat run-id db
+		       dbdat db
 		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                            WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                           AND state IN ('REMOTEHOSTSTART');"))
 	      (stmth3 (db:get-cache-stmth
-		       dbdat run-id db
+		       dbdat db
 		       "SELECT id,rundir,uname,testname,item_path FROM tests
                            WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400
                                           AND state IN ('LAUNCHED');")))
 	 ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
 	 ;;
@@ -1365,45 +1460,53 @@
 ;; 	  (set! *last-global-delta-printed* *global-delta*)))
 
 (define (db:set-var dbstruct var val)
   (db:with-db dbstruct #f #t 
 	      (lambda (dbdat db)
-		(sqlite3:execute db "INSERT OR REPLACE INTO metadat (var,val) VALUES (?,?);" var val))))
+		(sqlite3:execute  (db:get-cache-stmth dbdat db "INSERT OR REPLACE INTO metadat (var,val) VALUES (?,?);") var val))))
 
 (define (db:add-var dbstruct var val)
   (db:with-db dbstruct #f #t 
 	      (lambda (dbdat db)
-		(sqlite3:execute db "UPDATE metadat SET val=val+? WHERE var=?;" val var))))
+		(sqlite3:execute  (db:get-cache-stmth dbdat db "UPDATE metadat SET val=val+? WHERE var=?;") val var))))
 
 (define (db:del-var dbstruct var)
   (db:with-db dbstruct #f #t 
 	      (lambda (dbdat db)
-		(sqlite3:execute db "DELETE FROM metadat WHERE var=?;" var))))
+		(sqlite3:execute  (db:get-cache-stmth dbdat db "DELETE FROM metadat WHERE var=?;") var))))
 
 ;;======================================================================
 ;; no-sync.db - small bits of data to be shared between servers
 ;;======================================================================
 
-(define (db:no-sync-db db-in)
-  (if db-in
-      db-in
-      (if *no-sync-db*
-	  *no-sync-db*
-	  (begin
-	    (mutex-lock! *db-access-mutex*)
-	    (let ((dbpath (common:get-db-tmp-area))
-		  (db     (dbfile:open-no-sync-db dbpath)))
-	      (set! *no-sync-db* db)
-	      (mutex-unlock! *db-access-mutex*)
-	      db)))))
+(define (db:get-dbsync-path)
+  (case (rmt:transport-mode)
+    ((http)(common:get-db-tmp-area))
+    ((tcp) (conc *toppath*"/.megatest"))
+    ((nfs) (conc *toppath*"/.megatest"))
+    (else "/tmp/dunno-this-gonna-exist")))
+
+ ;; (define (db:no-sync-db db-in)
+ ;;   (if db-in
+ ;;       db-in
+ ;;       (if *no-sync-db*
+ ;; 	  *no-sync-db*
+ ;; 	  (begin
+ ;; 	    (mutex-lock! *db-access-mutex*)
+ ;; 	    (let ((dbpath (db:get-dbsync-path))
+ ;; 		  (db     (dbfile:open-no-sync-db dbpath)))
+ ;; 	      (assert (sqlite3:database? db) "FATAL: db:no-sync-db failed to open a database")
+ ;; 	      (set! *no-sync-db* db)
+ ;; 	      (mutex-unlock! *db-access-mutex*)
+ ;; 	      db)))))
 
 (define (with-no-sync-db proc)
-  (let* ((db  (db:no-sync-db *no-sync-db*)))
+  (let* ((db  (db:open-no-sync-db)))
     (proc db)))
 
 (define (db:open-no-sync-db)
-  (dbfile:open-no-sync-db (db:dbfile-path)))
+  (dbfile:open-no-sync-db (db:get-dbsync-path)))
 
 (define (db:no-sync-close-db db stmt-cache)
   (db:safely-close-sqlite3-db db stmt-cache))
 
 
@@ -1942,19 +2045,20 @@
     (debug:print-info 4 *default-log-port* "runs:get-runs-by-patt qry=" qry-str " " runnamepatt)
     ;(print "runs:get-runs-by-patt qry=" qry-str " " runnamepatt)
 
     (vector header 
             (reverse
-             (db:with-db dbstruct #f #f ;; reads db, does not write to it.
-                         (lambda (dbdat db)
-                           (sqlite3:fold-row
-                            (lambda (res . r)
-                              (cons (list->vector r) res))
-                            '()
-                            db
-                            qry-str
-                            runnamepatt)))))))
+             (db:with-db
+	      dbstruct #f #f ;; reads db, does not write to it.
+              (lambda (dbdat db)
+                (sqlite3:fold-row
+                 (lambda (res . r)
+                   (cons (list->vector r) res))
+                 '()
+                 db
+                 qry-str
+                 runnamepatt)))))))
 
 ;; use (get-value-by-header (db:get-header runinfo)(db:get-rows runinfo))
 ;; NOTE: Does NOT return a list of rows (or one row) for the first slot of the vector
 ;;       this is inconsistent with get-runs but it makes some sense.
 ;;
@@ -2029,28 +2133,32 @@
    (lambda (dbdat db)
        (if msg
          (sqlite3:execute db "UPDATE runs SET status=?,comment=? WHERE id=?;" status msg run-id)
          (sqlite3:execute db "UPDATE runs SET status=? WHERE id=?;" status run-id)))))
 
+(define (db:set-run-state-status-db dbdat db run-id state status )
+  (sqlite3:execute
+   (db:get-cache-stmth
+    dbdat db "UPDATE runs SET status=?,state=? WHERE id=?;") status state run-id))
+
 (define (db:set-run-state-status dbstruct run-id state status )
   (db:with-db
    dbstruct #f #f
    (lambda (dbdat db)
-          (sqlite3:execute db "UPDATE runs SET status=?,state=? WHERE id=?;" status state run-id))))
-
-
-
+     (db:set-run-state-status-db dbdat db run-id state status))))
+     
 (define (db:get-run-status dbstruct run-id)
   (let ((res "n/a"))
     (db:with-db
      dbstruct #f #f
      (lambda (dbdat db)
        (sqlite3:for-each-row 
 	(lambda (status)
 	  (set! res status))
-	db
-	"SELECT status FROM runs WHERE id=?;" 
+	(db:get-cache-stmth
+	 dbdat db
+	 "SELECT status FROM runs WHERE id=?;" )
 	run-id)
        res))))
 
 (define (db:get-run-state dbstruct run-id)
   (let ((res "n/a"))
@@ -2058,12 +2166,27 @@
      dbstruct #f #f
      (lambda (dbdat db)
        (sqlite3:for-each-row 
 	(lambda (status)
 	  (set! res status))
-	db
-	"SELECT state FROM runs WHERE id=?;" 
+	(db:get-cache-stmth
+	 dbdat db
+	 "SELECT state FROM runs WHERE id=?;" )
+	run-id)
+       res))))
+
+(define (db:get-run-state-status dbstruct run-id)
+  (let ((res (cons "n/a" "n/a")))
+    (db:with-db
+     dbstruct #f #f
+     (lambda (dbdat db)
+       (sqlite3:for-each-row 
+	(lambda (state status)
+	  (set! res (cons state status)))
+	(db:get-cache-stmth
+	 dbdat db
+	 "SELECT state,status FROM runs WHERE id=?;" )
 	run-id)
        res))))
 
 
 ;;======================================================================
@@ -2306,12 +2429,12 @@
 		  (sqlite3:for-each-row
 		   (lambda (run-id testname item-path state status)
 		     ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment
 		     (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-")))
 		   db 
-		   "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=?;" 
-		   test-id)))
+		   "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;" 
+		   test-id run-id)))
     res))
 
 ;; get a useful subset of the tests data (used in dashboard
 ;; use db:mintest-get-{id ,run_id,testname ...}
 ;;
@@ -2402,25 +2525,28 @@
 ;;      NOTE: run-id is not used
 ;; ;;
 (define (db:test-set-state-status dbstruct run-id test-id newstate newstatus newcomment)
   (db:with-db
    dbstruct
-   run-id
-   #t
+   run-id #f
    (lambda (dbdat db)
-     (cond
-      ((and newstate newstatus newcomment)
-       (sqlite3:execute db "UPDATE tests SET state=?,status=?,comment=? WHERE id=?;" newstate newstatus newcomment ;; (sdb:qry 'getid newcomment)
-			test-id))
-      ((and newstate newstatus)
-       (sqlite3:execute db "UPDATE tests SET state=?,status=? WHERE id=?;" newstate newstatus test-id))
-      (else
-       (if newstate   (sqlite3:execute db "UPDATE tests SET state=?   WHERE id=?;" newstate   test-id))
-       (if newstatus  (sqlite3:execute db "UPDATE tests SET status=?  WHERE id=?;" newstatus  test-id))
-       (if newcomment (sqlite3:execute db "UPDATE tests SET comment=? WHERE id=?;" newcomment ;; (sdb:qry 'getid newcomment)
-				       test-id))))))
-  (mt:process-triggers dbstruct run-id test-id newstate newstatus))
+     (db:test-set-state-status-db db run-id test-id newstate newstatus newcomment))))
+
+(define (db:test-set-state-status-db db run-id test-id newstate newstatus newcomment)
+  (cond
+   ((and newstate newstatus newcomment)
+    (sqlite3:execute db "UPDATE tests SET state=?,status=?,comment=? WHERE id=?;" newstate newstatus newcomment ;; (sdb:qry 'getid newcomment)
+		     test-id))
+   ((and newstate newstatus)
+    (sqlite3:execute db "UPDATE tests SET state=?,status=? WHERE id=?;" newstate newstatus test-id))
+   (else
+    (if newstate   (sqlite3:execute db "UPDATE tests SET state=?   WHERE id=?;" newstate   test-id))
+    (if newstatus  (sqlite3:execute db "UPDATE tests SET status=?  WHERE id=?;" newstatus  test-id))
+    (if newcomment (sqlite3:execute db "UPDATE tests SET comment=? WHERE id=?;" newcomment ;; (sdb:qry 'getid newcomment)
+				    test-id))))
+  ;; (mt:process-triggers dbstruct run-id test-id newstate newstatus)) ;; NOTE: Moved into calling function
+  )
 
 ;; NEW BEHAVIOR: Count tests running in all runs!
 ;;
 (define (db:get-count-tests-running dbstruct run-id) ;; fastmode)
   (let* ((qry ;; (if fastmode
@@ -2429,11 +2555,11 @@
   (db:with-db
    dbstruct
    run-id
    #f
    (lambda (dbdat db)
-     (let* ((stmth (db:get-cache-stmth dbdat run-id db qry)))
+     (let* ((stmth (db:get-cache-stmth dbdat db qry)))
        (sqlite3:first-result stmth))))))
 
 ;; NEW BEHAVIOR: Count tests running in only one run!
 ;;
 (define (db:get-count-tests-actually-running dbstruct run-id)
@@ -2459,11 +2585,11 @@
     (db:with-db
      dbstruct
      run-id
      #f
      (lambda (dbdat db)
-       (let* ((stmth (db:get-cache-stmth dbdat run-id db qry)))
+       (let* ((stmth (db:get-cache-stmth dbdat db qry)))
 	 (sqlite3:first-result stmth run-id))))))
 
 ;; For a given testname how many items are running? Used to determine
 ;; probability for regenerating html
 ;;
@@ -2472,11 +2598,11 @@
    dbstruct
    run-id
    #f
    (lambda (dbdat db)
      (let* ((stmt "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? AND NOT (uname = 'n/a' AND item_path = '') AND testname=?;")
-	    (stmth (db:get-cache-stmth dbdat run-id db stmt)))
+	    (stmth (db:get-cache-stmth dbdat db stmt)))
        (sqlite3:first-result
 	stmth run-id testname)))))
 
 (define (db:get-not-completed-cnt dbstruct run-id)
 (db:with-db
@@ -2672,11 +2798,11 @@
      (lambda (run-id)
        (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)))
 	 (db:prep-megatest.db-adj-test-ids (dbr:dbdat-dbh mtdb) run-id testrecs)))
      run-ids)))
 
-;; Get test data using test_id, run-id is not used
+;; Get test data using test_id
 ;; 
 (define (db:get-test-info-by-id dbstruct run-id test-id)
   (db:with-db
    dbstruct
    run-id
@@ -2685,12 +2811,28 @@
      (let ((res #f))
        (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test
 	(lambda (id run-id testname state status event-time host cpuload diskfree uname rundir-id item-path run_duration final-logf-id comment short-dir-id attemptnum archived last-update)
 	  ;;                0    1       2      3      4        5       6      7        8     9     10      11          12          13           14         15          16
 	  (set! res (vector id run-id testname state status event-time host cpuload diskfree uname rundir-id item-path run_duration final-logf-id comment short-dir-id attemptnum archived last-update)))
-	db
-	(conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id=?;")
+	(db:get-cache-stmth dbdat db
+			    (conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id=?;"))
+	test-id)
+       res))))
+
+;; Get test state, status using test_id
+;; 
+(define (db:get-test-state-status-by-id dbstruct run-id test-id)
+  (db:with-db
+   dbstruct
+   run-id
+   #f
+   (lambda (dbdat db)
+     (let ((res (cons #f #f)))
+       (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test
+	(lambda (state status)
+	  (cons state status))
+	(db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=?;")
 	test-id)
        res))))
 
 ;; Use db:test-get* to access
 ;; Get test data using test_ids. NB// Only works within a single run!!
@@ -2715,18 +2857,21 @@
   (db:with-db
    dbstruct
    run-id
    #f
    (lambda (dbdat db)
-     (let ((res #f))
-       (sqlite3:for-each-row
-	(lambda (a . b)
-	  (set! res (apply vector a b)))
-	db
-	(conc "SELECT " db:test-record-qry-selector " FROM tests WHERE testname=? AND item_path=? AND run_id=?;")
-	test-name item-path run-id)
-       res))))
+     (db:get-test-info-db db run-id test-name item-path))))
+
+(define (db:get-test-info-db db run-id test-name item-path)
+  (let ((res #f))
+    (sqlite3:for-each-row
+     (lambda (a . b)
+       (set! res (apply vector a b)))
+     db
+     (conc "SELECT " db:test-record-qry-selector " FROM tests WHERE testname=? AND item_path=? AND run_id=?;")
+     test-name item-path run-id)
+    res))
 
 (define (db:test-get-rundir-from-test-id dbstruct run-id test-id)
   (db:with-db
    dbstruct
    run-id
@@ -2843,11 +2988,11 @@
     (db:with-db
      dbstruct
      run-id
      #f
      (lambda (dbdat db)
-       (let* ((stmth (db:get-cache-stmth dbdat #f db stmt))
+       (let* ((stmth (db:get-cache-stmth dbdat db stmt))
 	      (res   (sqlite3:fold-row
 		      (lambda (res id test-id  category variable value expected tol units comment status type last-update)
 			(vector id test-id  category variable value expected tol units comment status type last-update))
 		      (vector #f #f #f #f #f #f #f #f #f #f #f #f)
 		      stmth
@@ -3177,11 +3322,13 @@
 			   test-name))
 	 (item-path    (db:test-get-item-path testdat))
          (tl-testdat   (db:get-test-info dbstruct run-id test-name ""))
          (tl-test-id   (if tl-testdat
 			   (db:test-get-id tl-testdat)
-			   #f)))
+			   #f))
+	 (new-state-eh #f)
+	 (new-status-eh #f))
     (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) 
 	(db:general-call dbstruct run-id 'set-test-start-time (list test-id)))
     (mutex-lock! *db-transaction-mutex*)
     (db:with-db
      dbstruct run-id #f
@@ -3189,29 +3336,32 @@
        (let ((tr-res
               (sqlite3:with-transaction
                db
                (lambda ()
                  ;; NB// Pass the db so it is part fo the transaction
-                 (db:test-set-state-status db run-id test-id state status comment) ;; this call sets the item state/status
+                 (db:test-set-state-status-db db run-id test-id state status comment) ;; this call sets the item state/status
                  (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item
-                     (let* ((state-status-counts  (db:get-all-state-status-counts-for-test db run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test
-			    (state-statuses        (db:roll-up-rules state-status-counts state status))
-                          (newstate (car state-statuses))
-                          (newstatus (cadr state-statuses)))
+                     (let* ((state-status-counts (db:get-all-state-status-counts-for-test db run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test
+			    (state-statuses      (db:roll-up-rules state-status-counts state status))
+                            (newstate            (car state-statuses))
+                            (newstatus           (cadr state-statuses)))
+		       (set! new-state-eh newstate)
+		       (set! new-status-eh newstatus)
                        (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts)  " state-status-counts: "
-							(apply conc
-                  (map (lambda (x)
-                     (conc
-                     		(with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | "))
-                              state-status-counts))); end debug:print
-   
-                       (if tl-test-id
-			   (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct
+				    (apply conc
+					   (map (lambda (x)
+						  (conc
+                     				   (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | "))
+						state-status-counts))); end debug:print
+		       (if tl-test-id
+			   (db:test-set-state-status-db db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct
 		       ))))))
          (mutex-unlock! *db-transaction-mutex*)
          (if (and test-id state status (equal? status "AUTO")) 
              (db:test-data-rollup dbstruct run-id test-id status))
+	 (if new-state-eh ;; moved from db:test-set-state-status
+	      (mt:process-triggers dbstruct run-id test-id new-state-eh new-status-eh))
          tr-res)))))
 
 (define (db:roll-up-rules state-status-counts state status)
   (let* ((running     (length (filter (lambda (x)
 					(member (dbr:counts-state x) *common:running-states*))
@@ -3275,56 +3425,54 @@
      (lambda (dbdat db)
        (let ((tr-res
               (sqlite3:with-transaction
                db
                (lambda ()
-                   (let* ((state-status-counts (db:get-all-state-status-counts-for-run db run-id))
+                   (let* ((state-status-counts (db:get-all-state-status-counts-for-run-db dbdat db run-id))
 			  (state-statuses      (db:roll-up-rules state-status-counts #f #f ))
                           (newstate            (car state-statuses))
                           (newstatus           (cadr state-statuses))) 
 		     (if (or (not (eq? newstate curr-state)) (not (eq?  newstatus curr-status)))
-			 (db:set-run-state-status db run-id newstate newstatus )))))))
+			 (db:set-run-state-status-db dbdat db run-id newstate newstatus )))))))
          (mutex-unlock! *db-transaction-mutex*)
          tr-res))))
+
+(define (db:get-all-state-status-counts-for-run-db dbdat db run-id)
+  (sqlite3:map-row
+   (lambda (state status count)
+     (make-dbr:counts state: state status: status count: count))
+   (db:get-cache-stmth
+    dbdat db
+    "SELECT state,status,count(id) FROM tests WHERE run_id=?  GROUP BY state,status;")
+   run-id ))
 
 (define (db:get-all-state-status-counts-for-run dbstruct run-id)
- (let* ((test-count-recs (db:with-db
-                          dbstruct #f #f
-                          (lambda (dbdat db)
-                            (sqlite3:map-row
-                             (lambda (state status count)
-                               (make-dbr:counts state: state status: status count: count))
-                             db
-                             "SELECT state,status,count(id) FROM tests WHERE run_id=?  GROUP BY state,status;"
-                             run-id )))))
-   test-count-recs))
-
+  (db:with-db
+   dbstruct #f #f
+   (lambda (dbdat db)
+     (db:get-all-state-status-counts-for-run-db dbdat db run-id))))
 
 ;; BBnote: db:get-all-state-status-counts-for-test returns dbr:counts object aggregating state and status of items of a given test, *not including rollup state/status*
 ;;
 ;; NOTE: This is called within a transaction
 ;;
-(define (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path item-state-in item-status-in)
-  (let* ((test-info   (db:get-test-info dbstruct run-id test-name item-path))
+(define (db:get-all-state-status-counts-for-test db run-id test-name item-path item-state-in item-status-in)
+  (let* ((test-info   (db:get-test-info-db db run-id test-name item-path))
          (item-state  (or item-state-in (db:test-get-state test-info))) 
          (item-status (or item-status-in (db:test-get-status test-info)))
-         (other-items-count-recs (db:with-db
-                                  dbstruct run-id #f
-                                  (lambda (dbdat db)
-                                    (sqlite3:map-row
-                                     (lambda (state status count)
-                                       (make-dbr:counts state: state status: status count: count))
-                                     db
-                                     ;; ignore current item because we have changed its value in the current transation so this select will see the old value.
-                                     "SELECT state,status,count(id) FROM tests WHERE run_id=? AND testname=? AND item_path != '' AND item_path !=? GROUP BY state,status;"
-                                     run-id test-name item-path))))
-
+         (other-items-count-recs (sqlite3:map-row
+                                  (lambda (state status count)
+                                    (make-dbr:counts state: state status: status count: count))
+                                  db
+                                  ;; ignore current item because we have changed its value in the current transation so this select will see the old value.
+                                  "SELECT state,status,count(id) FROM tests WHERE run_id=? AND testname=? AND item_path != '' AND item_path !=? GROUP BY state,status;"
+                                  run-id test-name item-path))
          ;; add current item to tally outside of sql query
-         (match-countrec-lambda (lambda (countrec) 
-                                  (and (equal? (dbr:counts-state  countrec) item-state)
+	 (match-countrec-lambda (lambda (countrec) 
+				  (and (equal? (dbr:counts-state  countrec) item-state)
                                        (equal? (dbr:counts-status countrec) item-status))))
-
+	 
          (already-have-count-rec-list
           (filter match-countrec-lambda other-items-count-recs)) ;; will have either 0 or 1 count recs depending if another item shares this item's state/status
          
          (updated-count-rec    (if (null? already-have-count-rec-list)
                                    (make-dbr:counts state: item-state status: item-status count: 1)
@@ -3334,11 +3482,10 @@
 
          (nonmatch-countrec-lambda (lambda (countrec) (not (match-countrec-lambda countrec))))
          
          (unrelated-rec-list   
           (filter nonmatch-countrec-lambda other-items-count-recs)))
-    
     (cons updated-count-rec unrelated-rec-list)))
 
 ;; (define (db:get-all-item-states db run-id test-name)
 ;;   (sqlite3:map-row 
 ;;    (lambda (a) a)
@@ -4612,15 +4759,20 @@
 					(begin
 					  (sqlite3:interrupt! db)
 					  (sqlite3:finalize! db #t)
 					  ;; (vector-set! *task-db* 0 #f)
 					  (set! *task-db* #f)))))
+			      (if (and *no-sync-db*
+				       (sqlite3:database? *no-sync-db*))
+				  (sqlite3:finalize! *no-sync-db* #t))
 			      (if (and (not (args:get-arg "-server"))
-				       *runremote*)
+				       *runremote*
+				       (eq? (rmt:transport-mode) 'http))
 				  (begin
 				    (debug:print-info 0 *default-log-port* "Closing all client connections...")
-				    (http-client#close-all-connections!)))
+				    (http-transport:close-connections *runremote*)
+				    #;(http-client#close-all-connections!)))
                               ;; (if (and *runremote*
                               ;;          (remote-conndat *runremote*))
                               ;;     (begin
                               ;;       (http-client#close-all-connections!))) ;; for http-client
                               (if (not (eq? *default-log-port* (current-error-port)))

Index: dbfile.scm
==================================================================
--- dbfile.scm
+++ dbfile.scm
@@ -37,13 +37,23 @@
 	stack
 	files
 	ports
 
 	commonmod
+	;; debugprint
 	)
 
-;; (import debugprint)
+(define keep-age-param (make-parameter 10)) ;; qif file age, if over move to attic
+(define num-run-dbs    (make-parameter 10))     ;; number of db's in .megatest
+(define dbfile:testsuite-name (make-parameter #f))
+(define dbfile:sync-method (make-parameter 'attach)) ;; 'attach or 'original
+
+;; 'original     - use old condition code
+;; 'suicide-mode - create mtrah/stop-the-train with info on what went wrong
+;; else use no condition code (should be production mode)
+;;
+(define no-condition-db-with-db (make-parameter 'suicide-mode))
 
 ;;======================================================================
 ;;  R E C O R D S
 ;;======================================================================
 
@@ -54,10 +64,19 @@
   (areapath  #f)
   (homehost  #f)
   (tmppath   #f)
   (read-only #f)
   (subdbs (make-hash-table))
+  ;;
+  ;; for the inmem approach (see dbmod.scm)
+  ;; this is one db per server
+  (inmem     #f)  ;; handle for the in memory copy
+  (dbfile    #f)  ;; path to the db file on disk
+  (ondiskdb  #f)  ;; handle for the on-disk file
+  (dbdat     #f)  ;; create a dbdat for the downstream calls such as db:with-db
+  (last-update 0)
+  (sync-proc #f)
   )
 
 ;; NOTE: Need one dbr:subdb per main.db, 1.db ...
 ;;
 (defstruct dbr:subdb
@@ -92,10 +111,11 @@
 (define *max-api-process-requests* 0)
 (define *api-process-request-count* 0)
 (define *db-write-access*     #t)
 (define *db-last-sync*        0)                 ;; last time the sync to megatest.db happened
 (define *db-multi-sync-mutex* (make-mutex))      ;; protect access to *db-sync-in-progress*, *db-last-sync*
+(define *db-last-access*      (current-seconds))
 
 (define (db:generic-error-printout exn . message)
   (print-call-chain (current-error-port))
   (apply dbfile:print-err message)
   (dbfile:print-err
@@ -157,51 +177,26 @@
           )
           #f
   )
 )
 
-;; ;; set up a single db (e.g. main.db, 1.db ... etc.)
-;; ;;
-;; (define (db:setup-db dbstruct areapath run-id)
-;;   (let* ((dbname   (db:run-id->dbname run-id))
-;; 	 (dbstruct (hash-table-ref/default dbstructs dbname #f)))
-;;     (if dbstruct
-;; 	dbstruct
-;; 	(let* ((dbstruct-new (make-dbr:dbstruct)))
-;; 	  (db:open-db dbstruct-new run-id areapath: areapath do-sync: #t)
-;; 	  (hash-table-set! dbstructs dbname dbstruct-new)
-;; 	  dbstruct-new))))
-    
-;; ; Returns the dbdat for a particular dbfile inside the area
-;; ;;
-;; (define (dbr:dbstruct-get-dbdat dbstruct dbfile)
-;;   (hash-table-ref/default (dbr:dbstruct-dbdats dbstruct) dbfile #f))
-;; 
-;; (define (dbr:dbstruct-dbdat-put! dbstruct dbfile db)
-;;   (hash-table-set! (dbr:dbstruct-dbdats dbstruct) dbfile db))
-;; 
-;; (define (db:run-id->first-num run-id)
-;;   (let* ((s (number->string run-id))
-;; 	 (l (string-length s)))
-;;     (substring s (- l 1) l)))
-
-;; 1234 => 4/1234.db
-;;   #f => 0/main.db
-;;   (abandoned the idea of num/db)
-;; 
 (define (dbfile:run-id->path apath run-id)
   (conc apath"/"(dbfile:run-id->dbname run-id)))
 
 (define (db:dbname->path apath dbname)
   (conc apath"/"dbname))
+
+(define (dbfile:run-id->dbnum run-id)
+  (cond
+   ((number? run-id)
+    (modulo run-id (num-run-dbs)))
+   ((not run-id) "main")   ;; 0 or main?
+   (else run-id)))
 
 ;; POTENTIAL BUG: this implementation could produce a db file if run-id is neither #f or a number
 (define (dbfile:run-id->dbname run-id)
-  (cond
-   ((number? run-id) (conc ".megatest/" (modulo run-id 100) ".db"))
-   ((not run-id)     (conc ".megatest/main.db"))
-   (else             run-id)))
+  (conc ".megatest/"(dbfile:run-id->dbnum run-id)".db"))
 
 ;; Make the dbstruct, setup up auxillary db's and call for main db at least once
 ;;
 ;; called in http-transport and replicated in rmt.scm for *local* access. 
 ;;
@@ -209,14 +204,12 @@
   (cond
    (*dbstruct-dbs*
     (dbfile:print-err "WARNING: dbfile:setup called when *dbstruct-dbs* is already initialized")
     *dbstruct-dbs*) ;; TODO: when multiple areas are supported, this optimization will be a hazard
    (else
-    (let* ((dbstruct (make-dbr:dbstruct)))
+    (let* ((dbstruct (make-dbr:dbstruct areapath: areapath tmppath: tmppath)))
       (set! *dbstruct-dbs* dbstruct)
-      (dbr:dbstruct-areapath-set! dbstruct areapath)
-      (dbr:dbstruct-tmppath-set!  dbstruct tmppath)
       dbstruct))))
 
 (define (dbfile:get-subdb dbstruct run-id)
   (let* ((dbfname (dbfile:run-id->dbname run-id)))
     (hash-table-ref/default (dbr:dbstruct-subdbs dbstruct) dbfname #f)))
@@ -224,14 +217,17 @@
 (define (dbfile:set-subdb dbstruct run-id subdb)
   (hash-table-set! (dbr:dbstruct-subdbs dbstruct) (dbfile:run-id->dbname run-id) subdb))
 
 ;; (define *dbfile:num-handles-in-use* 0)
 
-;; Get/open a database
+;; Get/open a database.
+;;
+;;    NOTE: most usage should call dbfile:open-db to get a dbdat
+;;
 ;;    if run-id => get run specific db
 ;;    if #f     => get main db
-;;    if run-id is a string treat it as a filename
+;;    if run-id is a string treat it as a filename - DON'T use this - we'll get rid of it.
 ;;    if db already open - return inmem
 ;;    if db not open, open inmem, rundb and sync then return inmem
 ;;    inuse gets set automatically for rundb's
 ;;
 (define (dbfile:get-dbdat dbstruct run-id)
@@ -241,12 +237,16 @@
 	(begin
 	  (stack-pop! (dbr:subdb-dbstack subdb))))))
 
 ;; return a previously opened db handle to the stack of available handles
 (define (dbfile:add-dbdat dbstruct run-id dbdat)
-  (let* ((subdb (dbfile:get-subdb dbstruct run-id)))
-    (stack-push! (dbr:subdb-dbstack subdb) dbdat)
+  (let* ((subdb (dbfile:get-subdb dbstruct run-id))
+	 (dbstk (dbr:subdb-dbstack subdb))
+	 (count (stack-count dbstk)))
+    (if (> count 15)
+	(dbfile:print-err "WARNING: stack for "run-id".db is "count"."))
+    (stack-push! dbstk dbdat)
     dbdat))
 
 ;; set up a subdb
 ;;
 (define (dbfile:init-subdb dbstruct run-id init-proc)
@@ -326,21 +326,21 @@
 (define (dbfile:print-err . params)
   (with-output-to-port
       (current-error-port)
     (lambda ()
       (apply print params))))
-    
+
 (define (dbfile:cautious-open-database fname init-proc sync-mode journal-mode #!optional (tries-left 500))
-  (let* ((busy-file  (conc fname"-journal"))
+  (let* ((busy-file  (conc fname "-journal"))
 	 (delay-time (* (- 51 tries-left) 1.1))
       	 (write-access (file-write-access? fname))
          (dir-access (file-write-access? (pathname-directory fname)))
          (retry      (lambda ()
 		       (thread-sleep! delay-time)
 		       (if (> tries-left 0)
 			   (dbfile:cautious-open-database fname init-proc
-							  sync-mode: sync-mode journal-mode: journal-mode
+							  sync-mode journal-mode
 							  (- tries-left 1))))))
     (assert (>= tries-left 0) (conc "FATAL: too many attempts in dbfile:cautious-open-database of "fname", giving up."))
     
     (if (and (file-write-access? fname)
 	     (file-exists? busy-file))
@@ -351,11 +351,11 @@
 	  (thread-sleep! 1)
 	  (if (eq? tries-left 2)
 	      (begin
 	  	(dbfile:print-err "INFO: forcing journal rollup "busy-file)
 	  	(dbfile:brute-force-salvage-db fname)))
-	  (dbfile:cautious-open-database fname init-proc sync-mode: sync-mode journal-mode: journal-mode (- tries-left 1)))
+	  (dbfile:cautious-open-database fname init-proc sync-mode journal-mode (- tries-left 1)))
 	
 	(let* ((result (condition-case
 		        (if dir-access
 			    (dbfile:with-simple-file-lock
 			     (conc fname ".lock")
@@ -402,11 +402,10 @@
 		    "cp "backupfname" "fname)))
     (dbfile:print-err "WARNING: attempting recovery of file "fname" by running commands:\n"
 		      "  "cmd)
     (system cmd)))
 
-
 (define (dbfile:open-no-sync-db dbpath)
   (if *no-sync-db*
       *no-sync-db*
       (begin
 	(if (not (file-exists? dbpath))
@@ -416,13 +415,20 @@
 	       (init-proc (lambda (db)
 			    (if (not db-exists)
 				(begin
 				  (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));"))
 				)))
-	       (db        (dbfile:cautious-open-database dbname init-proc 0 "WAL"))) ;; (sqlite3:open-database dbname)))
-	  ;; (sqlite3:execute db "PRAGMA synchronous = 0;")
-	  ;; (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)) ;; done in cautious-open-database
+	       (on-tmp      (equal? (car (string-split dbpath "/")) "tmp"))
+	       (db        (if on-tmp
+			      (dbfile:cautious-open-database dbname init-proc 0 "WAL")
+			      (dbfile:cautious-open-database dbname init-proc 0 #f)
+			      ;; (sqlite3:open-database dbname)
+			      )))
+	  (if on-tmp	      ;; done in cautious-open-database
+	      (begin
+		(sqlite3:execute db "PRAGMA synchronous = 0;")
+		(sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000))))
 	  (set! *no-sync-db* db)
 	  db))))
 
 (define (db:no-sync-set db var val)
   (sqlite3:execute db "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES (?,?);" var val))
@@ -513,11 +519,12 @@
 	  (db:sync-touched dbstruct runid keys dbinit)
 	  (set! *db-sync-in-progress* #f)
 	  (delete-file* lock-file)
 	  #t)
         (begin
-          (dbfile:print-err "INFO: could not get lock for " from-db-file ", sync likely in progress.")
+	  (if (common:low-noise-print 120 (conc "no lock "from-db-file))
+	      (dbfile:print-err "INFO: could not get lock for " from-db-file ", sync likely in progress."))
 	  #f
 	  ))))
 
 ;; ;; Get a lock from the no-sync-db for the from-db, then delta sync the from-db to the to-db, otherwise return #f
 ;; ;;
@@ -556,11 +563,11 @@
          (tmpdb     (db:open-db dbstruct run-id dbinit)) ;; sqlite3-db tmpdbfile #f))
 	 (start-t   (current-seconds)))
     (mutex-lock! *db-multi-sync-mutex*)
     (let ((update_info (cons "last_update" (if force-sync 0 *db-last-sync*) )))
       (mutex-unlock! *db-multi-sync-mutex*)
-      (db:sync-tables (db:sync-all-tables-list dbstruct keys) update_info tmpdb mtdb))
+      (db:sync-tables (db:sync-all-tables-list keys) update_info tmpdb mtdb))
     (mutex-lock! *db-multi-sync-mutex*)
     (set! *db-last-sync* start-t)
     (set! *db-last-access* start-t)
     (mutex-unlock! *db-multi-sync-mutex*)
     (dbfile:add-dbdat dbstruct run-id tmpdb)
@@ -619,12 +626,12 @@
 	 '("type"           #f)
          '("last_update"    #f))))
 
 ;; needs db to get keys, this is for syncing all tables
 ;;
-(define (db:sync-main-list dbstruct keys)
-  (let ((keys  keys)) ;; (db:get-keys dbstruct)))
+(define (db:sync-main-list keys)
+  (let ((keys  keys))
     (list
      (list "keys"
 	   '("id"        #f)
 	   '("fieldname" #f)
 	   '("fieldtype" #f))
@@ -658,14 +665,29 @@
 	   '("reviewed"       #f)
 	   '("iterated"       #f)
 	   '("avg_runtime"    #f)
 	   '("avg_disk"       #f)
 	   '("tags"           #f)
-	   '("jobgroup"       #f)))))
+	   '("jobgroup"       #f))
+
+
+     (list "tasks_queue"
+           '("id"            #f)
+           '("action"        #f)
+           '("owner"         #f) 
+           '("state"         #f)
+           '("target"        #f)
+           '("name"          #f)
+           '("testpatt"      #f)
+           '("keylock"       #f)
+           '("params"        #f)
+           '("creation_time" #f)
+           '("execution_time" #f))
+     )))
 
-(define (db:sync-all-tables-list dbstruct keys)
-  (append (db:sync-main-list dbstruct keys)
+(define (db:sync-all-tables-list keys)
+  (append (db:sync-main-list keys)
 	  db:sync-tests-only))
 
 ;; tbls is ( ("tablename" ( "field1" [#f|proc1] ) ( "field2" [#f|proc2] ) .... ) )
 ;; db's are dbdat's
 ;;
@@ -870,11 +892,12 @@
           )
         )
 	tbls)
        (let* ((runtime      (- (current-milliseconds) start-time))
 	      (should-print (or ;; (debug:debug-mode 12)
-				(common:low-noise-print 120 "db sync" (> runtime 500))))) ;; low and high sync times treated as separate.
+			     (common:low-noise-print 120 "db sync")
+			     (> runtime 500)))) ;; low and high sync times treated as separate.
 	 (for-each 
 	  (lambda (dat)
 	    (let ((tblname (car dat))
 		  (count   (cdr dat)))
 	      (set! tot-count (+ tot-count count))
@@ -915,30 +938,10 @@
                              FOR EACH ROW
                                BEGIN 
                                  UPDATE test_data SET last_update=(strftime('%s','now'))
                                    WHERE id=old.id;
                                END;" )))
-;;
-;; ADD run-id SUPPORT
-;;
-(define (db:create-all-triggers dbstruct)
-  (db:with-db
-   dbstruct #f #f
-   (lambda (dbdat db)
-     (db:create-triggers db))))
-
-(define (db:create-triggers db)
-    (for-each (lambda (key)
-              (sqlite3:execute db (cadr key)))
-          db:trigger-list))
-
-(define (db:drop-all-triggers dbstruct)
-  (db:with-db
-   dbstruct #f #f
-   (lambda (dbdat db)
-     (db:drop-triggers db))))
-
 (define (db:is-trigger-dropped db tbl-name)
   (let* ((trigger-name (if (equal? tbl-name "test_steps")
 			   "update_teststeps_trigger" 
                            (conc "update_" tbl-name "_trigger")))
 	 (res          #f))
@@ -982,20 +985,81 @@
 ;; call with dbinit=db:initialize-main-db
 ;;
 (define (db:open-db dbstruct run-id dbinit)
   ;; (mutex-lock! *db-open-mutex*)
   (let* ((dbdat (dbfile:open-db dbstruct run-id dbinit)))
+              #;(case (rmt:transport-mode)
+		  ((http) (dbfile:open-db dbstruct run-id dbinit))
+		  ((tcp)  (dbmod:open-db  dbstruct run-id dbinit))
+		  (else (assert #f "FATAL: rmt:transport-node not correct value"(rmt:transport-mode))))
     (set! *db-write-access* (not (dbr:dbdat-read-only dbdat)))
     ;; (mutex-unlock! *db-open-mutex*)
     dbdat))
 
 (define dbfile:db-init-proc (make-parameter #f))
+
+;; in xmaxima this gives a curve close to what I want:
+;;    plot2d ((exp(x/1.2)-1)/300, [x, 0, 10])$
+;;    plot2d ((exp(x/1.5)-1)/40, [x, 0, 10])$
+;;    plot2d ((exp(x/5)-1)/40, [x, 0, 20])$
+(define (dbfile:droop x)
+  (/ (- (exp (/ x 5)) 1) 40))
+  ;; (* numqrys (/ 1 (qif-slope))))
+
+;; create a dropping near the db file in a qif dir
+;; use count of such files to gate queries (queries in flight)
+;;
+(define (dbfile:wait-for-qif fname run-id params)
+  (let* ((thedir  (pathname-directory fname))
+	 (dbnum   (dbfile:run-id->dbnum run-id))
+	 (destdir (conc thedir"/qif-"dbnum))
+	 (uniqn   (get-area-path-signature (conc dbnum params)))
+	 (crumbn  (conc destdir"/"(current-seconds)"-"uniqn"."(current-process-id))))
+    (if (not (file-exists? destdir))(create-directory (conc destdir"/attic") #t))
+    (let loop ((count 0))
+      (let* ((currlks (glob (conc destdir"/*")))
+	     (numqrys (length currlks))
+	     (delayval (cond ;; do a droopish curve
+			((> numqrys 25)
+			 (for-each
+			  (lambda (f)
+			    (if (> (- (current-seconds)
+				      (handle-exceptions
+					  exn
+					(current-seconds) ;; file is likely gone, just fake out
+					(file-modification-time f)))
+				   (keep-age-param))
+				(let* ((basedir (pathname-directory f))
+				       (filen   (pathname-file f))
+				       (destf   (conc basedir"/attic/"filen)))
+				  (dbfile:print-err "Moving qif file "f" older than 10 seconds to "destf)
+				  ;; (delete-file* f)
+				  (handle-exceptions
+				      exn
+				    #t
+				    (file-move f destf #t)))))
+			  currlks)
+			 4)
+			((> numqrys 0)  (dbfile:droop numqrys)) ;; slope of 1/100
+			(else #f))))
+	(if (and delayval
+		 (< count 5))
+	    (begin
+	      (thread-sleep! delayval)
+	      (loop (+ count 1))))))
+    (with-output-to-file crumbn
+      (lambda ()
+	(print fname" run-id="run-id" params="params)
+	))
+    crumbn))
 
 ;; (db:with-db dbstruct run-id sqlite3:exec "select blah fgrom blaz;")
 ;; r/w is a flag to indicate if the db is modified by this query #t = yes, #f = no
 ;;
-(define (db:with-db dbstruct run-id r/w proc . params)
+(define (dbfile:with-db dbstruct run-id r/w proc params)
+  (assert dbstruct "FATAL: db:with-db called with dbstruct "#f)
+  (assert (dbr:dbstruct? dbstruct) "FATAL: dbstruct is "dbstruct)
   (let* ((use-mutex (> *api-process-request-count* 25)) ;; risk of db corruption
 	 (have-struct (dbr:dbstruct? dbstruct))
          (dbdat     (if have-struct                ;; this stuff just allows us to call with a db handle directly
 			(db:open-db dbstruct run-id (dbfile:db-init-proc)) ;; (dbfile:get-subdb dbstruct run-id)
 			#f))
@@ -1004,42 +1068,61 @@
 			dbstruct))
 	 (fname     (if dbdat
 			(dbr:dbdat-dbfile dbdat)
 			"nofilenameavailable"))
 	 (jfile     (conc fname"-journal"))
-	 #;(subdb     (if have-struct
-			(dbfile:get-subdb dbstruct run-id)
-			#f))
-	 ) ;; was 25
+	 (qryproc   (lambda ()
+		      (if use-mutex (mutex-lock! *db-with-db-mutex*))
+		      (let ((res (apply proc dbdat db params))) ;; the actual call is here.
+			(if use-mutex (mutex-unlock! *db-with-db-mutex*))
+			;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
+			(if dbdat
+			    (dbfile:add-dbdat dbstruct run-id dbdat))
+			;; (delete-file* crumbfile)
+			res)))
+	 (stop-train (conc (dbr:dbstruct-areapath dbstruct)"/stop-the-train")))
+
+    (assert (sqlite3:database? db) "FATAL: db:with-db, db is not a database, db="db
+	    ", fname="fname)
     (if (file-exists? jfile)
 	(begin
 	  (dbfile:print-err "INFO: "jfile" exists, delaying to reduce database load")
 	  (thread-sleep! 0.2)))
     (if (and use-mutex
 	     (common:low-noise-print 120 "over-50-parallel-api-requests"))
-	(dbfile:print-err *api-process-request-count* " parallel api requests being processed in process "
+	(dbfile:print-err *api-process-request-count*
+			  " parallel api requests being processed in process "
 			  (current-process-id))) ;;  ", throttling access"))
-    (condition-case
-	(begin
-	  (if use-mutex (mutex-lock! *db-with-db-mutex*))
-	  (let ((res (apply proc dbdat db params))) ;; the actual call is here.
-	    (if use-mutex (mutex-unlock! *db-with-db-mutex*))
-	    ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
-	    (if dbdat
-		(dbfile:add-dbdat dbstruct run-id dbdat))
-	    res))
-      (exn (io-error)
-	   (db:generic-error-printout exn "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again."))
-      (exn (corrupt)
-	   (db:generic-error-printout exn "ERROR: database " fname " is corrupt. Repair it to proceed."))
-      (exn (busy)
-	   (db:generic-error-printout exn "ERROR: database " fname
-				      " is locked. Try copying to another location, remove original and copy back."))
-      (exn (permission)(db:generic-error-printout exn "ERROR: database " fname " has some permissions problem."))
-      (exn ()
-	   (db:generic-error-printout exn "ERROR: Unknown error with database " fname " message: "
-				      ((condition-property-accessor 'exn 'message) exn))))))
+    (case (no-condition-db-with-db)
+      ((production)(qryproc))
+      ((suicide-mode)
+       (handle-exceptions
+	exn
+	(with-output-to-file stop-train
+	  (lambda ()
+	    (db:generic-error-printout exn "Stop train mode, run-id: "run-id
+				       " params: "params" proc: "proc)))
+	(qryproc)))
+      (else
+       (condition-case
+	(qryproc)
+	(exn (io-error)
+	     (db:generic-error-printout exn "ERROR: i/o error with "fname
+					". Check permissions, disk space etc. and try again."))
+	(exn (corrupt)
+	     (db:generic-error-printout exn "ERROR: database "fname
+					" is corrupt. Repair it to proceed."))
+	(exn (busy)
+	     (db:generic-error-printout exn "ERROR: database "fname
+					" is locked. Try copying to another location,"
+					" remove original and copy back."))
+	(exn (permission)(db:generic-error-printout exn "ERROR: database "fname
+						    " has some permissions problem."))
+	(exn ()
+	     (db:generic-error-printout exn "ERROR: Unknown error with database "fname
+					" message: "
+					((condition-property-accessor 'exn 'message) exn))))))))
 
 ;;======================================================================
 ;; another attempt at a transactionized queue
 ;;======================================================================
 
@@ -1166,6 +1249,19 @@
 	(let ((res (proc)))
 	  (dbfile:simple-file-release-lock fname)
 	  res)
 	(assert #t "FATAL: simple file lock never got a lock."))))
   
+(define (db:get-cache-stmth dbdat db stmt)
+  (let* (;; (dbdat       (dbfile:get-dbdat dbstruct run-id))
+	 (stmt-cache  (dbr:dbdat-stmt-cache dbdat))
+	 ;; (stmth       (db:hoh-get stmt-cache db stmt))
+	 (stmth       (hash-table-ref/default stmt-cache stmt #f)))
+    (or stmth
+	(let* ((newstmth (sqlite3:prepare db stmt)))
+	  ;; (db:hoh-set! stmt-cache db stmt newstmth)
+	  (hash-table-set! stmt-cache stmt newstmth)
+	  newstmth))))
+
+
+
 )

Index: dbmod.scm
==================================================================
--- dbmod.scm
+++ dbmod.scm
@@ -17,40 +17,396 @@
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;======================================================================
 
 (declare (unit dbmod))
+(declare (uses dbfile))
+(declare (uses commonmod))
+(declare (uses debugprint))
 
 (module dbmod
 	*
 	
-(import scheme chicken data-structures extras)
-(import (prefix sqlite3 sqlite3:)
-	posix typed-records srfi-18
-	srfi-69)
-
-(define (db:run-id->dbname run-id)
-  (cond
-   ((number? run-id)(conc run-id ".db"))
-   ((not run-id)    "main.db")
-   (else            run-id)))
-
-
-;;======================================================================
-;; hash of hashs
-;;======================================================================
-
-
-(define (db:hoh-set! dat key1 key2 val)
-  (let* ((subhash (hash-table-ref/default dat key1 #f)))
-    (if subhash
-	(hash-table-set! subhash key2 val)
-	(begin
-	  (hash-table-set! dat key1 (make-hash-table))
-	  (db:hoh-set! dat key1 key2 val)))))
-
-(define (db:hoh-get dat key1 key2)
-  (let* ((subhash (hash-table-ref/default dat key1 #f)))
-    (and subhash
-	 (hash-table-ref/default subhash key2 #f))))
+(import scheme
+	chicken
+	data-structures
+	extras
+
+	(prefix sqlite3 sqlite3:)
+	posix
+	typed-records
+	srfi-1
+	srfi-18
+	srfi-69
+
+	commonmod
+	dbfile
+	debugprint
+	)
+
+;; NOTE: This returns only the name "1.db", "main.db", not the path
+;;
+(define (dbmod:run-id->dbfname run-id)
+  (conc (dbfile:run-id->dbnum run-id)".db"))
+
+(define (dbmod:get-dbdir dbstruct)
+  (let* ((areapath (dbr:dbstruct-areapath dbstruct))
+	 (dbdir    (conc areapath"/.megatest")))
+    (if (and (file-write-access? areapath)
+	     (not (file-exists? dbdir)))
+	(create-directory dbdir))
+    dbdir))
+
+(define (dbmod:run-id->full-dbfname dbstruct run-id)
+  (conc (dbmod:get-dbdir dbstruct run-id)"/"(dbmod:run-id->dbfname run-id)))
+
+;;======================================================================
+;; Read-only inmem cached direct from disk method
+;;======================================================================
+
+(define *dbmod:nfs-db-handles* (make-hash-table)) ;; dbfname -> dbstruct
+
+(define (dbmod:nfs-get-dbstruct run-id keys init-proc areapath)
+  (assert areapath "FATAL: dbmod:nfs-get-dbstruct called without areapath set.")
+  (let* ((dbfname  (dbmod:run-id->dbfname run-id))
+	 (dbstruct (hash-table-ref/default *dbmod:nfs-db-handles* dbfname #f)))
+    (if dbstruct
+	(let* ((last-update (dbr:dbstruct-last-update dbstruct))
+	       (curr-secs   (current-seconds)))
+	  (if (> (- curr-secs last-update) 2)
+	      (begin
+		((dbr:dbstruct-sync-proc dbstruct) last-update)
+		(dbr:dbstruct-last-update-set! dbstruct curr-secs)))
+	  dbstruct)
+	(let* ((newdbstruct (dbmod:open-dbmoddb areapath run-id dbfname init-proc keys syncdir: 'fromdisk)))
+	  (hash-table-set! *dbmod:nfs-db-handles* dbfname newdbstruct)
+	  newdbstruct))))
+
+;;======================================================================
+;; The inmem one-db file per server method goes in here
+;;======================================================================
+
+(define (dbmod:with-db dbstruct run-id r/w proc params)
+  (let* ((dbdat  (dbmod:open-db dbstruct run-id (dbfile:db-init-proc)))
+	 (dbh    (dbr:dbdat-dbh dbdat))
+	 (dbfile (dbr:dbdat-dbfile dbdat)))
+    (apply proc dbdat dbh params)))
+
+(define (dbmod:open-inmem-db initproc)
+  (let* ((db      (sqlite3:open-database ":memory:"))
+	 (handler (sqlite3:make-busy-timeout 3600)))
+    (sqlite3:set-busy-handler! db handler)
+    (initproc db)
+    db))
+
+(define (dbmod:open-db dbstruct run-id dbinit)
+  (or (dbr:dbstruct-dbdat dbstruct)
+      (let* ((dbdat (make-dbr:dbdat
+		     dbfile: (dbr:dbstruct-dbfile dbstruct)
+		     dbh:    (dbr:dbstruct-inmem  dbstruct)
+		     )))
+	(dbr:dbstruct-dbdat-set! dbstruct dbdat)
+	dbdat)))
+
+;; Open the inmem db and the on-disk db
+;; populate the inmem db with data
+;;
+;; Updates fields in dbstruct
+;; Returns dbstruct
+;;
+;; * This routine creates the db if not found
+;; * Probably can get rid of the dbstruct-in
+;; 
+(define (dbmod:open-dbmoddb areapath run-id dbfname-in init-proc keys
+			    #!key (dbstruct-in #f)
+			    (syncdir 'todisk))
+  (let* ((dbstruct     (or dbstruct-in (make-dbr:dbstruct areapath: areapath)))
+	 (dbfname      (or dbfname-in (dbmod:run-id->dbfname run-id)))
+	 (dbpath       (dbmod:get-dbdir dbstruct))             ;; directory where all the .db files are kept
+	 (dbfullname   (conc dbpath"/"dbfname)) ;; (dbmod:run-id->full-dbfname dbstruct run-id))
+	 (dbexists     (file-exists? dbfullname))
+	 (inmem        (dbmod:open-inmem-db init-proc))
+	 (write-access (file-write-access? dbpath))
+	 (db           (dbfile:with-simple-file-lock
+			(conc dbfullname".lock")
+			(lambda ()
+			  (let* ((db      (sqlite3:open-database dbfullname))
+				 (handler (sqlite3:make-busy-timeout 136000)))
+			    (sqlite3:set-busy-handler! db handler)
+			    (if write-access
+				(init-proc db))
+			    db))))
+	 (tables       (db:sync-all-tables-list keys)))
+    (dbr:dbstruct-inmem-set!    dbstruct inmem)
+    (dbr:dbstruct-ondiskdb-set! dbstruct db)
+    (dbr:dbstruct-dbfile-set!   dbstruct dbfullname)
+    (dbr:dbstruct-sync-proc-set! dbstruct
+				 (lambda (last-update)
+				   (sync-gasket tables last-update inmem db
+						dbfullname syncdir)))
+    ;; (dbmod:sync-tables tables #f db inmem)
+    (sync-gasket tables #f inmem db dbfullname 'fromdest)
+    (dbr:dbstruct-last-update-set! dbstruct (current-seconds)) ;; should this be offset back in time by one second?
+    dbstruct))
+
+;;    (if (eq? syncdir 'todisk) ;; sync to disk normally, sync from in dashboard
+;;        (dbmod:sync-tables tables last-update inmem db)
+;;        (dbmod:sync-tables tables last-update db inmem))))
+
+;; direction: 'fromdest 'todest
+;;
+(define (sync-gasket tables last-update inmem dbh dbfname direction)
+  (case (dbfile:sync-method)
+    ((attach)
+     (dbmod:attach-sync tables inmem dbfname direction))
+    (else
+     (case direction
+       ((todest)
+	(dbmod:sync-tables tables last-update inmem dbh))
+       (else
+	(dbmod:sync-tables tables last-update dbh inmem))))))
+
+
+(define (dbmod:close-db dbstruct)
+  ;; do final sync to disk file
+  ;; (do-sync ...)
+  (sqlite3:finalize! (dbr:dbstruct-ondiskdb dbstruct)))
+
+;;======================================================================
+;; Sync db
+;;======================================================================
+
+(define (dbmod:calc-use-last-update has-last-update fields last-update)
+  (cond
+   ((and has-last-update
+	 (member "last_update" fields))
+    #t) ;; if given a number, just use it for all fields
+   ((number? last-update) #f) ;; if not matched first entry then ignore last-update for this table
+   ((and (pair? last-update)
+	 (member (car last-update)    ;; last-update field name
+		 (map car fields)))
+    #t)
+   ((and last-update (not (pair? last-update)) (not (number? last-update)))
+    (debug:print 0 *default-log-port* "ERROR: parameter last-update for db:sync-tables must be a pair or a number, received: " last-update);; found in fields
+    #f)
+   (else
+    #f)))
+
+;; tbls is ( ("tablename" ( "field1" [#f|proc1] ) ( "field2" [#f|proc2] ) .... ) )
+;; dbs are sqlite3 db handles
+;;
+;; if last-update specified ("field-name" . time-in-seconds)
+;;    then sync only records where field-name >= time-in-seconds
+;;    IFF field-name exists
+;;
+;; Use (db:sync-all-tables-list keys) to get the tbls input
+;;
+(define (dbmod:sync-tables tbls last-update fromdb todb)
+  (let ((stmts       (make-hash-table)) ;; table-field => stmt
+	(all-stmts   '())              ;; ( ( stmt1 value1 ) ( stml2 value2 ))
+	(numrecs     (make-hash-table))
+	(start-time  (current-milliseconds))
+	(tot-count   0))
+    (for-each ;; table
+     (lambda (tabledat)
+       (let* ((tablename        (car tabledat))
+	      (fields           (cdr tabledat))
+	      (has-last-update  (member "last_update" fields))
+	      (use-last-update  (dbmod:calc-use-last-update has-last-update fields last-update))
+	      (last-update-value (if use-last-update ;; no need to check for has-last-update - it is already accounted for
+				     (if (number? last-update)
+					 last-update
+					 (cdr last-update))
+				     #f))
+	      (last-update-field (if use-last-update
+				     (if (number? last-update)
+					 "last_update"
+					 (car last-update))
+				     #f))
+	      (num-fields (length fields))
+	      (field->num (make-hash-table))
+	      (num->field (apply vector (map car fields))) ;; BBHERE
+	      (full-sel   (conc "SELECT " (string-intersperse (map car fields) ",") 
+				" FROM " tablename (if use-last-update ;; apply last-update criteria
+						       (conc " WHERE " last-update-field " >= " last-update-value)
+						       "")
+				";"))
+	      (full-ins   (conc "INSERT OR REPLACE INTO " tablename " ( " (string-intersperse (map car fields) ",") " ) "
+				" VALUES ( " (string-intersperse (make-list num-fields "?") ",") " );"))
+	      (fromdat    '())
+	      (fromdats   '())
+	      (totrecords 0)
+	      (batch-len  100) ;; (string->number (or (configf:lookup *configdat* "sync" "batchsize") "100")))
+	      (todat      (make-hash-table))
+	      (count      0)
+              (field-names (map car fields)))
+	 
+	 ;; set up the field->num table
+	 (for-each
+	  (lambda (field)
+	    (hash-table-set! field->num field count)
+	    (set! count (+ count 1)))
+	  fields)
+	 
+	 ;; read the source table
+         ;; store a list of all rows in the table in fromdat, up to batch-len.
+         ;; Then add fromdat to the fromdats list, clear fromdat and repeat.
+	 (sqlite3:for-each-row
+	  (lambda (a . b)
+	       (set! fromdat (cons (apply vector a b) fromdat))
+	       (if (> (length fromdat) batch-len)
+		   (begin
+		     (set! fromdats (cons fromdat fromdats))
+		     (set! fromdat  '())
+		     (set! totrecords (+ totrecords 1)))))
+	  fromdb
+	  full-sel)
+	 
+         ;; Count less than batch-len as a record
+         (if (> (length fromdat) 0)
+             (set! totrecords (+ totrecords 1)))
+	 
+	 ;; tack on remaining records in fromdat
+	 (if (not (null? fromdat))
+	     (set! fromdats (cons fromdat fromdats)))
+	 
+	 (sqlite3:for-each-row
+	  (lambda (a . b)
+	    (hash-table-set! todat a (apply vector a b)))
+	  todb
+	  full-sel)
+	 
+	 ;; first pass implementation, just insert all changed rows
+	 (let* ((db                 todb)
+                (drp-trigger        (if (member "last_update" field-names)
+					(db:drop-trigger db tablename) 
+					#f))
+		(has-last-update    (member "last_update" field-names))
+                (is-trigger-dropped (if has-last-update
+                                        (db:is-trigger-dropped db tablename)
+					#f)) 
+		(stmth  (sqlite3:prepare db full-ins))
+                (changed-rows 0))
+	   (for-each
+	    (lambda (fromdat-lst)
+	      (sqlite3:with-transaction
+	       db
+	       (lambda ()
+		 (for-each ;; 
+		  (lambda (fromrow)
+		    (let* ((a    (vector-ref fromrow 0))
+			   (curr (hash-table-ref/default todat a #f))
+			   (same #t))
+		      (let loop ((i 0))
+			(if (or (not curr)
+				(not (equal? (vector-ref fromrow i)(vector-ref curr i))))
+			    (set! same #f))
+			(if (and same
+				 (< i (- num-fields 1)))
+			    (loop (+ i 1))))
+		      (if (not same)
+			  (begin
+			    (apply sqlite3:execute stmth (vector->list fromrow))
+			    (hash-table-set! numrecs tablename (+ 1 (hash-table-ref/default numrecs tablename 0)))
+                            (set! changed-rows (+ changed-rows 1))))))
+		  fromdat-lst))))
+	    fromdats)
+	   
+	   (sqlite3:finalize! stmth)
+           (if (member "last_update" field-names)
+               (db:create-trigger db tablename)))))
+     tbls)
+    (let* ((runtime      (- (current-milliseconds) start-time))
+	   (should-print (or ;; (debug:debug-mode 12)
+			  (common:low-noise-print 120 "db sync")
+			  (> runtime 500)))) ;; low and high sync times treated as separate.
+      (for-each 
+       (lambda (dat)
+	 (let ((tblname (car dat))
+	       (count   (cdr dat)))
+	   (set! tot-count (+ tot-count count)))) 
+       (sort (hash-table->alist numrecs)(lambda (a b)(> (cdr a)(cdr b))))))
+    tot-count))
+
+(define (has-last-update dbh tablename)
+  (let* ((has-last #f))
+    (sqlite3:for-each-row
+     (lambda (name)
+       (if (equal? name "last_update")
+	   (set! has-last #t)))
+     dbh
+     (conc "SELECT name FROM pragma_table_info('"tablename"') as tblInfo;"))
+    has-last))
+
+;; tbls is ( ("tablename" ( "field1" [#f|proc1] ) ( "field2" [#f|proc2] ) .... ) )
+;;
+;; direction = fromdest, todest
+;; mode = 'full, 'incr
+;;
+(define (dbmod:attach-sync tables dbh destdbfile direction #!key
+			   (mode 'full)
+			   (no-update '("keys")) ;; do
+			   )
+  (debug:print 0 *default-log-port* "Doing sync "direction" "destdbfile)
+  (if (not (sqlite3:auto-committing? dbh))
+      (debug:print 0 *default-log-port* "Skipping sync due to transaction in flight.")
+      (let* ((table-names  (map car tables))
+	     (dest-exists  (file-exists? destdbfile)))
+	(assert dest-exists "FATAL: sync called with non-existant file, "destdbfile)
+	;; attach the destdbfile
+	;; for each table
+	;;    insert into dest.<table> select * from src.<table> where last_update>last_update
+	;; done
+	(debug:print 0 *default-log-port* "Attaching "destdbfile" as auxdb")
+	(sqlite3:execute dbh (conc "ATTACH '"destdbfile"' AS auxdb;"))
+	(for-each
+	 (lambda (table)
+	   (let* ((tbldat (alist-ref table tables equal?))
+		  (fields (map car tbldat))
+		  (fields-str (string-intersperse fields ","))
+		  (dir    (eq? direction 'todest))
+		  (fromdb (if dir "" "auxdb."))
+		  (todb   (if dir "auxdb." ""))
+		  (stmt1 (conc "INSERT OR IGNORE INTO "todb table
+			       " SELECT * FROM "fromdb table";"))
+		  (stmt2 (conc "INSERT OR REPLACE INTO "todb table
+			       " SELECT * FROM "fromdb table" WHERE "
+			       fromdb table".last_update > "
+			       todb table".last_update;"))
+		  (stmt3 (conc "INSERT OR REPLACE INTO "todb"."table
+			       " SELECT * FROM "fromdb table";"))
+		  (stmt4 (conc "DELETE FROM "todb table" WHERE "fromdb
+			       table ".last_update > "todb table".last_update;"))
+		  (stmt5 (conc "DELETE FROM "todb table";"))
+		  (stmt6 (conc "INSERT OR REPLACE INTO "todb table" ("fields-str") SELECT "fields-str" FROM "fromdb table";"))
+		  (start-ms (current-milliseconds)))
+	     ;; (if (not (has-last-update dbh table))
+	     ;;     (sqlite3:execute dbh (conc "ALTER TABLE "table" ADD COLUMN last_update INTEGER;")))
+	     ;; (if (not (has-last-update dbh (conc "auxdb."table)))
+	     ;;     (sqlite3:execute dbh (conc "ALTER TABLE auxdb."table" ADD COLUMN last_update INTEGER;")))
+	     (if (sqlite3:auto-committing? dbh)
+		 (begin
+		   (sqlite3:with-transaction
+		    dbh
+		    (lambda ()
+		      (sqlite3:execute dbh stmt5)
+		      ;; (sqlite3:execute dbh stmt4) ;; if it worked this would be better for incremental up
+		      ;; (sqlite3:execute dbh stmt1)
+		      (sqlite3:execute dbh stmt6)
+		      ))
+		   (debug:print 0 *default-log-port* "Synced table "table
+				" in "(- (current-milliseconds) start-ms)"ms"))
+		 (debug:print 0 *default-log-port* "Skipping sync of table "table" due to transaction in flight."))))
+	 table-names)
+	(sqlite3:execute dbh "DETACH auxdb;"))))
+
+;; prefix is "" or "auxdb."
+;;
+;; (define (dbmod:last-update-patch dbh prefix)
+;;   (let ((
+  
+;;======================================================================
+;; Moved from dbfile
+;;======================================================================
+
 
 )

Index: dcommon.scm
==================================================================
--- dcommon.scm
+++ dcommon.scm
@@ -27,10 +27,13 @@
 
 (declare (unit dcommon))
 
 (declare (uses gutils))
 (declare (uses db))
+(declare (uses commonmod))
+
+(import commonmod)
 ;; (declare (uses synchash))
 
 (include "megatest-version.scm")
 (include "common_records.scm")
 (include "db_records.scm")
@@ -635,11 +638,12 @@
 			       (common:max (map cadr col-indices))))
 	     (max-visible  (max (- (dboard:tabdat-num-tests tabdat) 15) 3))
 	     (max-col-vis  (if (> max-col 10) 10 max-col))
 	     (numrows      1)
 	     (numcols      1))
-	(iup:attribute-set! stats-matrix "CLEARVALUE" "CONTENTS")
+	(if (common:low-noise-print 60 "runs-stats-update-clear")
+	    (iup:attribute-set! stats-matrix "CLEARVALUE" "CONTENTS"))
 	(iup:attribute-set! stats-matrix "NUMCOL" max-col )
 	(iup:attribute-set! stats-matrix "NUMLIN" (if (< max-row max-visible) max-visible max-row)) ;; min of 20
 	(iup:attribute-set! stats-matrix "NUMCOL_VISIBLE" max-col-vis)
 	(iup:attribute-set! stats-matrix "NUMLIN_VISIBLE" (if (> max-row max-visible) max-visible max-row))
 	;;(print "row-indices: " row-indices " col-indices: " col-indices)
@@ -704,11 +708,13 @@
 				     #:numlin-visible 5
 				     ))
 	 (colnames       (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId"))
 	 (updater        (lambda ()
 			   (if (dashboard:monitor-changed? commondat tabdat)
-			       (let ((servers  (server:choose-server *toppath* 'all-valid))) ;; (server:get-list *toppath* limit: 10)))
+			       (let ((servers  (case (rmt:transport-mode)
+						 ((http)(server:choose-server *toppath* 'all-valid)) ;; (server:get-list *toppath* limit: 10)))
+						 (else '()))))
 				 (iup:attribute-set! servers-matrix "NUMLIN" (length servers))
 				 ;; (set! colnum 0)
 				 ;; (for-each (lambda (colname)
 				 ;;    	 ;; (print "colnum: " colnum " colname: " colname)
 				 ;;    	 (iup:attribute-set! servers-matrix (conc "0:" colnum) colname)

Index: diff-report.scm
==================================================================
--- diff-report.scm
+++ diff-report.scm
@@ -17,10 +17,12 @@
 ;;
 
 (declare (unit diff-report))
 (declare (uses common))
 (declare (uses rmt))
+(declare (uses commonmod))
+(import commonmod)
          
 (include "common_records.scm")
 (use matchable)
 (use fmt)
 (use ducttape-lib)

ADDED   docs/reference/coping-with-the-tcp-time-wait-state-on-busy-linux-servers.pdf
Index: docs/reference/coping-with-the-tcp-time-wait-state-on-busy-linux-servers.pdf
==================================================================
--- /dev/null
+++ docs/reference/coping-with-the-tcp-time-wait-state-on-busy-linux-servers.pdf
cannot compute difference between binary files

ADDED   docs/reference/networking-increasing-the-max-number-of-tcp-ip-connections.pdf
Index: docs/reference/networking-increasing-the-max-number-of-tcp-ip-connections.pdf
==================================================================
--- /dev/null
+++ docs/reference/networking-increasing-the-max-number-of-tcp-ip-connections.pdf
cannot compute difference between binary files

ADDED   docs/reference/queues-dont-fix-overload.pdf
Index: docs/reference/queues-dont-fix-overload.pdf
==================================================================
--- /dev/null
+++ docs/reference/queues-dont-fix-overload.pdf
cannot compute difference between binary files

Index: ezsteps.scm
==================================================================
--- ezsteps.scm
+++ ezsteps.scm
@@ -27,10 +27,12 @@
 (declare (uses common))
 (declare (uses items))
 (declare (uses runconfig))
 ;; (declare (uses sdb))
 ;; (declare (uses filedb))
+(declare (uses commonmod))
+(import commonmod)
 
 (include "common_records.scm")
 (include "key_records.scm")
 (include "db_records.scm")
 (include "run_records.scm")
@@ -38,11 +40,11 @@
 
 ;;(rmt:get-test-info-by-id run-id test-id) -> testdat
 
 ;; TODO: deprecate me in favor of ezsteps.scm
 ;;
-(define (launch:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat)
+(define (ezsteps:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat)
   (let* ((stepname       (car ezstep))  ;; do stuff to run the step
 	 (stepinfo       (cadr ezstep))
 	;; (let ((info (cadr ezstep)))
 	;; 		   (if (proc? info) "" info)))
 	;; (stepproc       (let ((info (cadr ezstep)))
@@ -63,11 +65,12 @@
 	 (script         "") ; "#!/bin/bash\n") ;; yep, we depend on bin/bash FIXME!!!\
 	 (logpro-file    (conc stepname ".logpro"))
 	 (html-file      (conc stepname ".html"))
 	 (dat-file       (conc stepname ".dat"))
 	 (tconfig-logpro (configf:lookup testconfig "logpro" stepname))
-	 (logpro-used    (common:file-exists? logpro-file)))
+	 (logpro-used    (common:file-exists? logpro-file))
+	 (mtexepath      (common:get-megatest-exe-path)))
     (setenv "MT_STEP_NAME" stepname)
     (hash-table-set! all-steps-dat stepname `((params . ,paramparts)))
     (debug:print 0 *default-log-port* "stepparts: " stepparts ", stepparams: " stepparams
                  ", paramparts: " paramparts ", subrun: " subrun ", stepcmd: " stepcmd)
     
@@ -96,11 +99,11 @@
     
     (debug:print 4 *default-log-port* "script: " script)
     (rmt:teststep-set-status! run-id test-id stepname "start" "-" #f #f)
     ;; now launch the actual process
     (call-with-environment-variables 
-     (list (cons "PATH" (conc (get-environment-variable "PATH") ":.")))
+     (list (cons "PATH" mtexepath))
      (lambda () ;; (process-run "/bin/bash" "-c" "exec ls -l /tmp/foobar > /tmp/delme-more.log 2>&1")
        (let* ((cmd (conc stepcmd " > " stepname ".log 2>&1")) ;; >outfile 2>&1 
 	      (pid #f))
 	 (let ((proc (lambda ()
 		       (set! pid (process-run "/bin/bash" (list "-c" cmd))))))

Index: http-transport.scm
==================================================================
--- http-transport.scm
+++ http-transport.scm
@@ -153,11 +153,11 @@
 	(determine-proxy (constantly #f)))
     (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " ipaddrsstr=" ipaddrstr " portnum=" portnum " config-hostname=" config-hostname)
     (handle-exceptions
 	exn
 	(begin
-	  (print-error-message exn)
+	  ;; (print-error-message exn)
 	  (if (< portnum 64000)
 	      (begin 
 		(debug:print 0 *default-log-port* "WARNING: attempt to start server failed. Trying again ...")
 		(debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
 		(debug:print 5 *default-log-port* "exn=" (condition->list exn))
@@ -241,25 +241,17 @@
   (mutex-lock! *http-mutex*)
   (set! *http-requests-in-progress* (+ 1 *http-requests-in-progress*)))
 
 ;; Send "cmd" with json payload "params" to serverdat and receive result
 ;;
-(define (http-transport:client-api-send-receive run-id serverdat cmd params #!key (numretries 3)(area-dat #f))
-  (let* ((fullurl    (if (vector? serverdat)
-			 (http-transport:server-dat-get-api-req serverdat)
-			 (begin
-			   (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info")
-			   (exit 1))))
+(define (http-transport:client-api-send-receive run-id runremote cmd params #!key (numretries 3))
+  (assert (remote? runremote) "FATAL: http-transport:client-api-send-receive called with serverdat="serverdat)
+  (let* ((fullurl    (remote-api-req runremote))
 	 (res        (vector #f "uninitialized"))
 	 (success    #t)
 	 (sparams    (db:obj->string params transport: 'http))
-	 (runremote  (or area-dat *runremote*))
-         (server-id   (if (vector? serverdat) 
-                           (http-transport:server-dat-get-server-id serverdat)
-                           (begin
-			     (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info")
-			     (exit 1)))))
+         (server-id  (remote-server-id runremote)))
        (debug:print-info 11 *default-log-port* "cmd=" cmd " fullurl=" fullurl " server-id=" server-id " current time:" (current-seconds)) 
 
        ;; set up the http-client here
        (max-retry-attempts 1)
        ;; consider all requests indempotent
@@ -286,21 +278,13 @@
                                                       (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".")
                                                       (debug:print 0 *default-log-port* " message: " msg ", exn=" exn)
                                                       (debug:print 0 *default-log-port* " cmd: " cmd " params: " params " key:" (or server-id "thekey"))
                                                       (debug:print 0 *default-log-port* " call-chain: " call-chain)))
 						;; what if another thread is communicating ok? Can't happen due to mutex
-						(set! *runremote* #f)
-						(set! runremote #f)
-						;; (if runremote
-						;;    (remote-conndat-set! runremote #f))
-						;; Killing associated server to allow clean retry.")
-						;; (tasks:kill-server-run-id run-id)  ;; better to kill the server in the logic that called this routine?
+						(http-transport:close-connections runremote)
 						(mutex-unlock! *http-mutex*)
-						;; (signal (make-composite-condition
-						;;          (make-property-condition 'commfail 'message "failed to connect to server")))
-						;; "communications failed"
-						(close-all-connections!)
+						;; (close-connection! fullurl)
 						(db:obj->string #f))
 					      (with-input-from-request ;; was dat
 					       fullurl 
 					       (list (cons 'key (or server-id   "thekey"))
 						     (cons 'cmd cmd)
@@ -345,67 +329,28 @@
 		       'timeout
 		       'message "nmsg-transport:client-api-send-receive-raw timed out talking to server")))))))
 
 ;; careful closing of connections stored in *runremote*
 ;;
-(define (http-transport:close-connections #!key (area-dat #f))
-  (let* ((runremote  (or area-dat *runremote*))
-	 (server-dat (if runremote
-                         (remote-conndat runremote)
-                         #f))) ;; (hash-table-ref/default *runremote* run-id #f)))
-    (if (vector? server-dat)
-	(let ((api-dat (http-transport:server-dat-get-api-uri server-dat)))
-	  (handle-exceptions
+(define (http-transport:close-connections runremote)
+  (if (remote? runremote)
+      (let ((api-dat (remote-api-uri runremote)))
+	(handle-exceptions
 	    exn
-	    (begin
-	      (print-call-chain *default-log-port*)
-	      (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))
-	    (close-connection! api-dat)
-            (close-idle-connections!)
-	    #t))
-	#f)))
-
-
-(define (make-http-transport:server-dat)(make-vector 6))
-(define (http-transport:server-dat-get-iface         vec)    (vector-ref  vec 0))
-(define (http-transport:server-dat-get-port          vec)    (vector-ref  vec 1))
-(define (http-transport:server-dat-get-api-uri       vec)    (vector-ref  vec 2))
-(define (http-transport:server-dat-get-api-url       vec)    (vector-ref  vec 3))
-(define (http-transport:server-dat-get-api-req       vec)    (vector-ref  vec 4))
-(define (http-transport:server-dat-get-last-access   vec)    (vector-ref  vec 5))
-;(define (http-transport:server-dat-get-socket        vec)    (vector-ref  vec 6))
-(define (http-transport:server-dat-get-server-id     vec)    (vector-ref  vec 6))
-
-(define (http-transport:server-dat-make-url vec)
-  (if (and (http-transport:server-dat-get-iface vec)
-	   (http-transport:server-dat-get-port  vec))
-      (conc "http://" 
-	    (http-transport:server-dat-get-iface vec)
-	    ":"
-	    (http-transport:server-dat-get-port  vec))
-      #f))
-
-(define (http-transport:server-dat-update-last-access vec)
-  (if (vector? vec)
-      (vector-set! vec 5 (current-seconds))
-      (begin
-	(print-call-chain (current-error-port))
-	(debug:print-error 0 *default-log-port* "call to http-transport:server-dat-update-last-access with non-vector!!"))))
-
-;;
-;; connect
-;;
-(define (http-transport:client-connect iface port server-id)
-  (debug:print-info 0 *default-log-port* "Connecting to client at "iface":"port", with server-id "server-id)
-  (let* ((api-url      (conc "http://" iface ":" port "/api"))
-	 (api-uri      (uri-reference (conc "http://" iface ":" port "/api")))
-	 (api-req      (make-request method: 'POST uri: api-uri))
-	 (server-dat   (vector iface port api-uri api-url api-req (current-seconds) server-id)))
-    server-dat))
-
-
-
+	  (begin
+	    (print-call-chain *default-log-port*)
+	    (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))
+	  (if (args:any-defined? "-server" "-execute" "-run")
+	      (debug:print-info 0 *default-log-port* "Closing connections to "api-dat))
+	  (if api-dat (close-connection! api-dat))
+
+	  ;; Would it be better to set *runremote* to #f? I don't think so. But we may
+	  ;; need to clear more of the runremote fields
+	  (remote-api-url-set! runremote #f) ;; used as a flag for connection up and running
+	  
+	  #t))
+      #f))
 
 ;; run http-transport:keep-running in a parallel thread to monitor that the db is being 
 ;; used and to shutdown after sometime if it is not.
 ;;
 (define (http-transport:keep-running) 
@@ -429,11 +374,11 @@
                           (set! sdat *server-info*)
                           (mutex-unlock! *heartbeat-mutex*)
                           (if (and sdat
 				   (not changed)
 				   (> (- (current-seconds) start-time) 2))
-			      (let* ((servinfodir (conc *toppath*"/.servinfo"))
+			      (let* ((servinfodir (server:get-servinfo-dir *toppath*)) ;; (conc *toppath*"/.servinfo"))
 				     (ipaddr      (car sdat))
 				     (port        (cadr sdat))
 				     (servinf     (conc servinfodir"/"ipaddr":"port)))
 				(set! servinfofile servinf)
 				(if (not (file-exists? servinfodir))
@@ -448,34 +393,23 @@
 						       (lambda ()
 							 (delete-file* servinf))
 						       *on-exit-procs*))
 				;; put data about this server into a simple flat file host.port
 				(debug:print-info 0 *default-log-port* "Received server alive signature")
-                                #;(common:save-pkt `((action . alive)
-                                                   (T      . server)
-                                                   (pid    . ,(current-process-id))
-                                                   (ipaddr . ,(car sdat))
-                                                   (port   . ,(cadr sdat)))
-                                                 *configdat* #t)
 				sdat)
                               (begin
 				(debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat)
                                 (sleep 4)
 				(if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
-				    (let* ((ipaddr  (car sdat))
+                                    (if sdat 
+				      (let* ((ipaddr  (car sdat))
 					   (port    (cadr sdat))
-					   (servinf (conc *toppath*"/.servinfo/"ipaddr":"port)))
-				      (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server")
-				      ;; (delete-file* servinf) ;; handled by on-exit, can be removed
-                                      #;(common:save-pkt `((action . died)
-                                                         (T      . server)
-                                                         (pid    . ,(current-process-id))
-                                                         (ipaddr . ,(car sdat))
-                                                         (port   . ,(cadr sdat))
-                                                         (msg    . "Transport died?"))
-						       *configdat* #t)
+					   (servinf (conc (server:get-servinfo-dir *toppath*)"/"ipaddr":"port)))
+				        (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server")
 				      (exit))
+                                      (exit)
+                                    )
 				    (loop start-time
 					  (equal? sdat last-sdat)
 					  sdat)))))))
 	 (iface       (car server-info))
          (port        (cadr server-info))
@@ -504,11 +438,12 @@
 	  (if (and no-sync-db
 		   (common:low-noise-print 10 "sync-all")) ;; cheesy way to reduce frequency of running sync :)
               (begin
 		(if (common:low-noise-print 120 "sync-all-print")
                     (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S")))
-		(db:all-db-sync *dbstruct-dbs*))))
+		(db:all-db-sync *dbstruct-dbs*)
+		)))
       
       ;; when things go wrong we don't want to be doing the various queries too often
       ;; so we strive to run this stuff only every four seconds or so.
       (let* ((sync-time (- (current-milliseconds) start-time))
 	    (rem-time  (quotient (- 4000 sync-time) 1000)))
@@ -560,22 +495,22 @@
               (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
 	      (let ((curr-time (current-seconds)))
 		(handle-exceptions
 		    exn
 		    (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on info file " servinfofile ". Are you out of space on that disk? exn=" exn)
-		    (if (and (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter
+		    (if (and ;; (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter
 			     (not *server-overloaded*)
 			     (file-exists? servinfofile))
 			(change-file-times servinfofile curr-time curr-time)))
-		(if (or (common:low-noise-print 120 "start new server")
+		(if (and (common:low-noise-print 120 "start new server")
 			(> *api-process-request-count* 50)) ;; if this server is kind of busy start up another
 		    (begin
-		      (debug:print-info 0 *default-log-port* "Server is busy, parallel-api-count "*api-process-request-count*", start another if possible...")
+		      (debug:print-info 0 *default-log-port* "Server is busy, api-count "*api-process-request-count*", start another if possible...")
 		      (server:kind-run *toppath*)
 		      (if (> *api-process-request-count* 100)
 			  (begin
-			    (debug:print-info 0 *default-log-port* "Server is overloaded at parallel-api-count="*api-process-request-count*", removing "servinfofile) 
+			    (debug:print-info 0 *default-log-port* "Server is overloaded at api-count=" *api-process-request-count*", removing "servinfofile) 
 			    (delete-file* servinfofile)))))))
           (loop 0 server-state bad-sync-count (current-milliseconds)))
          (else
           (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access))
           (http-transport:server-shutdown port)))))))
@@ -621,59 +556,36 @@
 ;; all routes though here end in exit ...
 ;;
 ;; start_server? 
 ;;
 (define (http-transport:launch)
-  ;; check that a server start is in progress, pause or exit if so
-  (let* ((tmp-area            (common:get-db-tmp-area))
-	 (server-start        (conc tmp-area "/.server-start"))
-	 (server-started      (conc tmp-area "/.server-started"))
-	 (start-time          (common:lazy-modification-time server-start))
-	 (started-time        (common:lazy-modification-time server-started))
-	 (server-starting     (< start-time started-time)) ;; if start-time is less than started-time then a server is still starting
-	 (start-time-old      (> (- (current-seconds) start-time) 5))
-         (cleanup-proc        (lambda (msg)
-                                (let* ((serv-fname      (conc "server-" (current-process-id) "-" (get-host-name) ".log"))
-                                       (full-serv-fname (conc *toppath* "/logs/" serv-fname))
-                                       (new-serv-fname  (conc *toppath* "/logs/" "defunct-" serv-fname)))
-                                  (debug:print 0 *default-log-port* msg)
-                                  (if (common:file-exists? full-serv-fname)
-                                      (system (conc "sleep 1;mv -f " full-serv-fname " " new-serv-fname))
-                                      (debug:print 0 *default-log-port* "INFO: cannot move " full-serv-fname " to " new-serv-fname))
-                                  (exit)))))
-    #;(if (and (not start-time-old) ;; last server start try was less than five seconds ago
-	     (not server-starting))
-	(begin
-	  (cleanup-proc "NOT starting server, there is either a recently started server or a server in process of starting")
-	  (exit)))
-    ;; lets not even bother to start if there are already three or more server files ready to go
-    #;(let* ((num-alive   (server:get-num-alive (server:get-list *toppath*))))
-      (if (> num-alive 3)
-          (begin
-            (cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up"))
-            (exit))))
-    #;(common:save-pkt `((action . start)
-		       (T      . server)
-		       (pid    . ,(current-process-id)))
-		     *configdat* #t)
-    (let* ((th2 (make-thread (lambda ()
-                               (debug:print-info 0 *default-log-port* "Server run thread started")
-                               (http-transport:run 
-                                (if (args:get-arg "-server")
-                                    (args:get-arg "-server")
-                                    "-")
-                                )) "Server run"))
-           (th3 (make-thread (lambda ()
-                               (debug:print-info 0 *default-log-port* "Server monitor thread started")
-                               (http-transport:keep-running)
-                               "Keep running"))))
-      (thread-start! th2)
-      (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor.
-      (thread-start! th3)
-      (set! *didsomething* #t)
-      (thread-join! th2)
-      (exit))))
+  ;; check the .servinfo directory, are there other servers running on this
+  ;; or another host?
+  (let* ((server-start-is-ok  (server:minimal-check *toppath*)))
+    (if (not server-start-is-ok)
+	(begin
+	  (debug:print 0 *default-log-port* "ERROR: server start not ok, exiting now.")
+	  (exit 1))))
+    
+  ;; check that a server start is in progress, pause or exit if so
+  (let* ((th2 (make-thread (lambda ()
+                             (debug:print-info 0 *default-log-port* "Server run thread started")
+                             (http-transport:run 
+                              (if (args:get-arg "-server")
+                                  (args:get-arg "-server")
+                                  "-")
+                              )) "Server run"))
+         (th3 (make-thread (lambda ()
+                             (debug:print-info 0 *default-log-port* "Server monitor thread started")
+                             (http-transport:keep-running)
+                             "Keep running"))))
+    (thread-start! th2)
+    (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor.
+    (thread-start! th3)
+    (set! *didsomething* #t)
+    (thread-join! th2)
+    (exit)))
 
 ;; (define (http-transport:server-signal-handler signum)
 ;;   (signal-mask! signum)
 ;;   (handle-exceptions
 ;;    exn

Index: index-tree.scm
==================================================================
--- index-tree.scm
+++ index-tree.scm
@@ -29,10 +29,12 @@
 (declare (uses lock-queue))
 (declare (uses db))
 (declare (uses common))
 (declare (uses items))
 (declare (uses runconfig))
+(declare (uses commonmod))
+(import commonmod)
 
 (include "common_records.scm")
 (include "key_records.scm")
 (include "db_records.scm")
 (include "run_records.scm")

Index: items.scm
==================================================================
--- items.scm
+++ items.scm
@@ -21,10 +21,12 @@
 ;; 		     (temperature "cool medium hot")
 ;; 		     (season      "summer winter fall spring")))
 
 (declare (unit items))
 (declare (uses common))
+(declare (uses commonmod))
+(import commonmod)
 
 (include "common_records.scm")
 
 ;; Puts out all combinations
 (define (process-itemlist hierdepth curritemkey itemlist)

Index: keys.scm
==================================================================
--- keys.scm
+++ keys.scm
@@ -24,10 +24,12 @@
 (use sqlite3 srfi-1 posix regex regex-case srfi-69)
 (import (prefix sqlite3 sqlite3:))
 
 (declare (unit keys))
 (declare (uses common))
+(declare (uses commonmod))
+(import commonmod)
 
 (include "key_records.scm")
 (include "common_records.scm")
 
 (define (keys->keystr keys) ;; => key1,key2,key3,additiona1, ...

Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -29,18 +29,23 @@
 (import (prefix sqlite3 sqlite3:))
 
 (declare (unit launch))
 (declare (uses subrun))
 (declare (uses common))
+(declare (uses commonmod))
 (declare (uses configf))
 (declare (uses db))
 (declare (uses ezsteps))
+(declare (uses dbfile))
 
 (include "common_records.scm")
 (include "key_records.scm")
 (include "db_records.scm")
 (include "megatest-fossil-hash.scm")
+
+(import commonmod
+	dbfile)
 
 ;;======================================================================
 ;; ezsteps
 ;;======================================================================
 
@@ -183,11 +188,11 @@
 			     (tal    (cdr ezstepslst))
 			     (prevstep #f))
                     (debug:print-info 0 *default-log-port* "Processing ezstep \"" (string-intersperse ezstep " ") "\"")
 		    ;; check exit-info (vector-ref exit-info 1)
 		    (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1)
-			(let* ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat))
+			(let* ((logpro-used (ezsteps:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat))
 			       (stepname    (car ezstep))
 			       (stepparms   (hash-table-ref all-steps-dat stepname)))
 			  (setenv "MT_STEP_NAME" stepname)
 			  (pp (hash-table->alist all-steps-dat))
 			  ;; if logpro-used read in the stepname.dat file
@@ -205,11 +210,11 @@
                         )
 
                         ))))))
 
 (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)
-  (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30")))
+  (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "60")))
          (start-seconds (current-seconds))
 	 (calc-minutes  (lambda ()
 			  (inexact->exact 
 			   (round 
 			    (- 
@@ -239,13 +244,13 @@
                                        (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                   df
                                   #f)))
              (do-sync       (or new-cpu-load new-disk-free over-time))
 
-             (test-info   (rmt:get-test-info-by-id run-id test-id))
-             (state       (db:test-get-state test-info))
-             (status      (db:test-get-status test-info))
+             (test-info   (rmt:get-test-state-status-by-id run-id test-id))
+             (state       (car test-info));; (db:test-get-state test-info))
+             (status      (cdr test-info));; (db:test-get-status test-info))
              (kill-reason  "no kill reason specified")
              (kill-job?    #f))
         ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
         (cond
          ((test-get-kill-request run-id test-id)
@@ -259,11 +264,12 @@
           (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
           ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
           (set! kill-job? #f)))
 
         (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
-        (launch:handle-zombie-tests run-id)
+        (if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty
+	    (launch:handle-zombie-tests run-id))
         (when do-sync
           ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
           ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
           ;; (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
           (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
@@ -314,11 +320,11 @@
 		      (debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2)
 		      (tests:test-set-status! run-id test-id "KILLED"  "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ;; BB ADDED kill-reason -- confirm OK with Matt
 		      )))
 	      (mutex-unlock! m)
 	      ;; no point in sticking around. Exit now. But run end of run before exiting?
-        (launch:end-of-run-check run-id)
+              (launch:end-of-run-check run-id)
 	      (exit)))
 	(if (hash-table-ref/default misc-flags 'keep-going #f)
 	    (begin
 	      (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
 	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
@@ -765,20 +771,28 @@
 ;; new
 ;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup
 ;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na
 ;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
 ;; 0 RUNNING ==> this is actually the first condition, should not get here
-
+(define *last-rollup* 0)
 (define (launch:end-of-run-check run-id )
     (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))  
-           (running-cnt (rmt:get-count-tests-running-for-run-id run-id))
+           (running-cnt       (rmt:get-count-tests-running-for-run-id run-id))
            (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id)))
-           (current-state (rmt:get-run-state run-id))
-           (current-status (rmt:get-run-status run-id)))
-     ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing 
-     (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)                      
-     (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)
+	   (current-state-status (rmt:get-run-state-status run-id))
+           (current-state        (car current-state-status))  ;; (rmt:get-run-state run-id))
+           (current-status       (cdr current-state-status))) ;; (rmt:get-run-status run-id)))
+      ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing 
+      (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)
+      ;;
+      ;; TODO: add a final rollup when run is done (if there isn't one already)
+      ;;
+      (if (or (< running-cnt 3)                              ;; have only few running
+	      (> (- (current-seconds) *last-rollup*) 10))    ;; or haven't rolled up in past ten seconds
+	  (begin
+	    (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)
+	    (set! *last-rollup* (current-seconds))))
      (runs:update-junit-test-reporter-xml run-id) 
      (cond 
        ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" ))
                 (if (and (equal? (rmt:get-var (conc "end-of-run-" run-id)) "no") (common:simple-lock (conc "endOfRun" run-id)))
                 (begin
@@ -1131,11 +1145,14 @@
 	      (setenv "MT_TESTSUITENAME" (common:get-testsuite-name)))
 	    (begin
 	      (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area.")
 	      (set! *toppath* #f) ;; force it to be false so we return #f
 	      #f))
-	
+
+	;; needed by various transport and db modules
+	(dbfile:testsuite-name (common:get-testsuite-name)) ;; (get-testsuite-name *toppath* *configdat*))
+
         ;; one more attempt to cache the configs for future reading
         (let* ((cachefiles   (launch:get-cache-file-paths areapath toppath target mtconfig))
                (mtcachef     (car cachefiles))
                (rccachef     (cdr cachefiles)))
 
@@ -1567,11 +1584,11 @@
 					;; (list 'serverinf *server-info*)
 					#;(list 'homehost  (let* ((hhdat (server:get-homehost)))
 							   (if hhdat
 							       (car hhdat)
 							       #f)))
-					(list 'serverurl (if *runremote*
+					#;(list 'serverurl (if *runremote* ;; would like to add this back ... WORK NEEDED
 							     (remote-server-url *runremote*)
 							     #f)) ;;
 					(list 'areaname  (common:get-testsuite-name))
 					(list 'toppath   *toppath*)
 					(list 'work-area work-area)

Index: lock-queue.scm
==================================================================
--- lock-queue.scm
+++ lock-queue.scm
@@ -19,10 +19,12 @@
 (use (prefix sqlite3 sqlite3:) srfi-18)
 
 (declare (unit lock-queue))
 (declare (uses common))
 (declare (uses tasks))
+(declare (uses commonmod))
+(import commonmod)
 
 ;;======================================================================
 ;; attempt to prevent overlapping updates of rollup files by queueing
 ;; update requests in an sqlite db
 ;;======================================================================

Index: margs.scm
==================================================================
--- margs.scm
+++ margs.scm
@@ -23,10 +23,20 @@
 
 (define (args:get-arg arg . default)
   (if (null? default)
       (hash-table-ref/default args:arg-hash arg #f)
       (hash-table-ref/default args:arg-hash arg (car default))))
+
+;; get an arg as a number
+(define (args:get-arg-number arg . default)
+  (let* ((val-str (args:get-arg arg))
+	 (val     (if val-str (string->number val-str) #f)))
+    (if val
+	val
+	(if (null? default)
+	    #f
+	    default))))
 
 (define (args:any? . args)
   (not (null? (filter (lambda (x) x)
 		      (map args:get-arg args)))))
 

Index: megatest-version.scm
==================================================================
--- megatest-version.scm
+++ megatest-version.scm
@@ -18,6 +18,6 @@
 ;; Always use two or four digit decimal
 ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..
 
 ;; (declare (unit megatest-version))
 
-(define megatest-version 1.7009)
+(define megatest-version 1.8007)

Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -23,10 +23,18 @@
 (define (toplevel-command . a) #f)
 
 (declare (uses common))
 ;; (declare (uses megatest-version))
 (declare (uses margs))
+(declare (uses mtargs))
+(declare (uses mtargs.import))
+(declare (uses commonmod))
+(declare (uses commonmod.import))
+(declare (uses mtargs))
+(declare (uses mtargs.import))
+(declare (uses debugprint))
+(declare (uses debugprint.import))
 (declare (uses runs))
 (declare (uses launch))
 (declare (uses server))
 (declare (uses client))
 (declare (uses tests))
@@ -41,27 +49,29 @@
 (declare (uses api))
 (declare (uses tasks)) ;; only used for debugging.
 (declare (uses env))
 (declare (uses diff-report))
 (declare (uses db))
+(declare (uses dbfile))
+(declare (uses dbfile.import))
 (declare (uses dbmod))
 (declare (uses dbmod.import))
-(declare (uses commonmod))
-(declare (uses commonmod.import))
-(declare (uses dbfile))
-(declare (uses dbfile.import))
+(declare (uses tcp-transportmod))
+(declare (uses tcp-transportmod.import))
 ;; (declare (uses debugprint))
 ;; (declare (uses debugprint.import))
-;; (declare (uses mtargs))
-;; (declare (uses mtargs.import))
 
 ;; (declare (uses ftail))
 ;; (import ftail)
 
-(import dbmod
+(import mtargs
+        debugprint
+	dbmod
 	commonmod
-	dbfile)
+	dbfile
+	tcp-transportmod
+        )
 
 (define *db* #f) ;; this is only for the repl, do not use in general!!!!
 
 (include "common_records.scm")
 (include "key_records.scm")
@@ -69,11 +79,11 @@
 (include "run_records.scm")
 (include "megatest-fossil-hash.scm")
 
 (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (prefix base64 base64:)
      readline apropos json http-client directory-utils typed-records
-     http-client srfi-18 extras format)
+     http-client srfi-18 extras format tcp-server tcp)
 
 ;; Added for csv stuff - will be removed
 ;;
 (use sparse-vectors)
 
@@ -80,10 +90,14 @@
 (require-library mutils)
 
 (define *usage-log-file* #f)    ;; put path to file for logging usage in this var in the ~/.megatestrc file
 (define *usage-use-seconds* #t) ;; for Epoc seconds in usage logging change this to #t in ~/.megatestrc file
 
+;; set some parameters here - these need to be put in something that can be loaded from other
+;; executables such as dashboard and mtutil
+;;
+(include "transport-mode.scm")
 (dbfile:db-init-proc db:initialize-main-db)
 
 ;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file
 ;;
 (let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc")))
@@ -373,10 +387,11 @@
 			"-load"        ;; load and exectute a scheme file
 			"-section"
 			"-var"
 			"-dumpmode"
 			"-run-id"
+			"-db"
 			"-ping"
 			"-refdb2dat"
 			"-o"
 			"-log"
                         "-sync-log"
@@ -585,16 +600,16 @@
 ;; where (launch:setup) returns #f?
 ;;
 (if (or (args:get-arg "-log")(args:get-arg "-server")) ;; redirect the log always when a server
     (handle-exceptions
 	exn
-	(begin
-	  (print "ERROR: Failed to switch to log output. " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)
-	  )
+      (begin
+	(print "ERROR: Failed to switch to log output. " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))
       (let* ((tl   (or (args:get-arg "-log")(launch:setup)))   ;; run launch:setup if -server, ensure we do NOT run launch:setup if -log specified
+	     (dbname (args:get-arg "-db"))   ;; for the server logfile name
 	     (logf (or (args:get-arg "-log") ;; use -log unless we are a server, then craft a logfile name
-		       (conc tl "/logs/server-" (current-process-id) "-" (get-host-name) ".log")))
+		       (conc tl "/logs/server-"(or dbname "unk")"-"(current-process-id) "-" (get-host-name) ".log")))
 	     (oup  (open-logfile logf)))
 	(if (not (args:get-arg "-log"))
 	    (hash-table-set! args:arg-hash "-log" logf)) ;; fake out future queries of -log
 	(debug:print-info 0 *default-log-port* "Sending log output to " logf)
 	(set! *default-log-port* oup))))
@@ -654,23 +669,10 @@
 ;; for some switches always print the command to stderr
 ;;
 (if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun")
     (debug:print 0 *default-log-port* (string-intersperse (argv) " ")))
 
-;; some switches imply homehost. Exit here if not on homehost
-;;
-(let ((homehost-required  (list "-cleanup-db")))
-  (if (apply args:any? homehost-required)
-      (if (not (server:choose-server *toppath* 'home?))
-	  (for-each
-	   (lambda (switch)
-	     (if (args:get-arg switch)
-		 (begin
-		   (debug:print 0 *default-log-port* "ERROR: you must be on the homehost to run with " switch
-				", you can move homehost by removing the .homehost file but this will disrupt any runs in progress.")
-		   (exit 1))))
-	   homehost-required))))
 
 ;;======================================================================
 ;; Misc setup stuff
 ;;======================================================================
 
@@ -934,13 +936,24 @@
 ;;======================================================================
 
 ;; Server? Start up here.
 ;;
 (if (args:get-arg "-server")
-    (let ((tl        (launch:setup))
-          (transport-type (string->symbol (or (args:get-arg "-transport") "http"))))
-      (server:launch 0 transport-type)
+    (let* (;; (run-id     (args:get-arg "-run-id"))
+	   (dbfname    (args:get-arg "-db"))
+	   (tl         (launch:setup))
+	   (keys       (keys:config-get-fields *configdat*)))
+      (case (rmt:transport-mode)
+	((http)(http-transport:launch))
+	((tcp)
+	 (debug:print 0 *default-log-port* "INFO: Running using tcp method.")
+	 (if dbfname
+	     (tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys)
+	     (begin
+	       (debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -db is required.")
+	       (exit 1))))
+	(else (debug:print 0 *default-log-port* "ERROR: rmt:transport-mode value not recognised "(rmt:transport-mode))))
       (set! *didsomething* #t)))
 
 ;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to
 ;; a specific Megatest area. Detail are being hashed out and this may change.
 ;;
@@ -953,20 +966,26 @@
         (args:get-arg "-kill-servers"))
     (let ((tl (launch:setup)))
       (if tl ;; all roads from here exit
 	  (let* ((servers (server:choose-server *toppath* 'all-valid)) ;; (server:get-list *toppath*))
 		 (fmtstr  "~33a~22a~20a~20a~8a\n"))
-	    (format #t fmtstr "ID" "host:port" "age (hms)" "Last mod" "State")
-	    (format #t fmtstr "==" "=========" "=========" "========" "=====")
-	    (for-each ;;  ( mod-time host port start-time pid )
+            (if (not servers)
+              (begin
+                (debug:print-info 1 *default-log-port* "No servers found")
+                (exit)
+              )
+            )
+       	    (format #t fmtstr "PID" "host:port" "age (hms)" "Last mod" "State")
+	    (format #t fmtstr "===" "=========" "=========" "========" "=====")
+	    (for-each ;;  (ip-addr port? mod-time host port start-time pid )
 	     (lambda (server)
-	       (let* ((mtm (any->number (car server)))
+	       (let* ((mtm (any->number (caddr server)))
 		      (mod (if mtm (- (current-seconds) mtm) "unk"))
-		      (age (- (current-seconds)(or (any->number (list-ref server 3)) (current-seconds))))
-		      (url (conc (cadr server) ":" (caddr server)))
+		      (age (- (current-seconds)(or (any->number mtm) (current-seconds))))
 		      (pid (list-ref server 4))
-		      (alv (if (number? mod)(< mod 10) #f)))
+		      (url (conc (car server) ":" (cadr server)))
+		      (alv (if (number? mod)(< mod 360) #f)))
 		 (format #t
 			 fmtstr
 			 pid
 			 url
 			 (seconds->hr-min-sec age)
@@ -979,11 +998,10 @@
 		       (server:kill server)))))
 	     (sort servers (lambda (a b)
 			     (let ((ma (or (any->number (car a)) 9e9))
 				   (mb (or (any->number (car b)) 9e9)))
 			       (> ma mb)))))
-	    ;; (debug:print-info 1 *default-log-port* "Done with listservers")
 	    (set! *didsomething* #t)
 	    (exit))
 	  (exit))))
       ;; must do, would have to add checks to many/all calls below
 
@@ -1383,12 +1401,11 @@
 ;; IDEA: megatest list -runname blah% ...
 ;;
 (if (or (args:get-arg "-list-runs")
 	(args:get-arg "-list-db-targets"))
     (if (launch:setup)
-	(let* (;; (dbstruct    (make-dbr:dbstruct path: *toppath* local: (args:get-arg "-local")))
-	       (runpatt     (args:get-arg "-list-runs"))
+	(let* ((runpatt     (args:get-arg "-list-runs"))
                (access-mode (db:get-access-mode))
 	       (testpatt    (common:args-get-testpatt #f))
 	       ;; (if (args:get-arg "-testpatt") 
 	       ;;  	        (args:get-arg "-testpatt") 
 	       ;;  	        "%"))
@@ -1433,10 +1450,15 @@
 				  db:test-record-fields
 				  t)))
 	       (adj-tests-spec (delete-duplicates (if tests-spec (cons "id" tests-spec) db:test-record-fields))) ;; '("id"))))
 	       (steps-spec  (alist-ref "steps" fields-spec equal?))
 	       (test-field-index (make-hash-table)))
+	  (if (and (args:get-arg "-dumpmode")
+		   (not (member (args:get-arg "-dumpmode") '("sexpr" "json" "ods" "list"))))
+	      (begin
+		(debug:print 0 *default-log-port* "ERROR: dumpmode "(args:get-arg "-dumpmode")" not recognised. Use sexpr, json, ods or list")
+		(exit)))
 	  (if (and tests-spec (not (null? tests-spec))) ;; do some validation and processing of the test-spec
 	      (let ((invalid-tests-spec (filter (lambda (x)(not (member x db:test-record-fields))) tests-spec)))
 		(if (null? invalid-tests-spec)
 		    ;; generate the lookup map test-field-name => index-number
 		    (let loop ((hed (car adj-tests-spec))
@@ -1488,11 +1510,11 @@
 			;; (mutils:hierhash-set! data (conc (db:get-value-by-header run header "id"))  targetstr runname "meta" "id"         )
 			;; (mutils:hierhash-set! data (db:get-value-by-header run header "event_time") targetstr runname "meta" "event_time" )
 			;; (mutils:hierhash-set! data (db:get-value-by-header run header "comment")    targetstr runname "meta" "comment"    )
 			;; ;; add last entry twice - seems to be a bug in hierhash?
 			;; (mutils:hierhash-set! data (db:get-value-by-header run header "comment")    targetstr runname "meta" "comment"    )
-		       (else
+		       ((#f list)
 			(if (null? runs-spec)
 			    (print "Run: " targetstr "/" runname 
 				   " status: " (db:get-value-by-header run header "state")
 				   " run-id: " run-id ", number tests: " (length tests)
 				   " event_time: " (db:get-value-by-header run header "event_time"))
@@ -1504,11 +1526,14 @@
 			       (lambda (field-name)
 				 (if (equal? field-name "target")
 				     (display (conc "target: " targetstr " "))
 				     (display (conc field-name ": " (db:get-value-by-header run header (conc field-name)) " "))))
 			       runs-spec)
-			      (newline)))))
+			      (newline))))
+		       (else
+			(debug:print 0 *default-log-port* "ERROR: dumpmode "(args:get-arg "-dumpmode")" not recognised. Use sexpr, json, ods or list")
+			))
 		       
 		     (for-each 
 		      (lambda (test)
 		      	(common:debug-handle-exceptions #f
 			 exn
@@ -2054,11 +2079,11 @@
 (if (args:get-arg "-extract-ods")
     (general-run-call
      "-extract-ods"
      "Make ods spreadsheet"
      (lambda (target runname keys keyvals)
-       (let ((dbstruct   (make-dbr:dbstruct path: *toppath* local: #t))
+       (let ((dbstruct   (make-dbr:dbstruct areapath: *toppath* local: #t))
 	     (outputfile (args:get-arg "-extract-ods"))
 	     (runspatt   (or (args:get-arg "-runname")(args:get-arg ":runname")))
 	     (pathmod    (args:get-arg "-pathmod")))
 	     ;; (keyvalalist (keys->alist keys "%")))
 	 (debug:print 2 *default-log-port* "Extract ods, outputfile: " outputfile " runspatt: " runspatt " keyvals: " keyvals)
@@ -2321,10 +2346,16 @@
     (begin
       (if (not (launch:setup))
 	  (begin
 	    (debug:print 0 *default-log-port* "Failed to setup, exiting") 
 	    (exit 1)))
+
+;;      (if (not (server:choose-server *toppath* 'home?))
+;;	  (begin
+;;	    (debug:print 0 *default-log-port* "Servers are not running on this host or no servers alive. Cannot run cleanup-db")
+;;	    (exit 1)))
+
       (let ((dbstructs (db:setup #f)))
         (common:cleanup-db dbstructs))
       (set! *didsomething* #t)))
 
 (if (args:get-arg "-mark-incompletes")
@@ -2379,11 +2410,13 @@
 (if (or (getenv "MT_RUNSCRIPT")
 	(args:get-arg "-repl")
 	(args:get-arg "-load"))
     (let* ((toppath (launch:setup))
 	   (dbstructs (if (and toppath
-                               (server:choose-server toppath 'home?))
+			       ;; NOTE: server:choose-server is starting a server
+			       ;;   either add equivalent for tcp mode or ????
+                               #;(server:choose-server toppath 'home?))
                           (db:setup #t)
                           #f))) ;; make-dbr:dbstruct path: toppath local: (args:get-arg "-local")) #f)))
       (if *toppath*
 	  (cond
 	   ((getenv "MT_RUNSCRIPT")

Index: mlaunch.scm
==================================================================
--- mlaunch.scm
+++ mlaunch.scm
@@ -28,6 +28,8 @@
 (use sqlite3 srfi-1 posix regex regex-case srfi-69 format)
 
 (declare (unit mlaunch))
 (declare (uses db))
 (declare (uses common))
+(declare (uses commonmod))
+(import commonmod)
 

Index: monitor.scm
==================================================================
--- monitor.scm
+++ monitor.scm
@@ -23,10 +23,12 @@
 (declare (unit runs))
 (declare (uses db))
 (declare (uses common))
 (declare (uses items))
 (declare (uses runconfig))
+(declare (uses commonmod))
+(import commonmod)
 
 (include "common_records.scm")
 (include "key_records.scm")
 (include "db_records.scm")
 (include "run_records.scm")

ADDED   mtargs/mtargs.egg
Index: mtargs/mtargs.egg
==================================================================
--- /dev/null
+++ mtargs/mtargs.egg
@@ -0,0 +1,7 @@
+((license "LGPL")
+ (version 0.1)
+ (category misc)
+ (dependencies srfi-69 srfi-1)
+ (author "Matt Welland")
+ (synopsis "Primitive argument processor.")
+ (components (extension mtargs)))

Index: mtargs/mtargs.scm
==================================================================
--- mtargs/mtargs.scm
+++ mtargs/mtargs.scm
@@ -18,28 +18,45 @@
 
 (module mtargs
     (
      arg-hash
      get-arg
+     get-arg-number
      get-arg-from
-     usage
      get-args
+     usage
      print-args
      any-defined?
-     help
-     )
-
-(import scheme chicken data-structures extras posix ports files)
-(use srfi-69 srfi-1)
-
-(define arg-hash (make-hash-table))
-(define help "")
+     ) 
+
+(import scheme) ;; gives us cond-expand in chicken-4
+
+(cond-expand
+ (chicken-5
+  (import scheme (chicken base) (chicken port) (chicken file) (chicken process-context))
+  (import srfi-69 srfi-1))
+ (chicken-4
+  (import chicken posix srfi-69 srfi-1))
+ (else))
+
+(define usage (make-parameter print))
+(define arg-hash (make-hash-table))
 
 (define (get-arg arg . default)
   (if (null? default)
       (hash-table-ref/default arg-hash arg #f)
       (hash-table-ref/default arg-hash arg (car default))))
+
+;; get an arg as a number
+(define (get-arg-number arg . default)
+  (let* ((val-str (get-arg arg))
+	 (val     (if val-str (string->number val-str) #f)))
+    (if val
+	val
+	(if (null? default)
+	    #f
+	    default))))
 
 (define (any-defined? . args)
   (not (null? (filter (lambda (x) x)
 		      (map get-arg args)))))
 
@@ -48,28 +65,10 @@
 (define (get-arg-from ht arg . default)
   (if (null? default)
       (hash-table-ref/default ht arg #f)
       (hash-table-ref/default ht arg (car default))))
 
-(define (usage . args)
-  (if (> (length args) 0)
-      (apply print "ERROR: " args))
-  (if (string? help)
-      (print help)
-      (print "Usage: " (car (argv)) " ... "))
-  (exit 0))
-
- ;; one-of args defined
-(define (any-defined? . param)
-  (let ((res #f))
-    (for-each 
-     (lambda (arg)
-       (if (get-arg arg)(set! res #t)))
-     param)
-    res))
-
-;; args: 
 (define (get-args args params switches arg-hash num-needed)
   (let* ((numtargs (length args))
 	 (adj-num-needed (if num-needed (+ num-needed 2) #f)))
     (if (< numtargs (if adj-num-needed adj-num-needed 2))
 	(if (>= num-needed 1)
@@ -94,13 +93,12 @@
 	   (else
 	    (if (null? tail)(append remtargs (list arg)) ;; return the non-used args
 		(loop (car tail)(cdr tail)(append remtargs (list arg))))))))
     ))
 
-(define (print-args remtargs arg-hash)
-  (print "ARGS: " remtargs)
+(define (print-args arg-hash)
   (for-each (lambda (arg)
 	      (print "   " arg "   " (hash-table-ref/default arg-hash arg #f)))
 	    (hash-table-keys arg-hash)))
 
 
 )

Index: mtexec.scm
==================================================================
--- mtexec.scm
+++ mtexec.scm
@@ -29,10 +29,12 @@
 
 ;; (declare (uses common))
 (declare (uses margs))
 (declare (uses configf))
 ;; (declare (uses rmt))
+(declare (uses commonmod))
+(import commonmod)
 
 ;; (use ducttape-lib)
 (include "megatest-version.scm")
 (include "megatest-fossil-hash.scm")
 

Index: mtut.scm
==================================================================
--- mtut.scm
+++ mtut.scm
@@ -30,10 +30,13 @@
 
 (declare (uses common))
 (declare (uses margs))
 (declare (uses configf))
 ;; (declare (uses rmt))
+(declare (uses commonmod))
+(declare (uses commonmod.import))
+(import commonmod)
 
 (use ducttape-lib)
 
 (include "megatest-fossil-hash.scm")
 

Index: newdashboard.scm
==================================================================
--- newdashboard.scm
+++ newdashboard.scm
@@ -29,10 +29,12 @@
      (prefix dbi dbi:))
 
 (declare (uses common))
 (declare (uses megatest-version))
 (declare (uses margs))
+(declare (uses commonmod))
+(import commonmod)
 
 ;; (declare (uses launch))
 ;; (declare (uses gutils))
 ;; (declare (uses db))
 ;; (declare (uses server))

Index: ods.scm
==================================================================
--- ods.scm
+++ ods.scm
@@ -17,10 +17,12 @@
 ;;
 
 (use csv-xml regex)
 (declare (unit ods))
 (declare (uses common))
+(declare (uses commonmod))
+(import commonmod)
 
 (define ods:dirs
   '("Configurations2"
     "Configurations2/toolpanel"
     "Configurations2/menubar"

Index: portlogger.scm
==================================================================
--- portlogger.scm
+++ portlogger.scm
@@ -64,11 +64,11 @@
      exn
      (begin
        ;; (release-dot-lock fname)
        (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params)
        (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
-       (debug:print 5 *default-log-port* "exn=" (condition->list exn))
+       ;; (debug:print 5 *default-log-port* "exn=" (condition->list exn))
        (if (common:file-exists? fname)(delete-file fname)) ;; brutally get rid of it
        (print-call-chain (current-error-port)))
      (let* (;; (lock   (obtain-dot-lock fname 2 9 10))
 	    (db     (portlogger:open-db fname))
 	    (res    (apply proc db params)))

Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -21,16 +21,32 @@
 (use format typed-records) ;; RADT => purpose of json format??
 
 (declare (unit rmt))
 (declare (uses api))
 (declare (uses http-transport))
+(declare (uses commonmod))
 (declare (uses dbfile))
+;; (declare (uses dbmemmod))
+(declare (uses dbmod))
+(declare (uses tcp-transportmod))
 (include "common_records.scm")
 ;; (declare (uses rmtmod))
 
+;; used by http-transport
 (import dbfile) ;; rmtmod)
 
+(import commonmod
+;; 	dbmemmod
+	dbfile
+	dbmod
+	tcp-transportmod)
+
+;; http - use the old http + in /tmp db
+;; tcp  - use tcp transport with inmem db
+;; nfs  - use direct to disk access (read-only)
+;;
+(define rmt:transport-mode (make-parameter 'http))
 ;;
 ;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!!
 ;;
 
 ;; generate entries for ~/.megatestrc with the following
@@ -42,61 +58,63 @@
 ;;======================================================================
 
 ;; if a server is either running or in the process of starting call client:setup
 ;; else return #f to let the calling proc know that there is no server available
 ;;
-(define (rmt:get-connection-info areapath #!key (area-dat #f)) ;; TODO: push areapath down.
-  (let* ((runremote (or area-dat *runremote*))
-	 (cinfo     (if (remote? runremote)
-			(remote-conndat runremote)
+(define (rmt:get-connection-info areapath runremote) ;; TODO: push areapath down.
+  (let* ((cinfo     (if (and (remote? runremote)
+			     (remote-api-url runremote)) ;; we have a connection
+			runremote
 			#f)))
-	  (if cinfo
-	      cinfo
-	      (if (server:check-if-running areapath)
-		  (client:setup areapath)
-		  #f))))
+    (if cinfo
+	cinfo
+	(if (server:check-if-running areapath)
+	    (client:setup areapath runremote)
+	    #f))))
 
 (define (rmt:on-homehost? runremote)
   (let* ((hh-dat (remote-hh-dat runremote)))
     (if (pair? hh-dat)
 	(cdr hh-dat)
 	(begin
 	  (debug:print-info 0 *default-log-port* "hh-dat="hh-dat)
 	  #f))))
 
+(define (make-and-init-remote areapath)
+   (case (rmt:transport-mode)
+     ((http)(make-remote))
+     ((tcp) (tt:make-remote areapath))
+     (else #f)))
 
 ;;======================================================================
 
 (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id
 
 ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
 ;;
 (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected
-
-  #;(common:telemetry-log (conc "rmt:"(->string cmd))
-                        payload: `((rid . ,rid)
-                                   (params . ,params)))
-
+  (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.")
   (if (> attemptnum 2)
       (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum))
-    
+  
   (cond
    ((> attemptnum 2) (thread-sleep! 0.05))
    ((> attemptnum 10) (thread-sleep! 0.5))
    ((> attemptnum 20) (thread-sleep! 1)))
+
+  ;; I'm turning this off, it may make sense to move it
+  ;; into http-transport-handler
   (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15)))  
-    (begin (server:run *toppath*) (thread-sleep! 3))) 
-  
-  
-  ;;DOT digraph megatest_state_status {
-  ;;DOT   ranksep=0;
-  ;;DOT   // rankdir=LR;
-  ;;DOT   node [shape="box"];
-  ;;DOT "rmt:send-receive" -> MUTEXLOCK;
-  ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; }
-  ;; do all the prep locked under the rmt-mutex
-  (mutex-lock! *rmt-mutex*)
+      (begin
+	(debug:print 0 *default-log-port* "ERROR: can't connect to server, trying to start a server.")
+	(case (rmt:transport-mode)
+	  ((http)
+	   (server:run *toppath*)
+	   (thread-sleep! 3))
+	  (else
+	   (thread-sleep! 1) ;; for tcp the server is started by routines in tcp-transportmod. For nfs there is no server
+	   ))))
   
   ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
   ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
   ;; 3. do the query, if on homehost use local access
   ;;
@@ -103,249 +121,192 @@
   (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
          (areapath      *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
 	 (runremote     (or area-dat
 			    *runremote*))
          (attemptnum    (+ 1 attemptnum))
-	 (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*)))
-
-    ;; DOT INIT_RUNREMOTE; // leaving off - doesn't really add to the clarity
-    ;; DOT MUTEXLOCK -> INIT_RUNREMOTE [label="no remote?"];
-    ;; DOT INIT_RUNREMOTE -> MUTEXLOCK;
-    ;; ensure we have a record for our connection for given area
-    (if (not runremote)                   ;; can remove this one. should never get here.         
-	(begin
-	  (set! *runremote* (make-remote))
-          (let* ((server-info (remote-server-info *runremote*))) 
-            (if server-info
-		(begin
-			(remote-server-url-set! *runremote* (server:record->url server-info))
-			(remote-server-id-set! *runremote* (server:record->id server-info)))))  
-	  (set! runremote   *runremote*))) ;; new runremote will come from this on next iteration
-    
-    ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity
-    ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"];
-    ;; DOT SET_HOMEHOST -> MUTEXLOCK;
-    ;; ensure we have a homehost record
-    (if (not (pair? (remote-hh-dat runremote)))  ;; not on homehost
-	(thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
-	(let ((hh-data (server:choose-server areapath 'homehost)))
-	  (remote-hh-dat-set! runremote (or hh-data (cons #f #f)))))
-    
-    ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
-    (cond
-     #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds
-      (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.")
-      (set! *runremote* #f)
-      ;; BUG: close-connections should go here?
-      (mutex-unlock! *rmt-mutex*)
-      (rmt:send-receive cmd rid params attemptnum: 1 area-dat: area-dat))
-     
-     ;;DOT EXIT;
-     ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" }
-     ;; give up if more than 150 attempts
-     ((> attemptnum 150)
-      (debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. Giving up.")
-      (exit 1))
-
-     ;;DOT CASE2 [label="local\nreadonly\nquery"];
-     ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2}
-     ;;DOT CASE2 -> "rmt:open-qry-close-locally";
-     ;; readonly mode, read request-  handle it - case 2
-     ((and readonly-mode
-           (member cmd api:read-only-queries)) 
-      (mutex-unlock! *rmt-mutex*)
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2")
-      (rmt:open-qry-close-locally cmd 0 params)
-      )
-
-     ;;DOT CASE3 [label="write in\nread-only mode"];
-     ;;DOT MUTEXLOCK -> CASE3 [label="readonly\nmode?"]; {rank=same "case 3" CASE3}
-     ;;DOT CASE3 -> "#f";
-     ;; readonly mode, write request.  Do nothing, return #f
-     (readonly-mode (extras-readonly-mode *rmt-mutex* *default-log-port* cmd params))
-
-     ;; This block was for pre-emptively resetting the connection if there had been no communication for some time.
-     ;; I don't think it adds any value. If the server is not there, just fail and start a new connection.
-     ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout)
-     ;;
-     ;;DOT CASE4 [label="reset\nconnection"];
-     ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4}
-     ;;DOT CASE4 -> "rmt:send-receive";
-     ;; reset the connection if it has been unused too long
-     ((and runremote
-           (remote-conndat runremote)
-	   (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on
-	      (+ (http-transport:server-dat-get-last-access (remote-conndat runremote))
-		 (remote-server-timeout runremote))))
-      (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.")
-      (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection.
-      (http-transport:close-connections area-dat: runremote)
-      (mutex-unlock! *rmt-mutex*)
-      (rmt:send-receive cmd rid params attemptnum: attemptnum))
-     
-     ;;DOT CASE5 [label="local\nread"];
-     ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5};
-     ;;DOT CASE5 -> "rmt:open-qry-close-locally";
-
-     ;; on homehost and this is a read
-     ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required
-	   (rmt:on-homehost? runremote)
-           (member cmd api:read-only-queries))   ;; this is a read
-      (mutex-unlock! *rmt-mutex*)
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
-      (rmt:open-qry-close-locally cmd 0 params))
-
-     ;;DOT CASE6 [label="init\nremote"];
-     ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6};
-     ;;DOT CASE6 -> "rmt:send-receive";
-     ;; on homehost and this is a write, we already have a server, but server has died
-
-     ;; reinstate this keep-alive section but inject a time condition into the (add ...
-     
-     #;((and (cdr (remote-hh-dat runremote))           ;; on homehost
-           (not (member cmd api:read-only-queries))  ;; this is a write
-           (remote-server-url runremote)             ;; have a server
-           (not (server:ping (remote-server-url runremote) (remote-server-id runremote))))  ;; server has died. NOTE: this is not a cheap call! Need better approach.
-      (debug:print 0 *default-log-port* "WARNING: server appears to have died, trying to reconnect, case 6")
-      (http-transport:close-connections area-dat: runremote) ;; make sure to clean up
-      (set! *runremote* (make-remote))
-      (let* ((server-info (remote-server-info *runremote*))) 
-            (if server-info
-		(begin
-		  (remote-server-url-set! *runremote* (server:record->url server-info))
-                  (remote-server-id-set! *runremote* (server:record->id server-info)))))
-      (remote-force-server-set! runremote (common:force-server?))
-      (mutex-unlock! *rmt-mutex*)
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  6")
-      (rmt:send-receive cmd rid params attemptnum: attemptnum))
-
-     ;;DOT CASE7 [label="homehost\nwrite"];
-     ;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7};
-     ;;DOT CASE7 -> "rmt:open-qry-close-locally";
-     ;; on homehost and this is a write, we already have a server
-     ((and (not (remote-force-server runremote))     ;; honor forced use of server, i.e. server NOT required
-	   (cdr (remote-hh-dat runremote))           ;; on homehost
-           (not (member cmd api:read-only-queries))  ;; this is a write
-           (remote-server-url runremote))            ;; have a server
-      (mutex-unlock! *rmt-mutex*)
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
-      (rmt:open-qry-close-locally cmd 0 params))
-
-     ;;DOT CASE8 [label="force\nserver"];
-     ;;DOT MUTEXLOCK -> CASE8 [label="server not required,\nhave homehost info,\nno connection yet,\nnot a read-only query"]; {rank=same "case 8" CASE8};
-     ;;DOT CASE8 -> "rmt:open-qry-close-locally";
-     ;;  on homehost, no server contact made and this is a write, passively start a server 
-     ((and (not (remote-force-server runremote))     ;; honor forced use of server, i.e. server NOT required
-	   (cdr (remote-hh-dat runremote))           ;; have homehost
-           (not (remote-server-url runremote))       ;; no connection yet
-	   (not (member cmd api:read-only-queries))) ;; not a read-only query
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
-      (let ((server-info  (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
-	(if server-info
-	    (begin
-              (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed
-              (remote-server-id-set! runremote (server:record->id server-info)))  
-	    (if (common:force-server?)
-		(server:start-and-wait *toppath*)
-		(server:kind-run *toppath*)))
+	 (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*))
+	 (testsuite     (common:get-testsuite-name))
+	 (mtexe         (common:find-local-megatest)))
+
+    (case (rmt:transport-mode)
+      ((http)(http-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode))
+      ((tcp) (tcp-transport-handler  runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe))
+      ((nfs) (nfs:transport-handler  runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe))
+      )))
+
+(define (nfs:transport-handler  runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
+  (let* ((keys     (common:get-fields *configdat*))
+	 (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath)))
+    (api:dispatch-request dbstruct cmd run-id params)))
+	 
+(define (tcp-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
+  (if (not runremote)
+      (let* ((newremote  (make-and-init-remote areapath)))
+	(set! *runremote* newremote)
+	(set! runremote newremote)))
+  (let* ((dbfname (conc (dbfile:run-id->dbnum run-id)".db"))) ;;(dbfile:run-id->path areapath run-id)))
+    (tt:handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))
+	
+(define (http-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode)
+    ;; do all the prep locked under the rmt-mutex
+  (mutex-lock! *rmt-mutex*)
+  
+  ;; ensure we have a record for our connection for given area
+  (if (not runremote)                   ;; can remove this one. should never get here.         
+      (begin
+	(set! *runremote* (make-and-init-remote areapath))
+        (let* ((server-info (remote-server-info *runremote*))) 
+          (if server-info
+	      (begin
+		(remote-server-url-set! *runremote* (server:record->url server-info))
+		(remote-server-id-set! *runremote* (server:record->id server-info)))))  
+	(set! runremote   *runremote*))) ;; new runremote will come from this on next iteration
+
+  ;; ensure we have a homehost record
+  (if (or (not (pair? (remote-hh-dat runremote)))  ;; not on homehost
+	  (not (cdr (remote-hh-dat runremote))))   ;; not on homehost
+      (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
+      (let ((hh-data (server:choose-server areapath 'homehost)))
+	(remote-hh-dat-set! runremote (or hh-data (cons #f #f)))))
+  
+  (cond
+   ;; give up if more than 150 attempts
+   ((> attemptnum 150)
+    (debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. Giving up.")
+    (exit 1))
+
+   ;; readonly mode, read request-  handle it - case 2
+   ((and readonly-mode
+         (member cmd api:read-only-queries)) 
+    (mutex-unlock! *rmt-mutex*)
+    (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2")
+    (rmt:open-qry-close-locally cmd 0 params)
+    )
+
+   ;; readonly mode, write request.  Do nothing, return #f
+   (readonly-mode (extras-readonly-mode *rmt-mutex* *default-log-port* cmd params))
+
+   ;; This block was for pre-emptively resetting the connection if there had been no communication for some time.
+   ;; I don't think it adds any value. If the server is not there, just fail and start a new connection.
+   ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout)
+   ;;
+   ;; reset the connection if it has been unused too long
+   ((and runremote
+         (remote-api-url runremote)
+	 (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on
+	    (+ (remote-last-access runremote)
+	       (remote-server-timeout runremote))))
+    (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses in " (remote-server-timeout runremote) " seconds, forcing new connection.")
+    (http-transport:close-connections runremote)
+    ;; moving this setting of runremote conndat to #f to inside the http-transport:close-connections
+    ;; (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection.
+    (mutex-unlock! *rmt-mutex*)
+    (rmt:send-receive cmd rid params attemptnum: attemptnum))
+   
+   ;; on homehost and this is a read
+   ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required
+	 (rmt:on-homehost? runremote)
+         (member cmd api:read-only-queries))   ;; this is a read
+    (mutex-unlock! *rmt-mutex*)
+    (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
+    (rmt:open-qry-close-locally cmd 0 params))
+
+   ;; on homehost and this is a write, we already have a server
+   ((and (not (remote-force-server runremote))     ;; honor forced use of server, i.e. server NOT required
+	 (cdr (remote-hh-dat runremote))           ;; on homehost
+         (not (member cmd api:read-only-queries))  ;; this is a write
+         (remote-server-url runremote))            ;; have a server (needed to sync written data back)
+    (mutex-unlock! *rmt-mutex*)
+    (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
+    (rmt:open-qry-close-locally cmd 0 params))
+
+   ;;  on homehost, no server contact made and this is a write, passively start a server 
+   ((and (not (remote-force-server runremote))     ;; honor forced use of server, i.e. server NOT required
+	 (cdr (remote-hh-dat runremote))           ;; have homehost
+         (not (remote-server-url runremote))       ;; no connection yet
+	 (not (member cmd api:read-only-queries))) ;; not a read-only query
+    (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
+    (let ((server-info  (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
+      (if server-info
+	  (begin
+            (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed
+            (remote-server-id-set! runremote (server:record->id server-info)))  
+	  (if (common:force-server?)
+	      (server:start-and-wait *toppath*)
+	      (server:kind-run *toppath*)))
       (remote-force-server-set! runremote (common:force-server?))
       (mutex-unlock! *rmt-mutex*)
       (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8.1")
       (rmt:open-qry-close-locally cmd 0 params)))
 
-     ;;DOT CASE9 [label="force server\nnot on homehost"];
-     ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9};
-     ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive";
-     ((or (and (remote-force-server runremote)              ;; we are forcing a server and don't yet have a connection to one
-	       (not (remote-conndat runremote)))
-	  (and (not (cdr (remote-hh-dat runremote)))        ;; not on a homehost 
-	       (not (remote-conndat runremote))))           ;; and no connection
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote))
-      (mutex-unlock! *rmt-mutex*)
-      (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up?
-	  (server:start-and-wait *toppath*))
-      (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http
-      (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as
-
-     ;;DOT CASE10 [label="on homehost"];
-     ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10};
-     ;;DOT CASE10 -> "rmt:open-qry-close-locally";
-     ;; all set up if get this far, dispatch the query
-     ((and (not (remote-force-server runremote))
-	   (cdr (remote-hh-dat runremote))) ;; we are on homehost
-      (mutex-unlock! *rmt-mutex*)
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 10")
-      (rmt:open-qry-close-locally cmd (if rid rid 0) params))
-
-     ;;DOT CASE11 [label="send_receive"];
-     ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11};
-     ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"];
-     ;;DOT CASE11 -> "RESULT" [label="call succeeded"];
-     ;; not on homehost, do server query
-     (else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid)))))
-    ;;DOT }
-
-;; No Title 
-;; Error: (vector-ref) out of range
-;; #(#<condition: (exn type)> (#("db.scm:3740: regex#regexp" #f #f) #("db.scm:3739: regex#string-substitute" #f #f) #("db.scm:3738: base64#base64-decode" #f #f) #("db.scm:3737: z3#z3:decode-buffer" #f #f) #("db.scm:3736: with-input-from-string" #f #f) #("db.scm:3741: s11n#deserialize" #f #f) #("api.scm:374: api:execute-requests" #f #f) #("api.scm:139: call-with-current-continuation" #f #f) #("api.scm:139: with-exception-handler" #f #f) #("api.scm:139: ##sys#call-with-values" #f #f) #("api.scm:158: string->symbol" #f #f) #("api.scm:160: current-milliseconds" #f #f) #("api.scm:161: dbr:dbstruct-read-only" #f #f) #("api.scm:139: k15" #f #f) #("api.scm:139: g19" #f #f) #("api.scm:142: get-call-chain" #f #f)) #("get-test-info-by-id" (1102 507299)))
-;; 6
-;; 
-;; 	Call history:
-;; 
-;; 	http-transport.scm:306: thread-terminate!	  
-;; 	http-transport.scm:307: debug:print-info	  
-;; 	common_records.scm:235: debug:debug-mode	  
-;; 	rmt.scm:259: k587	  
-;; 	rmt.scm:259: g591	  
-;; 	rmt.scm:276: http-transport:server-dat-update-last-access	  
-;; 	http-transport.scm:364: current-seconds	  
-;; 	rmt.scm:282: debug:print-info	  
-;; 	common_records.scm:235: debug:debug-mode	  
-;; 	rmt.scm:283: mutex-unlock!	  
-;; 	rmt.scm:287: extras-transport-succeded	  	<--
-;; +-----------------------------------------------------------------------------+
-;; | Exit Status    : 70  
-;;  
+   ;;DOT CASE9 [label="force server\nnot on homehost"];
+   ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9};
+   ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive";
+   ((or (and (remote-force-server runremote)              ;; we are forcing a server and don't yet have a connection to one
+	     (not (remote-api-url runremote)))
+	(and (not (cdr (remote-hh-dat runremote)))        ;; not on a homehost 
+	     (not (remote-api-url runremote))))           ;; and no connection
+    (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " runremote: " (remote->alist runremote))
+    (mutex-unlock! *rmt-mutex*)
+    (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up?
+	(server:start-and-wait *toppath*))
+    ;; was: (remote-conndat-set! runremote (rmt:get-connection-info *toppath* runremote)) ;; calls client:setup which calls client:setup-http
+    (set! runremote (rmt:get-connection-info *toppath* runremote)) ;; calls client:setup which calls client:setup-http
+    (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as
+
+   ;;DOT CASE10 [label="on homehost"];
+   ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10};
+   ;;DOT CASE10 -> "rmt:open-qry-close-locally";
+   ;; all set up if get this far, dispatch the query
+   ((and (not (remote-force-server runremote))
+	 (cdr (remote-hh-dat runremote))) ;; we are on homehost
+    (mutex-unlock! *rmt-mutex*)
+    (debug:print-info 12 *default-log-port* "rmt:send-receive, case 10")
+    (rmt:open-qry-close-locally cmd (if rid rid 0) params))
+
+   ;;DOT CASE11 [label="send_receive"];
+   ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11};
+   ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"];
+   ;;DOT CASE11 -> "RESULT" [label="call succeeded"];
+   ;; not on homehost, do server query
+   (else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid))))
 
 ;; bunch of small functions factored out of send-receive to make debug easier
 ;;
 
 (define (extras-case-11 *default-log-port* runremote cmd params attemptnum rid)
   ;; (mutex-unlock! *rmt-mutex*)
   (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9")
   ;; (mutex-lock! *rmt-mutex*)
-  (let* ((conninfo (remote-conndat runremote))
-	 (dat-in      (case (remote-transport runremote)
-		     ((http) (condition-case ;; handling here has
-					     ;; caused a lot of
-					     ;; problems. However it
-					     ;; is needed to deal with
-					     ;; attemtped
-					     ;; communication to
-					     ;; servers that have gone
-					     ;; away
-			      (http-transport:client-api-send-receive 0 conninfo cmd params)
-                              ((servermismatch)  (vector #f "Server id mismatch" ))
-			      ((commfail)(vector #f "communications fail"))
-			      ((exn)(vector #f "other fail" (print-call-chain)))))
-		     (else
-		      (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported")
-		      (exit))))
+  (let* (;; (conninfo (remote-conndat runremote))
+	 (dat-in  (condition-case ;; handling here has
+			     ;; caused a lot of
+			     ;; problems. However it
+			     ;; is needed to deal with
+			     ;; attemtped
+			     ;; communication to
+			     ;; servers that have gone
+			     ;; away
+			     (http-transport:client-api-send-receive 0 runremote cmd params)
+			     ;; (http-transport:client-api-send-receive 0 conninfo cmd params runremote)
+			     ((servermismatch)  (vector #f "Server id mismatch" ))
+			     ((commfail)(vector #f "communications fail"))
+			     ((exn)(vector #f "other fail" (print-call-chain)))))
 	 (dat      (if (and (vector? dat-in) ;; ... check it is a correct size
 			    (> (vector-length dat-in) 1))
 		       dat-in
 		       (vector #f (conc "communications fail (type 2), dat-in=" dat-in))))
 	 (success  (if (vector? dat) (vector-ref dat 0) #f))
 	 (res      (if (vector? dat) (vector-ref dat 1) #f)))
-    (if (and (vector? conninfo) (< 5 (vector-length conninfo)))
-	(http-transport:server-dat-update-last-access conninfo) ;; refresh access time
+    (if (and (remote? runremote)
+	     (remote-api-url runremote)) ;; (and (vector? conninfo) (< 5 (vector-length conninfo)))
+	(remote-last-access-set! runremote (current-seconds)) ;; refresh access time
 	(begin
-	  (debug:print 0 *default-log-port* "INFO: Should not get here! conninfo=" conninfo)
-	  (set! conninfo #f)
-	  (remote-conndat-set! *runremote* #f) ;; NOTE: *runremote* is global copy of runremote. Purpose: factor out global.
-	  (http-transport:close-connections  area-dat: runremote)))
-    (debug:print-info 13 *default-log-port* "rmt:send-receive, case  9. conninfo=" conninfo " dat=" dat " runremote = " runremote)
+	  (debug:print 0 *default-log-port* "INFO: Should not get here! runremote="(remote->alist runremote))
+	  ;; (set! conninfo #f)
+	  (http-transport:close-connections runremote)))
+    (debug:print-info 13 *default-log-port* "rmt:send-receive, case  9. runremote=" (remote->alist runremote) " dat=" dat " runremote = " runremote)
     (mutex-unlock! *rmt-mutex*)
     (if success ;; success only tells us that the transport was
 	;; successful, have to examine the data to see if
 	;; there was a detected issue at the other end
 	(extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd)
@@ -392,11 +353,11 @@
     res))
 
 (define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5))
   (let* ((qry-is-write    (not (member cmd api:read-only-queries)))
 	 (db-file-path    (db:dbfile-path)) ;;  0))
-	 (dbstructs-local (db:setup #t))  ;; make-dbr:dbstruct path:  dbdir local: #t)))
+	 (dbstructs-local (db:setup #t))
 	 (read-only       (not (file-write-access? db-file-path)))
 	 (start           (current-milliseconds))
 	 (resdat          (if (not (and read-only qry-is-write))
 			      (let ((v (api:execute-requests dbstructs-local (vector (symbol->string cmd) params))))
 			;;	(handle-exceptions ;; there has been a long history of receiving strange errors from values returned by the client when things go wrong..
@@ -432,18 +393,13 @@
 		(mutex-lock! *db-multi-sync-mutex*)
 /		(set! *db-last-access* start-time)  ;; THIS IS PROBABLY USELESS? (we are on a client)
                 (mutex-unlock! *db-multi-sync-mutex*)))))
     res))
 
-(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)
+(define (rmt:send-receive-no-auto-client-setup runremote cmd run-id params)
   (let* ((run-id   (if run-id run-id 0))
-	 (res  	   ;; (handle-exceptions
-		   ;;     exn
-		   ;;   (begin
-		   ;;     (print "transport failed. exn=" exn)
-		   ;;     #f)
-		     (http-transport:client-api-send-receive run-id connection-info cmd params))) ;; )
+	 (res  	   (http-transport:client-api-send-receive run-id runremote cmd params)))
     (if (and res (vector-ref res 0))
 	(vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!!
 	#f)))
 
 ;;======================================================================
@@ -470,15 +426,12 @@
   (rmt:send-receive 'login run-id (list *toppath* megatest-version (client:get-signature))))
 
 ;; This login does no retries under the hood - it acts a bit like a ping.
 ;; Deprecated for nmsg-transport.
 ;;
-(define (rmt:login-no-auto-client-setup connection-info)
-  (case *transport-type* ;; run-id of 0 is just a placeholder
-    ((http)(rmt:send-receive-no-auto-client-setup connection-info 'login 0 (list *toppath* megatest-version (client:get-signature))))
-    ;;((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info 'login (list *toppath* megatest-version run-id *my-client-signature*)))
-    ))
+(define (rmt:login-no-auto-client-setup runremote)
+  (rmt:send-receive-no-auto-client-setup runremote 'login 0 (list *toppath* megatest-version (client:get-signature))))
 
 ;; hand off a call to one of the db:queries statements
 ;; added run-id to make looking up the correct db possible 
 ;;
 (define (rmt:general-call stmtname run-id . params)
@@ -519,13 +472,14 @@
 ;;======================================================================
 ;;  K E Y S 
 ;;======================================================================
 
 ;; These require run-id because the values come from the run!
+;; however the query must still apply to main.db
 ;;
 (define (rmt:get-key-val-pairs run-id)
-  (rmt:send-receive 'get-key-val-pairs run-id (list run-id)))
+  (rmt:send-receive 'get-key-val-pairs #f (list run-id)))
 
 (define (rmt:get-keys)
   (if *db-keys* *db-keys* 
      (let ((res (rmt:send-receive 'get-keys #f '())))
        (set! *db-keys* res)
@@ -548,11 +502,11 @@
 (define (rmt:get-targets)
   (rmt:send-receive 'get-targets #f '()))
 
 (define (rmt:get-target run-id)
   (assert (number? run-id) "FATAL: Run id required.")
-  (rmt:send-receive 'get-target run-id (list run-id)))
+  (rmt:send-receive 'get-target #f (list run-id)))
 
 (define (rmt:get-run-times runpatt targetpatt)
   (rmt:send-receive 'get-run-times #f (list runpatt targetpatt ))) 
 
 
@@ -567,20 +521,21 @@
 
 (define (rmt:get-test-id run-id testname item-path)
   (assert (number? run-id) "FATAL: Run id required.")
   (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))
 
-;; run-id is NOT used
-;;
 (define (rmt:get-test-info-by-id run-id test-id)
   (if (number? test-id)
       (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
       (begin
 	(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
 	(print-call-chain (current-error-port))
 	#f)))
 
+(define (rmt:get-test-state-status-by-id run-id test-id)
+  (rmt:send-receive 'get-test-state-status-by-id run-id (list run-id test-id)))
+
 (define (rmt:test-get-rundir-from-test-id run-id test-id)
   (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))
 
 (define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f))
   (assert (number? run-id) "FATAL: Run id required.")
@@ -829,10 +784,13 @@
 
 (define (rmt:get-run-state run-id)
   (assert (number? run-id) "FATAL: Run id required.")
   (rmt:send-receive 'get-run-state #f (list run-id)))
 
+(define (rmt:get-run-state-status run-id)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-run-state-status #f (list run-id)))
 
 (define (rmt:set-run-status run-id run-status #!key (msg #f))
   (assert (number? run-id) "FATAL: Run id required.")
   (rmt:send-receive 'set-run-status #f (list run-id run-status msg)))
 
@@ -1044,23 +1002,36 @@
   (rmt:send-receive 'test-set-archive-block-id run-id (list run-id test-id archive-block-id)))
 
 (define (rmt:test-get-archive-block-info archive-block-id)
   (rmt:send-receive 'test-get-archive-block-info #f (list archive-block-id)))
 
-
 (define (rmtmod:calc-ro-mode runremote *toppath*)
-  (if (and runremote
-	   (remote-ro-mode-checked runremote))
-      (remote-ro-mode runremote)
-      (let* ((mtcfgfile  (conc *toppath* "/megatest.config"))
-	     (ro-mode (not (file-write-access? mtcfgfile)))) ;; TODO: use dbstruct or runremote to figure this out in future
-	(if runremote
-	    (begin
-	      (remote-ro-mode-set! runremote ro-mode)
-	      (remote-ro-mode-checked-set! runremote #t)
-	      ro-mode)
-	    ro-mode))))
+  (case (rmt:transport-mode)
+    ((http)
+     (if (and runremote
+	      (remote-ro-mode-checked runremote))
+	 (remote-ro-mode runremote)
+	 (let* ((mtcfgfile  (conc *toppath* "/megatest.config"))
+		(ro-mode (not (file-write-access? mtcfgfile)))) ;; TODO: use dbstruct or runremote to figure this out in future
+	   (if runremote
+	       (begin
+		 (remote-ro-mode-set! runremote ro-mode)
+		 (remote-ro-mode-checked-set! runremote #t)
+		 ro-mode)
+	       ro-mode))))
+    ((tcp)
+     (if (and runremote
+	      (tt-ro-mode-checked runremote))
+	 (tt-ro-mode runremote)
+	 (let* ((mtcfgfile  (conc *toppath* "/megatest.config"))
+		(ro-mode (not (file-write-access? mtcfgfile)))) ;; TODO: use dbstruct or runremote to figure this out in future
+	   (if runremote
+	       (begin
+		 (tt-ro-mode-set! runremote ro-mode)
+		 (tt-ro-mode-checked-set! runremote #t)
+		 ro-mode)
+	       ro-mode))))))
 
 (define (extras-readonly-mode rmt-mutex log-port cmd params)
   (mutex-unlock! rmt-mutex)
   (debug:print-info 12 log-port "rmt:send-receive, case 3")
   (debug:print 0 log-port "WARNING: write transaction requested on a readonly area.  cmd="cmd" params="params)
@@ -1067,13 +1038,12 @@
   #f)
 
 (define (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params)
   (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
   (mutex-lock! *rmt-mutex*)
-  (remote-conndat-set!    runremote #f)
-  (http-transport:close-connections area-dat: runremote)
-  (remote-server-url-set! runremote #f)
+  (http-transport:close-connections runremote)
+  ;; (remote-server-url-set! runremote #f)
   (mutex-unlock! *rmt-mutex*)
   (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9.1")
   (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
   
 (define (extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd)
@@ -1096,11 +1066,11 @@
 						 ;; want to ease off
 						 ;; the queries
       (let ((wait-delay (+ attemptnum (* attemptnum 10))))
 	(debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.")
 	(mutex-lock! *rmt-mutex*)
-	(http-transport:close-connections area-dat: runremote)
+	(http-transport:close-connections runremote)
 	(set! *runremote* #f) ;; force starting over
 	(mutex-unlock! *rmt-mutex*)
 	(thread-sleep! wait-delay)
 	(rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
       res)) ;; All good, return res

Index: runconfig.scm
==================================================================
--- runconfig.scm
+++ runconfig.scm
@@ -22,10 +22,12 @@
 
 (use format directory-utils)
 
 (declare (unit runconfig))
 (declare (uses common))
+(declare (uses commonmod))
+(import commonmod)
 
 (include "common_records.scm")
 
 (define (runconfig:read fname target environ-patt)
   (let ((ht (make-hash-table)))

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -22,10 +22,11 @@
      sxml-modifications matchable)
 
 (declare (unit runs))
 (declare (uses db))
 (declare (uses common))
+(declare (uses commonmod))
 (declare (uses items))
 (declare (uses runconfig))
 (declare (uses tests))
 (declare (uses server))
 (declare (uses mt))
@@ -37,10 +38,12 @@
 (include "db_records.scm")
 (include "run_records.scm")
 (include "test_records.scm")
 
 ;; (include "debugger.scm")
+
+(import commonmod)
 
 ;; use this struct to facilitate refactoring
 ;;
 
 (defstruct runs:dat
@@ -1279,15 +1282,23 @@
       (list hed tal reg reruns))
      
      ;; If no resources are available just kill time and loop again
      ;;
      ((not have-resources) ;; simply try again after waiting a second
-      (if (runs:lownoise "no resources" 60)
+      (if (runs:lownoise "no resources" 600)
 	  (debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ..."))
+
       ;; Have gone back and forth on this but db starvation is an issue.
       ;; wait one second before looking again to run jobs.
-      (thread-sleep! 0.25)
+      ;; (thread-sleep! 0.25)
+      
+      ;; new logic.
+      ;; If it has been more than 10 seconds since we were last here don't wait at all
+      ;; otherwise sleep 2 seconds to give db a rest and let dashboard read data
+      (if (runs:lownoise "frequent-no-resources" 10)
+	  (thread-sleep! 0.25) ;; no significant delay
+	  (thread-sleep! 2))
       ;; could have done hed tal here but doing car/cdr of newtal to rotate tests
       (list (car newtal)(cdr newtal) reg reruns))
      
      ;; This is the final stage, everything is in place so launch the test
      ;;
@@ -1775,11 +1786,11 @@
 			(last-jobs-check-time  (runs:dat-last-jobs-check-time runsdat))
 			(should-check-jobs     (match can-run-more-tests
 						 ((can-run-more-flag num-running nr-in-jobgroup max-concurrent-jobs . params)
 						  (if (< (- max-concurrent-jobs num-running) 25)
 						      (begin
-							(debug:print-info 0 *default-log-port*
+							(debug:print-info 2 *default-log-port*
 									  "less than 20 jobs headroom, ("max-concurrent-jobs
 									  "-"num-running")>20. Forcing prelaunch check.")
 							#t)
 						      #f))
 						 (else #f)))) ;; no record yet
@@ -1855,11 +1866,12 @@
                                    (newtestname (db:test-make-full-name hed my-item-path)))    ;; test names are unique on testname/item-path
                               (tests:testqueue-set-items!     new-test-record #f)
                               (tests:testqueue-set-itemdat!   new-test-record my-itemdat)
                               (tests:testqueue-set-item_path! new-test-record my-item-path)
                               (hash-table-set! test-records newtestname new-test-record)
-                              (set! tal (append tal (list newtestname)))))  ;; since these are itemized create new test names testname/itempath
+			      ;; BUG: This next line sucks up a lot of horsepower
+			      (set! tal (append tal (list newtestname)))))  ;; since these are itemized create new test names testname/itempath
                           items-in-testpatt)))
           
           
 
 	  ;; At this point we have possibly added items to tal but all must be handed off to 

Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -65,21 +65,10 @@
 ;;======================================================================
 
 ;; Call this to start the actual server
 ;;
 
-;; all routes though here end in exit ...
-;;
-;; start_server
-;;
-(define (server:launch run-id transport-type)
-  (case transport-type
-    ((http)(http-transport:launch))
-    ;;((nmsg)(nmsg-transport:launch run-id))
-    ;;((rpc)  (rpc-transport:launch run-id))
-    (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type))))
-
 ;;======================================================================
 ;; S E R V E R   U T I L I T I E S 
 ;;======================================================================
 
 ;; Get the transport
@@ -112,39 +101,32 @@
   (if *server-id* *server-id*
       (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic
         (set! *server-id* sig)
         *server-id*)))
 
-;; When using zmq this would send the message back (two step process)
-;; with spiffy or rpc this simply returns the return data to be returned
-;; 
-(define (server:reply return-addr query-sig success/fail result)
-  (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result)
-  ;; (send-message pubsock target send-more: #t)
-  ;; (send-message pubsock 
-  (case (server:get-transport)
-    ((rpc)  (db:obj->string (vector success/fail query-sig result)))
-    ((http) (db:obj->string (vector success/fail query-sig result)))
-    ((fs)   result)
-    (else 
-     (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
-     result)))
+;; ;; When using zmq this would send the message back (two step process)
+;; ;; with spiffy or rpc this simply returns the return data to be returned
+;; ;; 
+;; (define (server:reply return-addr query-sig success/fail result)
+;;   (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result)
+;;   ;; (send-message pubsock target send-more: #t)
+;;   ;; (send-message pubsock 
+;;   (case (server:get-transport)
+;;     ((rpc)  (db:obj->string (vector success/fail query-sig result)))
+;;     ((http) (db:obj->string (vector success/fail query-sig result)))
+;;     ((fs)   result)
+;;     (else 
+;;      (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
+;;      result)))
 
 ;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
 ;; if the target-host is set 
 ;; try running on that host
 ;;   incidental: rotate logs in logs/ dir.
 ;;
 (define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
-  (let* (;; (curr-host   (get-host-name))
-         ;; (attempt-in-progress (server:start-attempted? areapath))
-         ;; (dot-server-url (server:check-if-running areapath))
-	 ;; (curr-ip     (server:get-best-guess-address curr-host))
-	 ;; (curr-pid    (current-process-id))
-	 ;; (homehost    (server:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
-	 ;; (target-host (car homehost))
-	 (testsuite   (common:get-testsuite-name))
+  (let* ((testsuite   (common:get-testsuite-name))
 	 (logfile     (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
 	 (profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
 			   ""))
 	 (cmdln (conc (common:get-megatest-exe)
 		      " -server - ";; (or target-host "-")
@@ -190,46 +172,48 @@
   (let ((server-rx    (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id
         (dbprep-rx    (regexp "^SERVER: dbprep"))
         (dbprep-found 0)
 	(bad-dat      (list #f #f #f #f #f)))
     (handle-exceptions
-	exn
-      (begin
-	(debug:print-info 0 *default-log-port* "Unable to get server info from " logf ", exn=" exn)
-	bad-dat) ;; no idea what went wrong, call it a bad server
-      (with-input-from-file
-	  logf
-	(lambda ()
-	  (let loop ((inl  (read-line))
-		     (lnum 0))
-	    (if (not (eof-object? inl))
-		(let ((mlst (string-match server-rx inl))
-                      (dbprep (string-match dbprep-rx inl)))
-                  (if dbprep (set! dbprep-found 1))
-		  (if (not mlst)
-		      (if (< lnum 500) ;; give up if more than 500 lines of server log read
-			  (loop (read-line)(+ lnum 1))
-			  (begin 
+     exn
+     (begin
+       ;; WARNING: this is potentially dangerous to blanket ignore the errors
+       (if (file-exists? logf)
+	   (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn))
+       bad-dat) ;; no idea what went wrong, call it a bad server
+     (with-input-from-file
+	 logf
+       (lambda ()
+	 (let loop ((inl  (read-line))
+		    (lnum 0))
+	   (if (not (eof-object? inl))
+	       (let ((mlst (string-match server-rx inl))
+		     (dbprep (string-match dbprep-rx inl)))
+		 (if dbprep (set! dbprep-found 1))
+		 (if (not mlst)
+		     (if (< lnum 500) ;; give up if more than 500 lines of server log read
+			 (loop (read-line)(+ lnum 1))
+			 (begin 
                            (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf )
                            bad-dat))
-		      (match mlst
-			((_ host port start server-id pid)
-			 (list host
-			       (string->number port)
-			       (string->number start)
-			       server-id
-			       (string->number pid)))
-			(else
-			 (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst)
-			 bad-dat))))
-                (begin 
-                  (if dbprep-found
-                      (begin
-                         (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time))
-                         (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting?
-                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds))))
-		  bad-dat))))))))
+		     (match mlst
+			    ((_ host port start server-id pid)
+			     (list host
+				   (string->number port)
+				   (string->number start)
+				   server-id
+				   (string->number pid)))
+			    (else
+			     (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst)
+			     bad-dat))))
+	       (begin 
+		 (if dbprep-found
+		     (begin
+		       (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time))
+		       (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting?
+		     (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds))))
+		 bad-dat))))))))
 
 ;; ;; get a list of servers from the log files, with all relevant data
 ;; ;; ( mod-time host port start-time pid )
 ;; ;;
 ;; (define (server:get-list areapath #!key (limit #f))
@@ -419,11 +403,12 @@
 
 ;; oldest server alive determines host then choose random of youngest
 ;; five servers on that host
 ;;
 (define (server:get-servers-info areapath)
-  (let* ((servinfodir (conc *toppath*"/.servinfo")))
+  ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.")
+  (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo")))
     (if (not (file-exists? servinfodir))
 	(create-directory servinfodir))
     (let* ((allfiles    (glob (conc servinfodir"/*")))
 	   (res         (make-hash-table)))
       (for-each
@@ -432,15 +417,45 @@
 		(serverdat (server:logf-get-start-info f)))
 	   (match serverdat
 	     ((host port start server-id pid)
 	      (if (and host port start server-id pid)
 		  (hash-table-set! res hostport serverdat)
-		  (debug:print-info 0 *default-log-port* "bad server info for "f": "serverdat)))
+		  (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat)))
 	     (else
-	      (debug:print-info 0 *default-log-port* "bad server info for "f": "serverdat)))))
+	      (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat)))))
        allfiles)
       res)))
+
+;; check the .servinfo directory, are there other servers running on this
+;; or another host?
+;;
+;; returns #t => ok to start another server
+;;         #f => not ok to start another server
+;;
+(define (server:minimal-check areapath)
+  (server:clean-up-old areapath)
+  (let* ((srvdir      (server:get-servinfo-dir areapath)) ;; (conc areapath"/.servinfo"))
+	 (servrs      (glob (conc srvdir"/*")))
+	 (thishostip  (server:get-best-guess-address (get-host-name)))
+	 (thisservrs  (glob (conc srvdir"/"thishostip":*")))
+	 (homehostinf (server:choose-server areapath 'homehost))
+	 (havehome    (car homehostinf))
+	 (wearehome   (cdr homehostinf)))
+    (debug:print-info 0 *default-log-port* thishostip", have homehost: "havehome", we are homehost: "wearehome
+		      ", numservers: "(length thisservrs))
+    (cond
+     ((not havehome) #t) ;; no homehost yet, go for it
+     ((and havehome wearehome (< (length thisservrs) 20)) #t) ;; we are home and less than 20 servers, ok to start another
+     ((and havehome (not wearehome)) #f)     ;; we are not the home host
+     ((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running
+     (else
+      (debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs)
+      #t))))
+	 
+
+(define server-last-start 0)
+
 
 ;; oldest server alive determines host then choose random of youngest
 ;; five servers on that host
 ;;
 ;; mode:
@@ -453,29 +468,47 @@
   ;;   1. sort by age ascending and ping until good
   ;; find alive rand from youngest
   ;;   1. sort by age descending
   ;;   2. take five
   ;;   3. check alive, discard if not and repeat
+  ;; first we clean up old server files
+  (assert (eq? (rmt:transport-mode) 'http) "FATAL: server:run called with rmt:transport-mode="(rmt:transport-mode))
+  (server:clean-up-old areapath)
+  (let* ((since-last (- (current-seconds) server-last-start))
+        (server-start-delay 10))     
+    (if ( < (- (current-seconds) server-last-start) 10 )
+      (begin
+        (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start))
+        (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds")
+        (thread-sleep! server-start-delay)
+      )
+      (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start))
+    )
+  )
   (let* ((serversdat  (server:get-servers-info areapath))
 	 (servkeys    (hash-table-keys serversdat))
-	 (by-time-asc (if (not (null? servkeys))
+	 (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last
 			  (sort servkeys ;; list of "host:port"
 				(lambda (a b)
 				  (>= (list-ref (hash-table-ref serversdat a) 2)
 				      (list-ref (hash-table-ref serversdat b) 2))))
 			  '())))
+    (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat)
+    (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys)
     (if (not (null? by-time-asc))
 	(let* ((oldest     (last by-time-asc))
 	       (oldest-dat (hash-table-ref serversdat oldest))
 	       (host       (list-ref oldest-dat 0))
 	       (all-valid  (filter (lambda (x)
 				     (equal? host (list-ref (hash-table-ref serversdat x) 0)))
 				   by-time-asc))
-	       (best-five  (lambda ()
-			     (if (> (length all-valid) 5)
-				 (take all-valid 5)
-				 all-valid)))
+	       (best-ten  (lambda ()
+			     (if (> (length all-valid) 11)
+				 (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out
+				 (if (> (length all-valid) 8)
+				     (drop-right all-valid 1)
+				     all-valid))))
 	       (names->dats (lambda (names)
 			      (map (lambda (x)
 				     (hash-table-ref serversdat x))
 				   names)))
 	       (am-home?    (lambda ()
@@ -483,44 +516,81 @@
 				     (bestadrs (server:get-best-guess-address currhost)))
 				(or (equal? host currhost)
 				    (equal? host bestadrs))))))
 	  (case mode
 	    ((info)
-	     (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid)
-	     (print "youngest: "(hash-table-ref serversdat (car all-valid))))
+	     (debug:print 0 *default-log-port* "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid)
+	     (debug:print 0 *default-log-port* "youngest: "(hash-table-ref serversdat (car all-valid))))
 	    ((home)     host)
 	    ((homehost) (cons host (am-home?))) ;; shut up old code
 	    ((home?)    (am-home?))
-	    ((best-five)(names->dats (best-five)))
+	    ((best-ten)(names->dats (best-ten)))
 	    ((all-valid)(names->dats all-valid))
-	    ((best)     (let* ((best-five (best-five))
-			       (len       (length best-five)))
-			  (hash-table-ref serversdat (list-ref best-five (random len)))))
+	    ((best)     (let* ((best-ten (best-ten))
+			       (len       (length best-ten)))
+			  (hash-table-ref serversdat (list-ref best-ten (random len)))))
 	    ((count)(length all-valid))
 	    (else
 	     (debug:print 0 *default-log-port* "ERROR: invalid command "mode)
 	     #f)))
 	(begin
 	  (server:run areapath)
-	  (thread-sleep! 3)
+          (set! server-last-start (current-seconds))
+	  ;; (thread-sleep! 3)
 	  (case mode
 	    ((homehost) (cons #f #f))
 	    (else	#f))))))
+
+(define (server:get-servinfo-dir areapath)
+  (let* ((spath (conc areapath"/.servinfo")))
+    (if (not (file-exists? spath))
+	(create-directory spath #t))
+    spath))
+
+(define (server:clean-up-old areapath)
+  ;; any server file that has not been touched in ten minutes is effectively dead
+  (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*"))))
+    (for-each
+     (lambda (sfile)
+       (let* ((modtime (handle-exceptions
+			   exn
+			 (begin
+			   (debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile)
+			   (current-seconds))
+			 (file-modification-time sfile))))
+	 (if (and (number? modtime)
+		  (> (- (current-seconds) modtime)
+		     600))
+	     (begin
+	       (debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.")
+	       (handle-exceptions
+		   exn
+		 (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile)
+		 (delete-file sfile))))))
+     sfiles)))
 
 ;; would like to eventually get rid of this
 ;;
 (define (common:on-homehost?)
-  (server:choose-server *toppath* 'home?))
+  (if (eq? (rmt:transport-mode) 'http)
+      (server:choose-server *toppath* 'home?)
+      #t)) ;; there is no homehost for tcp and nfs is always on home so #t should work
 
 ;; kind start up of server, wait before allowing another server for a given
 ;; area to be launched
 ;;
 (define (server:kind-run areapath)
   ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
   ;; and wait for it to be at least <server idletime> seconds old
   ;; (server:wait-for-server-start-last-flag areapath)
-  (if (< (server:choose-server areapath 'count) 10)
+  (let loop ()
+    (if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2)
+	(begin
+	  (if (common:low-noise-print 30 "our-host-load")
+	      (debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server."))
+	  (loop))))
+  (if (< (server:choose-server areapath 'count) 20)
       (server:run areapath))
   #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
       (let* ((lock-file    (conc areapath "/logs/server-start.lock")))
 	(let* ((start-flag (conc areapath "/logs/server-start-last")))
 	  (common:simple-file-lock-and-wait lock-file expire-time: 25)
@@ -538,11 +608,12 @@
     (let loop ((server-info (server:check-if-running areapath))
 	       (try-num    0))
       (if (or server-info
 	      (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available.
 	  (server:record->url server-info)
-	  (let ((num-ok (length (server:choose-server areapath 'all-valid))))
+	  (let* ( (servers (server:choose-server areapath 'all-valid))
+                (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0)))
 	    (if (and (> try-num 0)  ;; first time through simply wait a little while then try again
 		     (< num-ok 1))  ;; if there are no decent candidates for servers then try starting a new one
 		(server:run areapath))
 	    (thread-sleep! 5)
 	    (loop (server:check-if-running areapath)
@@ -555,11 +626,11 @@
 
 ;; no longer care if multiple servers are started by accident. older servers will drop off in time.
 ;;
 (define (server:check-if-running areapath) ;;  #!key (numservers "2"))
   (let* ((ns            (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed
-	 (servers       (server:choose-server areapath 'best-five))) ;; (server:get-best (server:get-list areapath))))
+	 (servers       (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath))))
     (if (or (and servers
 		 (null? servers))
 	    (not servers))
 	    ;; (and (list? servers)
 	    ;;	 (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers
@@ -587,57 +658,53 @@
   (handle-exceptions
     exn
     (begin 
       (debug:print-info 0 *default-log-port*  "Unable to get host and/or port from " servr ", exn=" exn)     
     #f)
-  (match-let (((mod-time hostname port start-time server-id pid)
+  (match-let (((hostname port start-time server-id pid)
 	       servr))
     (tasks:kill-server hostname pid))))
 
 ;; called in megatest.scm, host-port is string hostname:port
 ;;
 ;; NOTE: This is NOT called directly from clients as not all transports support a client running
 ;;       in the same process as the server.
 ;;
-(define (server:ping host-port-in server-id #!key (do-exit #f))
-  (let ((host:port (if (not host-port-in) ;; use read-dotserver to find
-		       #f ;; (server:check-if-running *toppath*)
-		;; (if (number? host-port-in) ;; we were handed a server-id
-		;; 	   (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in)))
-		;; 	     ;; (print "srec: " srec " host-port-in: " host-port-in)
-		;; 	     (if srec
-		;; 		 (conc (vector-ref srec 3) ":" (vector-ref srec 4))
-		;; 		 (conc "no such server-id " host-port-in)))
-		       host-port-in))) ;; )
-    (let* ((host-port (if host:port
-			  (let ((slst (string-split   host:port ":")))
-			    (if (eq? (length slst) 2)
-				(list (car slst)(string->number (cadr slst)))
-				#f))
-			  #f)))
-;;	   (toppath       (launch:setup)))
-      ;; (print "host-port=" host-port)
-      (if (not host-port)
-	  (begin
-	    (if host-port-in
-		(debug:print 0 *default-log-port*  "ERROR: bad host:port"))
-	    (if do-exit (exit 1))
-	    #f)
-	  (let* ((iface      (car host-port))
-		 (port       (cadr host-port))
-		 (server-dat (http-transport:client-connect iface port server-id))
-		 (login-res  (rmt:login-no-auto-client-setup server-dat)))
-	    (if (and (list? login-res)
-		     (car login-res))
-		(begin
-		  ;; (print "LOGIN_OK")
-		  (if do-exit (exit 0))
-		  #t)
-		(begin
-		  ;; (print "LOGIN_FAILED")
-		  (if do-exit (exit 1))
-		  #f)))))))
+(define (server:ping host:port server-id #!key (do-exit #f))
+  (let* ((host-port (cond
+		     ((string? host:port)
+		      (let ((slst (string-split   host:port ":")))
+			(if (eq? (length slst) 2)
+			    (list (car slst)(string->number (cadr slst)))
+			    #f)))
+		     (else
+		      #f))))
+    (cond
+     ((and (list? host-port)
+	   (eq? (length host-port) 2))
+      (let* ((myrunremote (make-and-init-remote *toppath*))
+	     (iface       (car host-port))
+	     (port        (cadr host-port))
+	     (server-dat  (client:connect iface port server-id myrunremote))
+	     (login-res   (rmt:login-no-auto-client-setup myrunremote)))
+	(http-transport:close-connections myrunremote)
+	(if (and (list? login-res)
+		 (car login-res))
+	    (begin
+	      ;; (print "LOGIN_OK")
+	      (if do-exit (exit 0))
+	      #t)
+	    (begin
+	      ;; (print "LOGIN_FAILED")
+	      (if do-exit (exit 1))
+	      #f))))
+     (else 
+      (if host:port
+	  (debug:print 0 *default-log-port*  "ERROR: bad host:port "host:port))
+      (if do-exit
+	  (exit 1)
+	  #f)))))
 
 ;; run ping in separate process, safest way in some cases
 ;;
 (define (server:ping-server ifaceport)
   (with-input-from-pipe 
@@ -668,11 +735,11 @@
 (define (server:expiration-timeout)
   (let ((tmo (configf:lookup *configdat* "server" "timeout")))
     (if (and (string? tmo)
 	     (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below
         (* 3600 (string->number tmo))
-	60)))
+	600)))
 
 (define (server:get-best-guess-address hostname)
   (let ((res #f))
     (for-each 
      (lambda (adr)

Index: subrun.scm
==================================================================
--- subrun.scm
+++ subrun.scm
@@ -23,17 +23,20 @@
      call-with-environment-variables)
 (declare (unit subrun))
 ;;(declare (uses runs))
 (declare (uses db))
 (declare (uses common))
+(declare (uses commonmod))
 ;;(declare (uses items))
 ;;(declare (uses runconfig))
 ;;(declare (uses tests))
 ;;(declare (uses server))
 (declare (uses mt))
 ;;(declare (uses archive))
 ;; (declare (uses filedb))
+
+(import commonmod)
 
 ;(include "common_records.scm")
 ;;(include "key_records.scm")
 (include "db_records.scm") ;; provides db:test-get-id
 ;;(include "run_records.scm")
@@ -135,11 +138,11 @@
       (subrun:unset-subrun-removed test-run-dir))      
 
   (let* ((log-prefix "run")
          (switches (subrun:selector+log-switches test-run-dir log-prefix))
          (run-wait (equal? run-mode "yes"))
-         (cmd      (conc "megatest " sub-cmd " " switches" "
+         (cmd      (conc (common:get-mtexe)" "sub-cmd" "switches" "
                          (if run-wait "-run-wait " ""))))
     cmd))
 
 
 (define (subrun:sanitize-path inpath)
@@ -232,20 +235,24 @@
                (list (car x) (cdr x)))
              switch-alist))
            " ")))
     res))
 
+;; NOTE: Here we run sub megatest but this is not intended for one version
+;;       of megatest to test another version. Thus we propagate the 
 (define (subrun:exec-sub-megatest test-run-dir action-switches-str log-prefix)
-  (let* ((selector-switches  (subrun:selector+log-switches test-run-dir log-prefix))
-         (cmd (conc "megatest " selector-switches " " action-switches-str ))
+  (let* ((mtpathdir          (common:get-megatest-exe-dir))
+	 (mtexe              (common:get-mtexe))
+	 (selector-switches  (subrun:selector+log-switches test-run-dir log-prefix))
+         (cmd (conc mtexe" "selector-switches" "action-switches-str ))
          (pid #f)
          (proc (lambda ()
                  (debug:print-info 0 *default-log-port* "Running sub megatest command: "cmd)
                  ;;(set! pid (process-run "/usr/bin/xterm" (list ))))))
                  (set! pid (process-run "/bin/bash" (list "-c" cmd))))))
     (call-with-environment-variables 
-     (list (cons "PATH" (conc (get-environment-variable "PATH") ":.")))
+     (list (cons "PATH"  (common:get-megatest-exe-path)))
      (lambda  ()
        (common:without-vars proc "^MT_.*")))
     (let processloop ((i 0))
       (let-values (((pid-val exit-status exit-code)(process-wait pid #t)))
         (if (eq? pid-val 0)

Index: tasks.scm
==================================================================
--- tasks.scm
+++ tasks.scm
@@ -25,10 +25,12 @@
 (declare (uses dbfile))
 (declare (uses db))
 (declare (uses rmt))
 (declare (uses common))
 (declare (uses pgdb))
+(declare (uses commonmod))
+(import commonmod)
 
 (import dbfile)
 ;; (import pgdb) ;; pgdb is a module
 
 (include "task_records.scm")

Index: tcmt.scm
==================================================================
--- tcmt.scm
+++ tcmt.scm
@@ -30,10 +30,12 @@
 
 (declare (uses margs))
 (declare (uses rmt))
 (declare (uses common))
 ;; (declare (uses megatest-version))
+(declare (uses commonmod))
+(import commonmod)
 
 (include "megatest-version.scm")
 (include "megatest-fossil-hash.scm")
 (include "db_records.scm")
 

ADDED   tcp-transportmod.scm
Index: tcp-transportmod.scm
==================================================================
--- /dev/null
+++ tcp-transportmod.scm
@@ -0,0 +1,687 @@
+;;======================================================================
+;; Copyright 2017, Matthew Welland.
+;; 
+;; This file is part of Megatest.
+;; 
+;;     Megatest is free software: you can redistribute it and/or modify
+;;     it under the terms of the GNU General Public License as published by
+;;     the Free Software Foundation, either version 3 of the License, or
+;;     (at your option) any later version.
+;; 
+;;     Megatest is distributed in the hope that it will be useful,
+;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;     GNU General Public License for more details.
+;; 
+;;     You should have received a copy of the GNU General Public License
+;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+
+;;======================================================================
+
+(declare (unit tcp-transportmod))
+(declare (uses debugprint))
+(declare (uses commonmod))
+(declare (uses dbfile))
+(declare (uses dbmod))
+
+(use address-info)
+
+(module tcp-transportmod
+	*
+	
+  (import scheme
+	  (prefix sqlite3 sqlite3:)
+	  chicken
+	  data-structures
+
+	  address-info
+	  directory-utils
+	  extras
+	  files
+	  hostinfo
+	  matchable
+	  md5
+	  message-digest
+	  ports
+	  posix
+	  regex
+	  regex-case
+	  s11n
+	  srfi-1
+	  srfi-18
+	  srfi-4
+	  srfi-69
+	  stack
+	  typed-records
+	  tcp-server
+	  tcp
+	  
+	  debugprint
+	  commonmod
+	  dbfile
+	  dbmod
+	)
+
+;;======================================================================
+;; client
+;;======================================================================
+
+;; (define keep-age-param (make-parameter 10)) ;; qif file age, if over move to attic
+
+;; Used ONLY for client
+;;
+(defstruct tt-conn
+  host
+  port
+  host-port
+  dbfname
+  server-id
+  server-start
+  pid
+)
+
+;; Used for BOTH clients and servers
+(defstruct tt
+  ;; client related
+  (conns (make-hash-table)) ;; dbfname -> conn
+
+  ;; server related
+  (areapath     #f)
+  (host         #f)
+  (port         #f)
+  (conn         #f)
+  (cleanup-proc #f)
+  (handler      #f) ;; receives data and responds
+  (socket       #f)
+  (thread       #f)
+  (host-port    #f)
+  (cmd-thread   #f)
+  (ro-mode      #f)
+  (ro-mode-checked #f)
+  (last-access  (current-seconds))
+  (servinf-file #f)
+  (last-serv-start 0)
+  )
+
+(define (tt:make-remote areapath)
+  (make-tt areapath: areapath))
+
+;; 1 ... or #f
+(define (tt:valid-run-id run-id)
+  (or (number? run-id)
+      (not run-id)))
+
+;; do all the busy work of finding and setting up conn for
+;; connecting to a server
+;; 
+(define (tt:client-connect-to-server ttdat dbfname run-id testsuite)
+  (assert (tt:valid-run-id run-id) "FATAL: invalid run-id "run-id)
+  (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f))
+	 (server-start-proc (lambda ()
+			      (tt:server-process-run
+			       (tt-areapath ttdat)
+			       testsuite ;; (dbfile:testsuite-name)
+			       (common:find-local-megatest)
+			       run-id))))
+    (if conn
+	conn ;; we are already connected to the server
+	(let* ((sdat (tt:get-current-server-info ttdat dbfname)))
+	  (match sdat
+	    ((host port start-time server-id pid dbfname2 servinffile)
+	     (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.")
+	     (let* ((host-port (conc host":"port))
+		    (conn (make-tt-conn
+			   host: host
+			   port: port
+			   host-port: host-port
+			   dbfname: dbfname
+			   servinf-file: servinffile
+			   server-id: server-id
+			   server-start: start-time
+			   pid: pid)))
+	       (hash-table-set! (tt-conns ttdat) dbfname conn)
+	       ;; verify we can talk to this server
+	       (if (tt:ping host port server-id)
+		   conn
+		   (let* ((curr-secs (current-seconds)))
+		     ;; rm the (last server) would go here
+		     (if (> (- curr-secs (tt-last-serv-start ttdat)) 10)
+			 (begin
+			   (tt-last-serv-start-set! ttdat curr-secs)
+			   (server-start-proc))) ;; don't try and start server unless 30 sec has gone by since last attempt
+		     (thread-sleep! 1)
+		     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))
+	    (else
+	     (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 5) ;; really do not want to swamp the machine with servers
+		 (begin
+		   (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
+		   (server-start-proc)
+		   (tt-last-serv-start-set! ttdat (current-seconds))))
+	     (thread-sleep! 1)
+	     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
+    
+(define (tt:ping host port server-id)
+  (let*  ((res (tt:send-receive-direct host port `(ping #f #f #f)))) ;; please send me your server-id
+    ;;
+    ;; need two threads, one a 5 second timer
+    ;;
+    (match res
+      ((status errmsg result meta)
+       (if (equal? result server-id)
+	   (begin
+	     ;; (debug:print 0 *default-log-port* "Ping to "host":"port" successful.")
+	     #t) ;; then we are good
+	   (begin
+	     (debug:print 0 *default-log-port* "WARNING: server-id does not match, expected: "server-id", got: "result)
+	     #f)))
+      (else
+       ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res)
+       #f))))
+
+;; client side handler
+;;
+;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
+;;
+(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)
+  ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.
+  (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f)))
+    (if conn
+	;; have connection, call the server
+	(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
+	  ;; res is (status errmsg result meta)
+	  (match res
+	    ((status errmsg result meta)
+	     (if (list? meta)
+		 (let* ((delay-wait (alist-ref 'delay-wait meta)))
+		   (if (and (number? delay-wait)
+			    (> delay-wait 0))
+		       (begin
+			 (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds")
+			 (thread-sleep! delay-wait)))))
+	     (case status
+	       ((busy) ;; result will be how long the server wants you to delay
+		(debug:print 0 *default-log-port* "WARNING: server is overloaded, will try again in "result" seconds.")
+		(thread-sleep! (if (number? result) result 2))
+		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+	       ((loaded)
+		(debug:print 0 *default-log-port* "WARNING: server is loaded, will try again in a 1/4 second.")
+		(thread-sleep! 0.25)
+		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+	       (else
+		result)))
+	    (else
+	     (if (not res)
+		 (let* ((host    (tt-conn-host conn))
+			(port    (tt-conn-port conn))
+			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
+			(pid     (tt-conn-pid  conn))
+			(servinf (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname)))
+		   (hash-table-set! (tt-conns ttdat) dbfname #f)
+		   (if (file-exists? servinf)
+		       (begin
+			 (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", attempting to remove servinfo file.")
+			 (delete-file* servinf))
+		       (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf))
+		   (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+		 (assert #f "FATAL: tt:handler received bad data "res)))))
+	(begin
+	  (thread-sleep! 1) ;; give it a rest and try again
+	  (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))))
+
+	;; no conn yet, find and or start and find a server
+;; 	(let* ((server (tt:find-server ttdat dbfname)))
+;; 	  (if server
+;; 	      (let* ((conn (tt:client-connect-to-server server)))
+;; 		(hash-table-set! (tt-conns ttdat) dbfname conn)
+;; 		(tt:handler  ttdat cmd run-id params attemptnum area-dat areapath readonly-mode
+;; 			     dbfname testsuite mtexe))
+;; 	      ;; no server, try to start a server process
+;; 	      (begin
+;; 		(tt:server-process-run areapath testsuite mtexe run-id) ;;  #!key (profile-mode "")) 
+;; 		(thread-sleep! 1)
+;; 		(tt:handler  ttdat cmd run-id params attemptnum area-dat areapath
+;; 			     readonly-mode dbfname testsuite mtexe)))))))
+
+(define (tt:bid-for-servership run-id)
+  #f)
+
+;; gets server info and appends path to server file
+;; sorts by age, oldest first
+;;
+;; returns list of (host port startseconds server-id servinfofile)
+;;
+(define (tt:get-server-info-sorted ttdat dbfname)
+  (let* ((areapath (tt-areapath ttdat))
+	 (sfiles   (tt:find-server areapath dbfname))
+	 (sdats    (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read
+	 (sorted   (sort sdats (lambda (a b)
+				 (< (list-ref a 2)(list-ref b 2))))))
+    sorted))
+    
+(define (tt:get-current-server-info ttdat dbfname)
+  (assert (tt-areapath ttdat) "FATAL: areapath not set in ttdat.")
+  ;;
+  ;; TODO - replace most of below with tt;get-server-info-sorted
+  ;;
+  (let* ((areapath (tt-areapath ttdat))
+	 (sfiles   (tt:find-server areapath dbfname))
+	 (sdats    (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read
+	 (sorted   (sort sdats (lambda (a b)
+				 (< (list-ref a 2)(list-ref b 2))))))
+    (if (null? sorted)
+	#f  ;; we'll want to wait until extra servers have exited
+	(car sorted))))
+
+(define (tt:send-receive ttdat conn cmd run-id params)
+  (let* ((host-port (tt-conn-host-port conn)) ;; (conc (tt-conn-host conn)":"(tt-conn-port conn)))
+	 (host      (tt-conn-host conn))
+	 (port      (tt-conn-port conn))
+	 (dat       (list cmd run-id params #f))) ;; no meta data yet
+    (tt:send-receive-direct host port dat)))
+
+(define (tt:send-receive-direct host port dat)
+  (assert (number? port) "FATAL: tt:send-receive-direct called with port not a number "port)
+  (handle-exceptions
+      exn
+    #f ;; Add condition-case or better handling here
+    (let-values (((inp oup)(tcp-connect host port)))
+      (let ((res (if (and inp oup)
+		     (begin
+		       (serialize dat oup)
+		       (close-output-port oup)
+		       (deserialize inp))
+		     )))
+	(close-input-port inp)
+	res))))
+
+
+
+;;======================================================================
+;; server
+;;======================================================================
+
+(define (tt:sync-dbs ttdat)
+  #f)
+
+;; start the listener and start responding to requests
+;;
+;; NOTE: organise by dbfname, not run-id so we don't need
+;;       to pull in more modules
+;;
+;; This is the routine called in megatest.scm to start a server
+;;
+;; Server viability is checked in keep-running. Blindly start and run here.
+;;
+(define (tt:start-server areapath run-id dbfname-in handler keys)
+  (assert areapath "FATAL: areapath not provided for tt:start-server")
+  ;; is there already a server for this dbfile? Then exit.
+  (let* ((ttdat   (make-tt areapath: areapath))
+	 (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))))
+    ;; (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead
+    ;; (if (null? servers)
+    (let* ((dbstruct   (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
+      (tt-handler-set! ttdat (handler dbstruct))
+      (let* ((tcp-thread (make-thread
+			  (lambda ()
+			    (tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data
+			  "tcp-server-thread"))
+	     (run-thread (make-thread
+			  (lambda ()
+			    (tt:keep-running ttdat dbfname dbstruct)))))
+	(thread-start! tcp-thread)
+	(thread-start! run-thread)
+	(thread-join! run-thread) ;; run thread will exit on timeout or other conditions
+	(exit)))
+    ;;(begin
+    ;; (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.")
+    ;; (exit)))))
+    ))
+
+(define (tt:keep-running ttdat dbfname dbstruct)
+  ;; verfiy conn for ready
+  ;; listener socket has been started by this stage
+  ;; wait for a port before creating the registration file
+  ;;
+  (let* ((cleanup (lambda ()
+		    (if (tt-cleanup-proc ttdat)
+			((tt-cleanup-proc ttdat))))))
+    (let loop ((count 0))
+      (if (> count 240)
+	  (begin
+	    (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.")
+	    (exit 1))
+	  (if (not (tt-port ttdat)) ;; no connection yet
+	      (begin
+		(thread-sleep! 0.25)
+		(loop (+ count 1))))))
+    
+    ;; load or reload the data into inmem db before
+    ;; ((dbr:dbstruct-sync-proc dbstruct) (dbr:dbstruct-last-update dbstruct))
+    ;; (dbr:dbstruct-last-update-set! dbstruct (- (current-seconds) 1))
+    (tt:create-server-registration-file ttdat dbfname)
+    ;; now start watching the last-access, if it hasn't been touched
+    ;; in over ten seconds we exit
+    (thread-sleep! 0.05) ;; any real need for delay here?
+    (let loop ()
+      (let* ((servers (tt:get-server-info-sorted ttdat dbfname))
+	     (ok      (cond
+		       ((null? servers) #f) ;; not ok
+		       ((equal? (list-ref (car servers) 6) ;; compare the servinfofile
+				(tt-servinf-file ttdat))
+			(debug:print-info 0 *default-log-port* "Keep running, I'm the top server.")
+			#t)
+		       (else
+			(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
+			(let* ((leadsrv (car servers)))
+			  (match leadsrv
+			    ((host port startseconds server-id pid dbfname servinfofile)
+			     (if (tt:ping host port server-id)
+				 #f ;; not the server, but all good, want to exit
+				 (if (and (file-exists? servinfofile)
+					  (> (- (current-seconds)(file-modification-time servinfofile)) 5))
+				     (begin
+				       ;; can't ping and file has been on disk 5 seconds, go ahead and try to remove it
+				       (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile)
+				       (delete-file* servinfofile)
+				       #t) ;; not the server but the server is not reachable
+				     #t)))
+			    (else ;; should never get here
+			     (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv)
+			     (assert #f "Bad server record "leadsrv))))))))
+	(if ok
+	    ;; (if (> *api-process-request-count* 0) ;; have requests in flight
+	    ;;	(tt-last-access-set! ttdat (current-seconds)))
+	    (tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access
+	    (begin
+	      (cleanup)
+	      (exit)))
+
+	(let* ((last-update (dbr:dbstruct-last-update dbstruct))
+	       (curr-secs   (current-seconds)))
+	  (if (> (- curr-secs last-update) 3) ;; every 3-4 seconds update the db? maybe this should be refresh the inmem?
+	      (begin
+		((dbr:dbstruct-sync-proc dbstruct) last-update)
+		(dbr:dbstruct-last-update-set! dbstruct curr-secs))))
+	  
+	(if (< (- (current-seconds) (tt-last-access ttdat)) 60)
+	    (begin
+	      (thread-sleep! 5)
+	      (loop)))))
+    (cleanup)
+    (debug:print 0 *default-log-port* "INFO: Server timed out, exiting.")))
+
+  
+;; ;; given an already set up uconn start the cmd-loop
+;; ;;
+;; (define (tt:cmd-loop ttdat)
+;;   (let* ((serv-listener (-socket uconn))
+;; 	 (listener      (lambda ()
+;; 			  (let loop ((state 'start))
+;; 			    (let-values (((inp oup)(tcp-accept serv-listener)))
+;; 			      ;; (mutex-lock! *send-mutex*) ;; DOESN'T SEEM TO HELP
+;; 			      (let* ((rdat  (deserialize inp)) ;; '(my-host-port qrykey cmd params)
+;; 				     (resp  (ulex-handler uconn rdat)))
+;; 				(serialize resp oup)
+;; 				(close-input-port inp)
+;; 				(close-output-port oup)
+;; 				;; (mutex-unlock! *send-mutex*) ;; DOESN'T SEEM TO HELP
+;; 				)
+;; 			      (loop state))))))
+;;     ;; start N of them
+;;     (let loop ((thnum   0)
+;; 	       (threads '()))
+;;       (if (< thnum 100)
+;; 	  (let* ((th (make-thread listener (conc "listener" thnum))))
+;; 	    (thread-start! th)
+;; 	    (loop (+ thnum 1)
+;; 		  (cons th threads)))
+;; 	  (map thread-join! threads)))))
+;; 
+;; 
+;; 
+;; (define (wait-and-close uconn)
+;;   (thread-join! (udat-cmd-thread uconn))
+;;   (tcp-close (udat-socket uconn)))
+;; 
+;; 
+
+(define (tt:shutdown-server ttdat)
+  (let* ((cleanproc (tt-cleanup-proc ttdat)))
+    (if cleanproc (cleanproc))
+    (tcp-close (tt-socket ttdat)) ;; close up ports here
+    ))
+
+;; (define (wait-and-close uconn)
+;;   (thread-join! (tt-cmd-thread uconn))
+;;   (tcp-close (tt-socket uconn)))
+
+;; return servid
+;; side-effects:
+;;   ttdat-cleanup-proc is populated with function to remove the serverinfo file
+(define (tt:create-server-registration-file ttdat dbfname)
+  (let* ((areapath (tt-areapath ttdat))
+	 (servdir  (tt:get-servinfo-dir areapath))
+	 (host     (tt-host ttdat))
+	 (port     (tt-port ttdat))
+	 (servinf (conc servdir"/"host":"port"-"(current-process-id)":"dbfname))
+	 (serv-id (tt:mk-signature areapath))
+	 (clean-proc (lambda ()
+		       (delete-file* servinf))))
+    (assert (and host port) "FATAL: tt:create-server-registration-file called with no conn, dbfname="dbfname)
+    (tt-cleanup-proc-set! ttdat clean-proc)
+    (tt-servinf-file-set! ttdat servinf)
+    (with-output-to-file servinf
+      (lambda ()
+	(print "SERVER STARTED: "host":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)" dbfname: "dbfname)))
+      serv-id))
+
+;; find valid server
+;; get servers listed, last part of name must match :<dbfname>
+;; if more than one, wait one second and look again
+;; future: ping oldest, if alive remove other :<dbfname> files
+;;
+(define (tt:find-server areapath dbfname)
+  (let* ((servdir  (tt:get-servinfo-dir areapath))
+	 (sfiles   (glob (conc servdir"/*:"dbfname))))
+    sfiles))
+
+;; given a path to a server info file return: host port startseconds server-id pid dbfname logf
+;; example of what it's looking for in the log file:
+;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
+;;
+(define (tt:server-get-info logf)
+  (let ((server-rx    (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
+        (dbprep-rx    (regexp "^SERVER: dbprep"))
+        (dbprep-found 0)
+	(bad-dat      (list #f #f #f #f #f #f logf)))
+     (let ((fdat     (handle-exceptions
+			 exn
+		       (begin
+			 ;; WARNING: this is potentially dangerous to blanket ignore the errors
+			 (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn=" exn)
+			 '()) ;; no idea what went wrong, call it a bad server, return empty list
+		       (with-input-from-file logf read-lines))))
+       (if (null? fdat) ;; bad data, return bad-dat
+	   bad-dat
+	   (let loop ((inl  (car fdat))
+		      (tail (cdr fdat))
+		      (lnum 0))
+	     (let ((mlst (string-match server-rx inl))
+		   (dbprep (string-match dbprep-rx inl)))
+	       (if dbprep (set! dbprep-found 1))
+	       (if (not mlst)
+		   (if (> lnum 500) ;; give up if more than 500 lines of server log read
+		       bad-dat
+		       (if (null? tail)
+			   bad-dat
+			   (loop (car tail)(cdr tail)(+ lnum 1))))
+		   (match mlst ;; have a not null list
+		     ((_ host port start server-id pid dbfname)
+		      (list host
+			    (string->number port)
+			    (string->number start)
+			    server-id
+			    (string->number pid)
+			    dbfname
+			    logf))
+		     (else
+		      (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst)
+		      bad-dat)))))))))
+
+;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
+;; if the target-host is set 
+;; try running on that host
+;;   incidental: rotate logs in logs/ dir.
+;;
+(define  (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area
+  (assert areapath  "FATAL: tt:server-process-run called without areapath defined.")
+  (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.")
+  (assert mtexe     "FATAL: tt:server-process-run called without mtexe defined.")
+  ;; mtest -server - -m testsuite:ext-tests -db 6.db
+  (let* ((dbfname  (dbmod:run-id->dbfname run-id))
+	 (load     (get-normalized-cpu-load))
+	 (nrun     (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname))))
+    (cond
+     ((> load 2.0)
+      (debug:print 0 *default-log-port* "Normalized load "load" is over the limit of 2.0. Not starting a server.")
+      (thread-sleep! 1))
+     ((> nrun 100)
+      (debug:print 0 *default-log-port* nrun" servers running on this host, not starting another.")
+      (thread-sleep! 1))
+     (else
+      (if (not (file-exists? (conc areapath"/logs")))
+	      (create-directory (conc areapath"/logs") #t))
+	  (let* ((logfile   (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
+		 (cmdln     (conc
+			     mtexe
+			     " -server - ";; (or target-host "-")
+			     " -m testsuite:" testsuite
+			     ;; " -run-id " (or run-id "main") ;; NO, we do NOT want to have run id as part of this
+			     " -db "  dbfname ;; (dbmod:run-id->dbfname run-id)
+			     " " profile-mode
+			     ))) ;; (conc " >> " logfile " 2>&1 &")))))
+	    ;; we want the remote server to start in *toppath* so push there
+	    ;; (push-directory areapath) ;; use cd in the command line instead
+	    (debug:print 0 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath)
+	    ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
+	    (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
+	    (setenv "NBFAKE_LOG" logfile)
+	    (system (conc "cd "areapath" ; nbfake " cmdln))
+	    (unsetenv "NBFAKE_QUIET")
+	    (unsetenv "NBFAKE_LOG")
+	    ;;(pop-directory)
+	    )))))
+
+;;======================================================================
+;; tcp connection stuff
+;;======================================================================
+
+;; find a port and start tcp-server. This only starts the tcp portion of
+;; the server, look at (tt:start-server ...) above for the entry point
+;; for the entire server system
+;;
+(define (tt:start-tcp-server ttdat)
+  (setup-listener ttdat)
+  (let* ((socket   (tt-socket  ttdat))
+	 (handler  (tt-handler ttdat)))
+    ((make-tcp-server socket handler)
+     #f ;; yes, send error messages to std-err
+     )))
+
+;; create a tcp listener and return a populated udat struct with
+;; my port, address, hostname, pid etc.
+;; return #f if fail to find a port to allocate.
+;;
+;;  if udata-in is #f create the record
+;;  if there is already a serv-listener return the udata
+;;
+(define (setup-listener uconn #!optional (port 4242))
+  (assert (tt? uconn) "FATAL: setup-listener called with wrong struct "uconn)
+  (handle-exceptions
+   exn
+   (if (< port 65535)
+       (setup-listener uconn (+ port 1))
+       #f)
+   (connect-listener uconn port)))
+
+(define (connect-listener uconn port)
+  ;; (tcp-listener-socket LISTENER)(socket-name so)
+  ;; sockaddr-address, sockaddr-port, sockaddr->string
+  (let* ((tlsn (tcp-listen port 1000 #f)) ;; (tcp-listen TCPPORT [BACKLOG [HOST]])
+	 (addr  (tt:get-best-guess-address (get-host-name)))) ;; (get-my-best-address))) ;; (hostinfo-addresses (host-information (current-hostname)))
+    (tt-port-set!      uconn port)
+    (tt-host-set!      uconn addr)
+    (tt-host-port-set! uconn (conc addr":"port))
+    (tt-socket-set!    uconn tlsn)
+    uconn))
+
+;;======================================================================
+;; utils
+;;======================================================================
+
+;; Generate a unique signature for this server
+(define (tt:mk-signature areapath)
+  (message-digest-string (md5-primitive) 
+			 (with-output-to-string
+			   (lambda ()
+			     (write (list areapath
+                                          (current-process-id)
+					  (argv)))))))
+
+
+(define (tt:get-best-guess-address hostname)
+  (let ((res #f))
+    (for-each 
+     (lambda (adr)
+       (if (not (eq? (u8vector-ref adr 0) 127))
+	   (set! res adr)))
+     ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME
+     (vector->list (hostinfo-addresses (hostname->hostinfo hostname))))
+    (string-intersperse 
+     (map number->string
+	  (u8vector->list
+	   (if res res (hostname->ip hostname)))) ".")))
+
+(define (tt:get-servinfo-dir areapath)
+  (let* ((spath (conc areapath"/.servinfo")))
+    (if (not (file-exists? spath))
+	(create-directory spath #t))
+    spath))
+
+;;======================================================================
+;; network utilities
+;;======================================================================
+
+;; NOTE: Look at address-info egg as alternative to some of this
+
+(define (rate-ip ipaddr)
+  (regex-case ipaddr
+    ( "^127\\..*" _ 0 )
+    ( "^(10\\.0|192\\.168)\\..*" _ 1 )
+    ( else 2 ) ))
+
+;; Change this to bias for addresses with a reasonable broadcast value?
+;;
+(define (ip-pref-less? a b)
+  (> (rate-ip a) (rate-ip b)))
+
+(define (get-my-best-address)
+  (let ((all-my-addresses (get-all-ips)))
+    (cond
+     ((null? all-my-addresses)
+      (get-host-name))                                          ;; no interfaces?
+     ((eq? (length all-my-addresses) 1)
+      (car all-my-addresses))                      ;; only one to choose from, just go with it
+     (else
+      (car (sort all-my-addresses ip-pref-less?))))))
+
+(define (get-all-ips-sorted)
+  (sort (get-all-ips) ip-pref-less?))
+
+(define (get-all-ips)
+  (map address-info-host
+       (filter (lambda (x)
+		 (equal? (address-info-type x) "tcp"))
+	       (address-infos (get-host-name)))))
+
+)

Index: tdb.scm
==================================================================
--- tdb.scm
+++ tdb.scm
@@ -32,10 +32,12 @@
 (declare (uses keys))
 (declare (uses ods))
 (declare (uses client))
 (declare (uses mt))
 (declare (uses db))
+(declare (uses commonmod))
+(import commonmod)
 
 (include "common_records.scm")
 (include "db_records.scm")
 (include "key_records.scm")
 (include "run_records.scm")

Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -1966,13 +1966,13 @@
 ;;======================================================================
 
 ;; teststep-set-status! used to be here
 
 (define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)
-  (let* ((testdat   (rmt:get-test-info-by-id run-id test-id)))
+  (let* ((testdat   (rmt:get-test-state-status-by-id run-id test-id)))
     (and testdat
-	 (equal? (test:get-state testdat) "KILLREQ"))))
+	 (equal? (car testdat) "KILLREQ"))))
 
 (define (test:tdb-get-rundat-count tdb)
   (if tdb
       (let ((res 0))
 	(sqlite3:for-each-row

ADDED   transport-mode.scm.template
Index: transport-mode.scm.template
==================================================================
--- /dev/null
+++ transport-mode.scm.template
@@ -0,0 +1,3 @@
+;; 'http or 'tcp
+(rmt:transport-mode 'tcp)
+;; (rmt:transport-mode 'http)

ADDED   ulex/dbmgr.scm
Index: ulex/dbmgr.scm
==================================================================
--- /dev/null
+++ ulex/dbmgr.scm
@@ -0,0 +1,1131 @@
+;;======================================================================
+;; Copyright 2022, Matthew Welland.
+;; 
+;; This file is part of Megatest.
+;; 
+;;     Megatest is free software: you can redistribute it and/or modify
+;;     it under the terms of the GNU General Public License as published by
+;;     the Free Software Foundation, either version 3 of the License, or
+;;     (at your option) any later version.
+;; 
+;;     Megatest is distributed in the hope that it will be useful,
+;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;     GNU General Public License for more details.
+;; 
+;;     You should have received a copy of the GNU General Public License
+;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+
+;;======================================================================
+
+(declare (unit dbmgrmod))
+
+(declare (uses ulex))
+(declare (uses apimod))
+(declare (uses pkts))
+(declare (uses commonmod))
+(declare (uses dbmod))
+(declare (uses mtargs))
+(declare (uses portloggermod))
+(declare (uses debugprint))
+
+(module dbmgrmod
+    *
+
+(import scheme
+	chicken.base
+	chicken.condition
+	chicken.file
+	chicken.format
+	chicken.port
+	chicken.process
+	chicken.process-context
+	chicken.process-context.posix
+	chicken.sort
+	chicken.string
+	chicken.time
+	
+	(prefix sqlite3 sqlite3:)
+	matchable
+	md5
+	message-digest
+	regex
+	s11n
+	srfi-1
+	srfi-18
+	srfi-69
+	system-information
+	typed-records
+	
+	pkts
+	ulex
+
+	commonmod
+	apimod
+	dbmod
+	debugprint
+	(prefix mtargs args:)
+	portloggermod
+	)
+
+;; Configurations for server
+;; (tcp-buffer-size 2048)
+;; (max-connections 2048) 
+
+;; info about me as a listener and my connections to db servers
+;; stored (for now) in *db-serv-info*
+;;
+(defstruct servdat
+  (host #f)
+  (port #f)
+  (uuid #f)
+  (dbfile #f)
+  (uconn   #f) ;; this is the listener *FOR THIS PROCESS*
+  (mode    #f)
+  (status 'starting)
+  (trynum 0) ;; count the number of ports we've tried
+  (conns  (make-hash-table)) ;; apath/dbname => conndat
+  ) 
+
+(define *db-serv-info* (make-servdat))
+
+(define (servdat->url sdat)
+  (conc (servdat-host sdat)":"(servdat-port sdat)))
+
+;; db servers contact info
+;;
+(defstruct conndat
+  (apath    #f)
+  (dbname   #f)
+  (fullname #f)
+  (hostport #f)
+  (ipaddr   #f)
+  (port     #f)
+  (srvpkt   #f)
+  (srvkey   #f)
+  (lastmsg  0)
+  (expires  0))
+
+(define *srvpktspec*
+  `((server (host    . h)
+	    (port    . p)
+	    (servkey . k)
+	    (pid     . i)
+	    (ipaddr  . a)
+	    (dbpath  . d))))
+
+;;======================================================================
+;;  S U P P O R T   F U N C T I O N S
+;;======================================================================
+
+;; set up the api proc, seems like there should be a better place for this?
+;;
+;; IS THIS NEEDED ANYMORE? TODO - REMOVE IF POSSIBLE
+;;
+;; (define api-proc (make-parameter conc))
+;; (api-proc api:execute-requests)
+
+;; do we have a connection to apath dbname and
+;; is it not expired? then return it
+;;
+;; else setup a connection
+;;
+;; if that fails, return '(#f "some reason") ;; NB// convert to raising an exception
+;;
+(define (rmt:get-conn remdat apath dbname)
+  (let* ((fullname (db:dbname->path apath dbname)))
+    (hash-table-ref/default (servdat-conns remdat) fullname #f)))
+
+(define (rmt:drop-conn remdat apath dbname)
+  (let* ((fullname (db:dbname->path apath dbname)))
+    (hash-table-delete! (servdat-conns remdat) fullname)))
+  
+(define (rmt:find-main-server uconn apath dbname)
+  (let* ((pktsdir     (get-pkts-dir apath))
+	 (all-srvpkts (get-all-server-pkts pktsdir *srvpktspec*))
+	 (viable-srvs (get-viable-servers all-srvpkts dbname)))
+    (get-the-server uconn apath viable-srvs)))
+
+
+(define *connstart-mutex* (make-mutex))
+(define *last-main-start* 0)
+
+;; looks for a connection to main, returns if have and not exired
+;; creates new otherwise
+;; 
+;; connections for other servers happens by requesting from main
+;;
+;; TODO: This is unnecessarily re-creating the record in the hash table
+;;
+(define (rmt:open-main-connection remdat apath)
+  (let* ((fullpath (db:dbname->path apath ".db/main.db"))
+	 (conns    (servdat-conns remdat))
+	 (conn     (rmt:get-conn remdat apath ".db/main.db")) ;; (hash-table-ref/default conns fullpath #f)) ;; TODO - create call for this
+	 (start-rmt:run (lambda ()
+			  (let* ((th1 (make-thread (lambda ()(rmt:run (get-host-name))) "non-db mode server")))
+			    (thread-start! th1)
+			    (thread-sleep! 1)
+			    (let loop ((count 0))
+			      (assert (< count 30) "FATAL: responder failed to initialize in rmt:open-main-connection")
+			      (if (or (not *db-serv-info*)
+				      (not (servdat-uconn *db-serv-info*)))
+				  (begin
+				    (thread-sleep! 1)
+				    (loop (+ count 1)))
+				  (begin
+				    (servdat-mode-set! *db-serv-info* 'non-db)
+				    (servdat-uconn *db-serv-info*)))))))
+	 (myconn    (servdat-uconn *db-serv-info*)))
+    (cond
+     ((not myconn)
+      (start-rmt:run)
+      (rmt:open-main-connection remdat apath))
+     ((and conn                                             ;; conn is NOT a socket, just saying ...
+	   (< (current-seconds) (conndat-expires conn)))
+      #t) ;; we are current and good to go - we'll deal elsewhere with a server that was killed or died 
+     ((and conn
+	   (>= (current-seconds)(conndat-expires conn)))
+      (debug:print-info 0 *default-log-port* "connection to "fullpath" server expired. Reconnecting.")
+      (rmt:drop-conn remdat apath ".db/main.db") ;;
+      (rmt:open-main-connection remdat apath))
+     (else
+      ;; Below we will find or create and connect to main
+      (debug:print-info 0 *default-log-port* "rmt:open-main-connection - starting from scratch")
+      (let* ((dbname         (db:run-id->dbname #f))
+	     (the-srv        (rmt:find-main-server myconn apath dbname))
+	     (start-main-srv (lambda () ;; call IF there is no the-srv found
+			       (mutex-lock! *connstart-mutex*)
+			       (if (> (- (current-seconds) *last-main-start*) 5) ;; at least four seconds since last attempt to start main server
+				   (begin
+				     (api:run-server-process apath dbname)
+				     (set! *last-main-start* (current-seconds))
+				     (thread-sleep! 1))
+				   (thread-sleep! 0.25))
+			       (mutex-unlock! *connstart-mutex*)
+			       (rmt:open-main-connection remdat apath) ;; TODO: Add limit to number of tries
+			       )))
+	(if (not the-srv) ;; have server, try connecting to it
+	    (start-main-srv)
+	    (let* ((srv-addr (server-address the-srv)) ;; need serv
+		   (ipaddr   (alist-ref 'ipaddr  the-srv))
+		   (port     (alist-ref 'port    the-srv))
+		   (srvkey   (alist-ref 'servkey the-srv))
+		   (fullpath (db:dbname->path apath dbname))
+		   
+		   (new-the-srv (make-conndat
+				 apath:   apath
+				 dbname:  dbname
+				 fullname: fullpath
+				 hostport: srv-addr
+				 ;; socket: (open-nn-connection srv-addr)  - TODO - open ulex connection?
+				 ipaddr: ipaddr
+				 port: port
+				 srvpkt: the-srv
+				 srvkey: srvkey ;; generated by rmt:get-signature on the server side
+				 lastmsg: (current-seconds)
+				 expires: (+ (current-seconds)
+					     (server:expiration-timeout)
+					     -2) ;; this needs to be gathered during the ping
+				 )))
+	      (hash-table-set! conns fullpath new-the-srv)))
+	#t)))))
+
+;; NB// sinfo is a servdat struct
+;;
+(define (rmt:general-open-connection sinfo apath dbname #!key (num-tries 5))
+  (assert (not (equal? dbname ".db/main.db")) "ERROR: general-open-connection should never be called with main as the db")
+  (let* ((mdbname  ".db/main.db") ;; (db:run-id->dbname #f)) TODO: put this back to the lookup when stable
+	 (fullname (db:dbname->path apath dbname))
+	 (conns    (servdat-conns sinfo))
+	 (mconn    (rmt:get-conn sinfo apath ".db/main.db"))
+	 (dconn    (rmt:get-conn sinfo apath dbname)))
+    #;(if (and mconn
+	     (not (debug:print-logger)))
+	(begin
+	  (debug:print-info 0 *default-log-port* "Turning on logging to main, look in logs dir for main log.")
+	  (debug:print-logger rmt:log-to-main)))
+    (cond
+     ((and mconn
+	   dconn
+	   (< (current-seconds)(conndat-expires dconn)))
+      #t) ;; good to go
+     ((not mconn) ;; no channel open to main? open it...
+      (rmt:open-main-connection sinfo apath)
+      (rmt:general-open-connection sinfo apath dbname num-tries: (- num-tries 1)))
+     ((not dconn)                 ;; no channel open to dbname?     
+      (let* ((res (rmt:send-receive-real sinfo apath mdbname 'get-server `(,apath ,dbname))))
+	(case res
+	  ((server-started)
+	   (if (> num-tries 0)
+	       (begin
+		 (thread-sleep! 2)
+		 (rmt:general-open-connection sinfo apath dbname num-tries: (- num-tries 1)))
+	       (begin
+		 (debug:print-error 0 *default-log-port* "Failed to start servers needed or open channel to "apath", "dbname)
+		 (exit 1))))
+	  (else
+	   (if (list? res) ;; server has been registered and the info was returned. pass it on.
+	       (begin ;;  ("192.168.0.9" 53817
+		      ;;  "5e34239f48e8973b3813221e54701a01" "24310"
+		      ;;  "192.168.0.9"
+		      ;;  "/home/matt/data/megatest/tests/simplerun"
+		 ;;  ".db/1.db")
+		 (match
+		  res
+		  ((host port servkey pid ipaddr apath dbname)
+		   (debug:print-info 0 *default-log-port* "got "res)
+		   (hash-table-set! conns
+				    fullname
+				    (make-conndat
+				     apath: apath
+				     dbname: dbname
+				     hostport: (conc host":"port)
+				     ;; socket: (open-nn-connection (conc host":"port)) ;; TODO - open ulex connection?
+				     ipaddr: ipaddr
+				     port: port
+				     srvkey: servkey
+				     lastmsg: (current-seconds)
+				     expires: (+ (current-seconds)
+						 (server:expiration-timeout)
+						 -2))))
+		  (else
+		   (debug:print-info 0 *default-log-port* "return data from starting server did not match host port servkey pid ipaddr apath dbname " res)))
+		 res)
+	       (begin
+		 (debug:print-info 0 *default-log-port* "Unexpected result: " res)
+		 res)))))))
+    #t))
+
+;;======================================================================
+
+;; FOR DEBUGGING SET TO #t
+;; (define *localmode* #t)
+(define *localmode* #f)
+(define *dbstruct* (make-dbr:dbstruct))
+
+;; Defaults to current area
+;;
+(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f))
+  (let* ((apath      *toppath*)
+	 (sinfo      *db-serv-info*)
+	 (dbname     (db:run-id->dbname rid)))
+    (if *localmode*
+	(api:execute-requests *dbstruct* cmd params)
+	(begin
+	  (rmt:open-main-connection sinfo apath)
+	  (if rid (rmt:general-open-connection sinfo apath dbname))
+	  #;(if (not (member cmd '(log-to-main)))
+	      (debug:print-info 0 *default-log-port* "rmt:send-receive "cmd" params="params))
+	  (rmt:send-receive-real sinfo apath dbname cmd params)))))
+
+;; db is at apath/.db/dbname, rid is an intermediary solution and will be removed
+;; sometime in the future
+;;
+(define (rmt:send-receive-real sinfo apath dbname cmd params)
+  (assert (not (eq? 'primordial (thread-name (current-thread)))) "FATAL: Do not call rmt:send-receive-real in the primodial thread.")
+  (let* ((cdat (rmt:get-conn sinfo apath dbname)))
+    (assert cdat "FATAL: rmt:send-receive-real called without the needed channels opened")
+    (let* ((uconn    (servdat-uconn sinfo)) ;; get the interface to ulex
+	   ;; then send-receive using the ulex layer to host-port stored in cdat
+	   (res      (send-receive uconn (conndat-hostport cdat) cmd params))
+	   #;(th1      (make-thread (lambda ()
+				    (set! res (send-receive uconn (conndat-hostport cdat) cmd params)))
+				  "send-receive thread")))
+      ;; (thread-start! th1)
+      ;; (thread-join! th1)   ;; gratuitious thread stuff is so that mailbox is not used in primordial thead
+      ;; since we accessed the server we can bump the expires time up
+      (conndat-expires-set! cdat (+ (current-seconds)
+				    (server:expiration-timeout)
+				    -10)) ;; ten second margin for network time misalignments etc.
+      res)))
+
+;; db is at apath/.db/dbname, rid is an intermediary solution and will be removed
+;; sometime in the future.
+;;
+;; Purpose - call the main.db server and request a server be started
+;; for the given area path and dbname
+;;
+
+(define (rmt:print-db-stats)
+  (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f"
+    (debug:print 18 *default-log-port* "DB Stats, "(seconds->year-week/day-time (current-seconds))"\n=====================")
+    (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg"))
+    (for-each (lambda (cmd)
+		(let ((cmd-dat (hash-table-ref *db-stats* cmd)))
+		  (debug:print 18 *default-log-port* (format #f fmtstr cmd (vector-ref cmd-dat 0) (vector-ref cmd-dat 1) (/ (vector-ref cmd-dat 1)(vector-ref cmd-dat 0))))))
+	      (sort (hash-table-keys *db-stats*)
+		    (lambda (a b)
+		      (> (vector-ref (hash-table-ref *db-stats* a) 0)
+			 (vector-ref (hash-table-ref *db-stats* b) 0)))))))
+
+(define (rmt:get-max-query-average run-id)
+  (mutex-lock! *db-stats-mutex*)
+  (let* ((runkey (conc "run-id=" run-id " "))
+	 (cmds   (filter (lambda (x)
+			   (substring-index runkey x))
+			 (hash-table-keys *db-stats*)))
+	 (res    (if (null? cmds)
+		     (cons 'none 0)
+		     (let loop ((cmd (car cmds))
+				(tal (cdr cmds))
+				(max-cmd (car cmds))
+				(res 0))
+		       (let* ((cmd-dat (hash-table-ref *db-stats* cmd))
+			      (tot     (vector-ref cmd-dat 0))
+			      (curravg (/ (vector-ref cmd-dat 1) (vector-ref cmd-dat 0))) ;; count is never zero by construction
+			      (currmax (max res curravg))
+			      (newmax-cmd (if (> curravg res) cmd max-cmd)))
+			 (if (null? tal)
+			     (if (> tot 10)
+				 (cons newmax-cmd currmax)
+				 (cons 'none 0))
+			     (loop (car tal)(cdr tal) newmax-cmd currmax)))))))
+    (mutex-unlock! *db-stats-mutex*)
+    res))
+
+;; host and port are used to ensure we are remove proper records
+(define (rmt:server-shutdown host port)
+  (let ((dbfile   (servdat-dbfile *db-serv-info*)))
+    (debug:print-info 0 *default-log-port* "dbfile is "dbfile)
+    (if dbfile
+	(let* ((am-server  (args:get-arg "-server"))
+	       (dbfile     (args:get-arg "-db"))
+	       (apath      *toppath*)
+	       #;(sinfo     *remotedat*)) ;; foundation for future fix
+	  (if *dbstruct-db*
+	      (let* ((dbdat      (db:get-dbdat *dbstruct-db* apath dbfile))
+		     (db         (dbr:dbdat-db dbdat))
+		     (inmem      (dbr:dbdat-db dbdat))   ;; WRONG
+		     )
+		;; do a final sync here
+		(debug:print-info 0 *default-log-port* "Doing final sync for "apath" "dbfile" at "(current-seconds))
+		(db:sync-inmem->disk *dbstruct-db* apath dbfile force-sync: #t)
+		;; let's finalize here
+		(debug:print-info 0 *default-log-port* "Finalizing db and inmem")
+		(if (sqlite3:database? db)
+		    (sqlite3:finalize! db)
+		    (debug:print-info 0 *default-log-port* "in rmt:server-shutdown, db is not a database, not finalizing..."))
+		(if (sqlite3:database? inmem)
+		    (sqlite3:finalize! inmem)
+		    (debug:print-info 0 *default-log-port* "in rmt:server-shutdown, inmem is not a database, not finalizing..."))
+		(debug:print-info 0 *default-log-port* "Finalizing db and inmem complete"))
+	      (debug:print-info 0 *default-log-port* "Db was never opened, no cleanup to do."))
+	  (if (not am-server)
+	      (debug:print-info 0 *default-log-port* "I am not a server, should NOT get here!")
+	      (if (string-match ".*/main.db$" dbfile)
+		  (let ((pkt-file (conc (get-pkts-dir *toppath*)
+					"/" (servdat-uuid *db-serv-info*)
+					".pkt")))
+		    (debug:print-info 0 *default-log-port* "removing pkt "pkt-file)
+		    (delete-file* pkt-file)
+		    (debug:print-info 0 *default-log-port* "Releasing lock (if any) for "dbfile ", host "host", port "port)
+		    (db:with-lock-db
+		     (servdat-dbfile *db-serv-info*)
+		     (lambda (dbh dbfile)
+		       (db:release-lock dbh dbfile host port)))) ;; I'm not the server - should not have a lock to remove
+		  (let* ((sdat *db-serv-info*) ;; we have a run-id server
+			 (host (servdat-host sdat))
+			 (port (servdat-port sdat))
+			 (uuid (servdat-uuid sdat))
+			 (res  (rmt:deregister-server *db-serv-info* *toppath* host port uuid dbfile)))
+		    (debug:print-info 0 *default-log-port* "deregistered-server, res="res)
+		    (debug:print-info 0 *default-log-port* "deregistering server "host":"port" with uuid "uuid)
+		    )))))))
+
+
+(define (common:run-sync?)
+    ;; (and (common:on-homehost?)
+  (args:get-arg "-server"))
+
+(define *rmt:run-mutex* (make-mutex))
+(define *rmt:run-flag* #f)
+
+;; Main entry point to start a server. was start-server
+(define (rmt:run hostn)
+  (mutex-lock! *rmt:run-mutex*)
+  (if *rmt:run-flag*
+      (begin
+	(debug:print-warn 0 *default-log-port* "rmt:run already running.")
+	(mutex-unlock! *rmt:run-mutex*))
+      (begin
+	(set! *rmt:run-flag* #t)
+	(mutex-unlock! *rmt:run-mutex*)
+	;;  ;; Configurations for server
+	;;  (tcp-buffer-size 2048)
+	;;  (max-connections 2048) 
+	(debug:print 2 *default-log-port* "PID: "(current-process-id)". Attempting to start the server ...")
+	(if (and *db-serv-info*
+		 (servdat-uconn *db-serv-info*))
+	    (let* ((uconn (servdat-uconn *db-serv-info*)))
+	      (wait-and-close uconn))
+	    (let* ((port            (portlogger:open-run-close portlogger:find-port))
+		   (handler-proc    (lambda (rem-host-port qrykey cmd params) ;;
+				      (set! *db-last-access* (current-seconds))
+				      (assert (list? params) "FATAL: handler called with non-list params")
+				      (assert (args:get-arg "-server") "FATAL: handler called on non-server side. cmd="cmd", params="params)
+				      (debug:print 0 *default-log-port* "handler call: "cmd", params="params)
+				      (api:execute-requests *dbstruct-db* cmd params))))
+	      ;; (api:process-request *dbstuct-db* 
+	      (if (not *db-serv-info*)
+		  (set! *db-serv-info* (make-servdat host: hostn port: port)))
+	      (let* ((uconn (run-listener handler-proc port))
+		     (rport (udat-port uconn))) ;; the real port
+		(servdat-host-set! *db-serv-info* hostn)
+		(servdat-port-set! *db-serv-info* rport)
+		(servdat-uconn-set! *db-serv-info* uconn)
+		(wait-and-close uconn)
+		(db:print-current-query-stats)
+		)))
+	(let* ((host (servdat-host *db-serv-info*))
+	       (port (servdat-port *db-serv-info*))
+	       (mode (or (servdat-mode *db-serv-info*)
+			 "non-db")))
+	  ;; server exit stuff here
+	  ;; (rmt:server-shutdown host port) - always do in on-exit
+	  ;; (portlogger:open-run-close portlogger:set-port port "released") ;; moved to on-exit 
+	  (debug:print-info 0 *default-log-port* "Server "host":"port" mode "mode"shutdown complete. Exiting")
+	  ))))
+
+;;======================================================================
+;; S E R V E R   U T I L I T I E S 
+;;======================================================================
+
+
+;;======================================================================
+;; NEW SERVER METHOD
+;;======================================================================
+
+;; only use for main.db - need to re-write some of this :(
+;;
+(define (get-lock-db sdat dbfile host port)
+  (assert host "FATAL: get-lock-db called with host not set.")
+  (assert port "FATAL: get-lock-db called with port not set.")
+  (let* ((dbh (db:open-run-db dbfile db:initialize-db)) ;; open-run-db creates a standard db with schema used by all situations
+	 (res (db:get-iam-server-lock dbh dbfile host port))
+	 (uconn (servdat-uconn sdat)))
+    ;; res => list then already locked, check server is responsive
+    ;;     => #t then sucessfully got the lock
+    ;;     => #f reserved for future use as to indicate something went wrong
+    (match res
+      ((owner_pid owner_host owner_port event_time)
+       (if (server-ready? uconn (conc owner_host":"owner_port) "abc")
+	   #f      ;; locked by someone else
+	   (begin  ;; locked by someone dead and gone
+	     (debug:print 0 *default-log-port* "WARNING: stale lock - have to steal it. This may fail.")
+	     (db:steal-lock-db dbh dbfile port))))
+      (#t  #t) ;; placeholder so that we don't touch res if it is #t
+      (else (set! res #f)))
+    (sqlite3:finalize! dbh)
+    res))
+
+
+(define (register-server pkts-dir pkt-spec host port servkey ipaddr dbpath)
+  (let* ((pkt-dat `((host    . ,host)
+		    (port    . ,port)
+		    (servkey . ,servkey)
+		    (pid     . ,(current-process-id))
+		    (ipaddr  . ,ipaddr)
+		    (dbpath  . ,dbpath)))
+	 (uuid    (write-alist->pkt
+		   pkts-dir
+		   pkt-dat
+		   pktspec: pkt-spec
+		   ptype: 'server)))
+    (debug:print 0 *default-log-port* "Server on "host":"port" registered in pkt "uuid)
+    uuid))
+
+(define (get-pkts-dir #!optional (apath #f))
+  (let* ((effective-toppath (or *toppath* apath)))
+    (assert effective-toppath
+	    "ERROR: get-pkts-dir called without *toppath* set. Exiting.")
+    (let* ((pdir (conc effective-toppath "/.meta/srvpkts")))
+      (if (file-exists? pdir)
+	  pdir
+	  (begin
+	    (handle-exceptions ;; this exception handler should NOT be needed but ...
+		exn
+		pdir
+	      (create-directory pdir #t))
+	    pdir)))))
+
+;; given a pkts dir read 
+;;
+(define (get-all-server-pkts pktsdir-in pktspec)
+  (let* ((pktsdir  (if (file-exists? pktsdir-in)
+		       pktsdir-in
+		       (begin
+			 (create-directory pktsdir-in #t)
+			 pktsdir-in)))
+	 (all-pkt-files (glob (conc pktsdir "/*.pkt"))))
+    (map (lambda (pkt-file)
+	   (read-pkt->alist pkt-file pktspec: pktspec))
+	 all-pkt-files)))
+
+(define (server-address srv-pkt)
+  (conc (alist-ref 'host srv-pkt) ":"
+	(alist-ref 'port srv-pkt)))
+	
+(define (server-ready? uconn host-port key) ;; server-address is host:port
+  (let* ((params `((cmd . ping)(key . ,key)))
+	 (data `((cmd . ping)
+		 (key . ,key)
+		 (params . ,params))) ;; I don't get it.
+	 (res  (send-receive uconn host-port 'ping data)))
+    (if (eq? res 'ack) ;; yep, likely it is who we want on the other end
+	res
+	#f)))
+;; (begin (debug:print-info 0 *default-log-port* "server-ready? => "res) #f))))
+
+; from the pkts return servers associated with dbpath
+;; NOTE: Only one can be alive - have to check on each
+;;       in the list of pkts returned
+;;
+(define (get-viable-servers serv-pkts dbpath)
+  (let loop ((tail serv-pkts)
+	     (res  '()))
+    (if (null? tail)
+	res ;; NOTE: sort by age so oldest is considered first
+	(let* ((spkt (car tail)))
+	  (loop (cdr tail)
+		(if (equal? dbpath (alist-ref 'dbpath spkt))
+		    (cons spkt res)
+		    res))))))
+
+(define (remove-pkts-if-not-alive uconn serv-pkts)
+  (filter (lambda (pkt)
+	    (let* ((host (alist-ref 'host pkt))
+		   (port (alist-ref 'port pkt))
+		   (host-port (conc host":"port))
+		   (key  (alist-ref 'servkey  pkt))
+		   (pktz (alist-ref 'Z        pkt))
+		   (res  (server-ready? uconn host-port key)))
+	      (if res
+		  res
+		  (let* ((pktsdir (get-pkts-dir *toppath*))
+			 (pktpath (conc pktsdir"/"pktz".pkt")))
+		    (debug:print 0 *default-log-port* "WARNING: pkt with no server "pktpath)
+		    (delete-file* pktpath)
+		    #f))))
+	  serv-pkts))
+
+;; from viable servers get one that is alive and ready
+;;
+(define (get-the-server uconn apath serv-pkts)
+  (let loop ((tail serv-pkts))
+    (if (null? tail)
+	#f
+	(let* ((spkt  (car tail))
+	       (host  (alist-ref 'ipaddr spkt))
+	       (port  (alist-ref 'port spkt))
+	       (host-port (conc host":"port))
+	       (dbpth (alist-ref 'dbpath spkt))
+	       (srvkey (alist-ref 'Z spkt)) ;; (alist-ref 'srvkey spkt))
+	       (addr  (server-address spkt)))
+	  (if (server-ready? uconn host-port srvkey)
+	      spkt
+	      (loop (cdr tail)))))))
+
+;; am I the "first" in line server? I.e. my D card is smallest
+;; use Z card as tie breaker
+;;
+(define (get-best-candidate serv-pkts dbpath)
+  (if (null? serv-pkts)
+      #f
+      (let loop ((tail serv-pkts)
+		 (best  (car serv-pkts)))
+	(if (null? tail)
+	    best
+	    (let* ((candidate (car tail))
+		   (candidate-bd (string->number (alist-ref 'D candidate)))
+		   (best-bd      (string->number (alist-ref 'D best)))
+		   ;; bigger number is younger
+		   (candidate-z  (alist-ref 'Z candidate))
+		   (best-z       (alist-ref 'Z best))
+		   (new-best     (cond
+				  ((> best-bd candidate-bd) ;; best is younger than candidate
+				   candidate)
+				  ((< best-bd candidate-bd) ;; candidate is younger than best
+				   best)
+				  (else
+				   (if (string>=? best-z candidate-z)
+				       best
+				       candidate))))) ;; use Z card as tie breaker
+	      (if (null? tail)
+		  new-best
+		  (loop (cdr tail) new-best)))))))
+	  
+
+;;======================================================================
+;; END NEW SERVER METHOD
+;;======================================================================
+
+;; if .db/main.db check the pkts
+;; 
+(define (rmt:wait-for-server pkts-dir db-file server-key)
+  (let* ((sdat *db-serv-info*))
+    (let loop ((start-time (current-seconds))
+	       (changed    #t)
+	       (last-sdat  "not this"))
+      (begin ;; let ((sdat #f))
+	(thread-sleep! 0.01)
+	(debug:print-info 0 *default-log-port* "Waiting for server alive signature")
+	(mutex-lock! *heartbeat-mutex*)
+	(set! sdat *db-serv-info*)
+	(mutex-unlock! *heartbeat-mutex*)
+	(if (and sdat
+		 (not changed)
+		 (> (- (current-seconds) start-time) 2))
+	    (let* ((uconn (servdat-uconn sdat)))
+	      (servdat-status-set! sdat 'iface-stable)
+	      (debug:print-info 0 *default-log-port* "Received server alive signature, now attempting to lock in server")
+	      ;; create a server pkt in *toppath*/.meta/srvpkts
+	      
+	      ;; TODO:
+	      ;;   1. change sdat to stuct
+	      ;;   2. add uuid to struct
+	      ;;   3. update uuid in sdat here
+	      ;;
+	      (servdat-uuid-set! sdat
+				 (register-server
+				  pkts-dir *srvpktspec*
+				  (get-host-name)
+				  (servdat-port sdat) server-key
+				  (servdat-host sdat) db-file))
+	      ;; (set! *my-signature* (servdat-uuid sdat)) ;; replace with Z, no, stick with proper key
+	      ;; now read pkts and see if we are a contender
+	      (let* ((all-pkts     (get-all-server-pkts pkts-dir *srvpktspec*))
+		     (viables      (get-viable-servers all-pkts db-file))
+		     (alive        (remove-pkts-if-not-alive uconn viables))
+		     (best-srv     (get-best-candidate alive db-file))
+		     (best-srv-key (if best-srv (alist-ref 'servkey best-srv) #f))
+		     (i-am-srv     (equal? best-srv-key server-key))
+		     (delete-pkt   (lambda ()
+				     (let* ((pktfile (conc (get-pkts-dir *toppath*)
+							 "/" (servdat-uuid *db-serv-info*)
+							 ".pkt")))
+				       (debug:print-info 0 *default-log-port* "Attempting to remove bogus pkt file "pktfile)
+				       (delete-file* pktfile))))) ;; remove immediately instead of waiting for on-exit
+		(debug:print 0 *default-log-port* "best-srv-key: "best-srv-key", server-key: "server-key", i-am-srv: "i-am-srv)
+		;; am I the best-srv, compare server-keys to know
+		(if i-am-srv
+		    (if (get-lock-db sdat db-file (servdat-host sdat)(servdat-port sdat)) ;; (db:get-iam-server-lock *dbstruct-db* *toppath* run-id)
+			(begin
+			  (debug:print-info 0 *default-log-port* "I'm the server!")
+			  (servdat-dbfile-set! sdat db-file)
+			  (servdat-status-set! sdat 'db-locked))
+			(begin
+			  (debug:print-info 0 *default-log-port* "I'm not the server, exiting.")
+			  (bdat-time-to-exit-set! *bdat* #t)
+			  (delete-pkt)
+			  (thread-sleep! 0.2)
+			  (exit)))
+		    (begin
+		      (debug:print-info 0 *default-log-port*
+				   "Keys do not match "best-srv-key", "server-key", exiting.")
+		      (bdat-time-to-exit-set! *bdat* #t)
+		      (delete-pkt)
+		      (thread-sleep! 0.2)
+		      (exit)))
+		sdat))
+	    (begin ;; sdat not yet contains server info
+	      (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat)
+	      (sleep 4)
+	      (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
+		  (begin
+		    (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server")
+		    (exit))
+		  (loop start-time
+			(equal? sdat last-sdat)
+			sdat))))))))
+
+(define (rmt:register-server sinfo apath iface port server-key dbname)
+  (servdat-conns sinfo) ;; just checking types
+  (rmt:open-main-connection sinfo apath) ;; we need a channel to main.db
+  (rmt:send-receive-real sinfo apath      ;; params: host port servkey pid ipaddr dbpath
+			 (db:run-id->dbname #f)
+			 'register-server `(,iface
+					    ,port
+					    ,server-key
+					    ,(current-process-id)
+					    ,iface
+					    ,apath
+					    ,dbname)))
+
+(define (rmt:get-count-servers sinfo apath)
+  (servdat-conns sinfo) ;; just checking types
+  (rmt:open-main-connection sinfo apath) ;; we need a channel to main.db
+  (rmt:send-receive-real sinfo apath      ;; params: host port servkey pid ipaddr dbpath
+			 (db:run-id->dbname #f)
+			 'get-count-servers `(,apath)))
+
+(define (rmt:get-servers-info apath)
+  (rmt:send-receive 'get-servers-info #f `(,apath)))
+
+(define (rmt:deregister-server db-serv-info apath iface port server-key dbname)
+  (rmt:open-main-connection db-serv-info apath) ;; we need a channel to main.db
+  (rmt:send-receive-real db-serv-info apath      ;; params: host port servkey pid ipaddr dbpath
+                         (db:run-id->dbname #f)
+                         'deregister-server `(,iface
+                                              ,port
+                                              ,server-key
+                                              ,(current-process-id)
+                                              ,iface
+                                              ,apath
+                                              ,dbname)))
+
+(define (rmt:wait-for-stable-interface #!optional (num-tries-allowed 100))
+  ;; wait until *db-serv-info* stops changing
+  (let* ((stime (current-seconds)))
+    (let loop ((last-host  #f)
+	       (last-port  #f)
+	       (tries 0))
+      (let* ((curr-host (and *db-serv-info* (servdat-host *db-serv-info*)))
+	     (curr-port (and *db-serv-info* (servdat-port *db-serv-info*))))
+	;; first we verify port and interface, update *db-serv-info* in need be.
+	(cond
+	 ((> tries num-tries-allowed)
+	  (debug:print 0 *default-log-port* "rmt:keep-running, giving up after trying for several minutes.")
+	  (exit 1))
+	 ((not *db-serv-info*)
+	  (thread-sleep! 0.25)
+	  (loop curr-host curr-port (+ tries 1)))
+	 ((or (not last-host)(not last-port))
+	  (debug:print 0 *default-log-port* "rmt:keep-running, still no interface, tries="tries)
+	  (thread-sleep! 0.25)
+	  (loop curr-host curr-port (+ tries 1)))
+	 ((or (not (equal? last-host curr-host))
+	      (not (equal? last-port curr-port)))
+	  (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info")
+	  (thread-sleep! 0.25)
+	  (loop curr-host curr-port (+ tries 1)))
+	 ((< (- (current-seconds) stime) 1) ;; keep up the looping until at least 3 seconds have passed
+	  (thread-sleep! 0.5)
+	  (loop curr-host curr-port (+ tries 1)))
+	 (else
+	  (rmt:get-signature) ;; sets *my-signature* as side effect
+	  (servdat-status-set! *db-serv-info* 'interface-stable)
+	  (debug:print 0 *default-log-port*
+		       "SERVER STARTED: " curr-host
+		       ":" curr-port
+		       " AT " (current-seconds) " server signature: " *my-signature*
+		       " with "(servdat-trynum *db-serv-info*)" port changes")
+	  (flush-output *default-log-port*)
+	  #t))))))
+
+;; run rmt:keep-running in a parallel thread to monitor that the db is being 
+;; used and to shutdown after sometime if it is not.
+;;
+(define (rmt:keep-running dbname) 
+  ;; if none running or if > 20 seconds since 
+  ;; server last used then start shutdown
+  ;; This thread waits for the server to come alive
+  (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server")
+
+  (let* ((sinfo             *db-serv-info*)
+	 (server-start-time (current-seconds))
+	 (pkts-dir          (get-pkts-dir))
+	 (server-key        (rmt:get-signature)) ;; This servers key
+	 (is-main           (equal? (args:get-arg "-db") ".db/main.db"))
+	 (last-access       0)
+	 (server-timeout    (server:expiration-timeout))
+	 (shutdown-server-sequence (lambda (host port)
+				     (set! *unclean-shutdown* #f) ;; Should not be needed anymore
+				     (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id))
+				     ;; (rmt:server-shutdown host port) -- called in on-exit
+				     ;; (portlogger:open-run-close portlogger:set-port port "released") called in on-exit
+				     (exit)))
+	 (timed-out?        (lambda ()
+			      (<= (+ last-access server-timeout)
+				  (current-seconds)))))
+    (servdat-dbfile-set! *db-serv-info* (args:get-arg "-db"))
+    ;; main and run db servers have both got wait logic (could/should merge it)
+    (if is-main
+	(rmt:wait-for-server pkts-dir dbname server-key)
+	(rmt:wait-for-stable-interface))
+    ;; this is our forever loop
+    (let* ((iface (servdat-host *db-serv-info*))
+	   (port  (servdat-port *db-serv-info*))
+	   (uconn (servdat-uconn *db-serv-info*)))
+      (let loop ((count          0)
+		 (bad-sync-count 0)
+		 (start-time     (current-milliseconds)))
+	(if (and (not is-main)
+		 (common:low-noise-print 60 "servdat-status"))
+	    (debug:print-info 0 *default-log-port* "servdat-status is " (servdat-status *db-serv-info*)))
+
+	(mutex-lock! *heartbeat-mutex*)
+	;; set up the database handle
+	(if (not *dbstruct-db*) ;; no db opened yet, open the db and register with main if appropriate
+	      (let ((watchdog (bdat-watchdog *bdat*)))
+		(debug:print 0 *default-log-port* "SERVER: dbprep")
+		(db:setup dbname) ;; sets *dbstruct-db* as side effect
+		(servdat-status-set! *db-serv-info* 'db-opened)
+		;; IFF I'm not main, call into main and register self
+		(if (not is-main)
+		    (let ((res (rmt:register-server sinfo
+						    *toppath* iface port
+						    server-key dbname)))
+		      (if res ;; we are the server
+			  (servdat-status-set! *db-serv-info* 'have-interface-and-db)
+			  ;; now check that the db locker is alive, clear it out if not
+			  (let* ((serv-info (rmt:server-info *toppath* dbname)))
+			    (match serv-info
+				   ((host port servkey pid ipaddr apath dbpath)
+				    (if (not (server-ready? uconn (conc host":"port) servkey))
+					(begin
+					  (debug:print-info 0 *default-log-port* "Server registered but not alive. Removing and trying again.")
+					  (rmt:deregister-server sinfo apath host port servkey dbpath) ;; servkey pid ipaddr apath dbpath)
+					  (loop (+ count 1) bad-sync-count start-time))))
+				   (else
+				    (debug:print 0 *default-log-port* "We are not the server for "dbname", exiting. Server info is: "serv-info)
+				    (exit)))))))
+		(debug:print 0 *default-log-port*
+			     "SERVER: running, db "dbname" opened, megatest version: "
+			   (common:get-full-version))
+	      ;; start the watchdog
+
+	      ;; is this really needed?
+	      
+	      #;(if watchdog
+		  (if (not (member (thread-state watchdog)
+				   '(ready running blocked
+					   sleeping dead)))
+		      (begin
+			(debug:print-info 0 *default-log-port* "Starting watchdog thread (in state "(thread-state watchdog)")")
+			(thread-start! watchdog))
+		      (debug:print-info 0 *default-log-port* "Not starting watchdog thread (in state "(thread-state watchdog)")"))
+		  (debug:print 0 *default-log-port* "ERROR: *watchdog* not setup, cannot start it."))
+	      #;(loop (+ count 1) bad-sync-count start-time)
+	      ))
+	
+	(db:sync-inmem->disk *dbstruct-db* *toppath* dbname force-sync: #t)
+	
+	(mutex-unlock! *heartbeat-mutex*)
+	
+	;; when things go wrong we don't want to be doing the various
+	;; queries too often so we strive to run this stuff only every
+	;; four seconds or so.
+	(let* ((sync-time (- (current-milliseconds) start-time))
+	       (rem-time  (quotient (- 4000 sync-time) 1000)))
+	  (if (and (<= rem-time 4)
+		   (>  rem-time 0))
+	      (thread-sleep! rem-time)))
+    
+	;; Transfer *db-last-access* to last-access to use in checking that we are still alive
+	(set! last-access *db-last-access*)
+	
+	(if (< count 1) ;; 3x3 = 9 secs aprox
+	    (loop (+ count 1) bad-sync-count (current-milliseconds)))
+	
+	(if (common:low-noise-print 60 "dbstats")
+	    (begin
+	      (debug:print 0 *default-log-port* "Server stats:")
+	      (db:print-current-query-stats)))
+	(let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600)))
+	  (cond
+	   ((not *server-run*)
+	    (debug:print-info 0 *default-log-port* "*server-run* set to #f. Shutting down.")
+	    (shutdown-server-sequence (get-host-name) port))
+	   ((timed-out?)
+	    (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access))
+	    (shutdown-server-sequence (get-host-name) port))
+	   ((and *server-run*
+		 (or (not (timed-out?))
+		     (if is-main ;; do not exit if there are other servers (keep main open until all others gone)
+			 (> (rmt:get-count-servers sinfo *toppath*) 1)
+			 #f)))
+	    (if (common:low-noise-print 120 "server continuing")
+		(debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)))
+	    (loop 0 bad-sync-count (current-milliseconds)))
+	   (else
+	    (set! *unclean-shutdown* #f)
+	    (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access))
+	    (shutdown-server-sequence (get-host-name) port)
+	    #;(debug:print-info 0 *default-log-port* "Sending 'quit to server, received: "
+			      (open-send-receive-nn (conc iface":"port)      ;; do this here and not in server-shutdown
+						    (sexpr->string 'quit))))))))))
+
+(define (rmt:get-reasonable-hostname)
+  (let* ((inhost (or (args:get-arg "-server") "-")))
+    (if (equal? inhost "-")
+	(get-host-name)
+	inhost)))
+
+;; Call this to start the actual server
+;;
+;; all routes though here end in exit ...
+;;
+;; This is the point at which servers are started
+;;
+(define (rmt:server-launch dbname)
+  (debug:print-info 0 *default-log-port* "Entered rmt:server-launch")
+  (let* ((th2 (make-thread (lambda ()
+			     (debug:print-info 0 *default-log-port* "Server run thread started")
+			     (rmt:run (rmt:get-reasonable-hostname)))
+			   "Server run"))
+	 (th3 (make-thread (lambda ()
+			     (debug:print-info 0 *default-log-port* "Server monitor thread started")
+			     (if (args:get-arg "-server")
+				 (rmt:keep-running dbname)))
+			     "Keep running")))
+    (thread-start! th2)
+    (thread-sleep! 0.252) ;; give the server time to settle before starting the keep-running monitor.
+    (thread-start! th3)
+    (set! *didsomething* #t)
+    (thread-join! th2)
+    (thread-join! th3))
+  #f)
+	    
+;;======================================================================
+;;  S E R V E R   -  D I R E C T   C A L L S
+;;======================================================================
+
+(define (rmt:kill-server run-id)
+  (rmt:send-receive 'kill-server #f (list run-id)))
+
+(define (rmt:start-server run-id)
+  (rmt:send-receive 'start-server #f (list run-id)))
+
+(define (rmt:server-info apath dbname)
+  (rmt:send-receive 'get-server-info #f (list apath dbname)))
+
+;;======================================================================
+;; Nanomsg transport
+;;======================================================================
+
+#;(define (is-port-in-use port-num)
+  (let* ((ret #f))
+    (let-values (((inp oup pid)
+		  (process "netstat" (list  "-tulpn" ))))
+      (let loop ((inl (read-line inp)))
+        (if (not (eof-object? inl))
+            (begin 
+	      (if (string-search (regexp (conc ":" port-num)) inl)
+		  (begin
+					;(print "Output: "  inl)
+		    (set! ret  #t))
+		  (loop (read-line inp)))))))
+    ret))
+
+#;(define (open-nn-connection host-port)
+  (let ((req  (make-req-socket))
+        (uri  (conc "tcp://" host-port)))
+    (nng-dial req uri)
+    (socket-set! req 'nng/recvtimeo 2000)
+    req))
+
+#;(define (send-receive-nn req msg)
+  (nng-send req msg)
+  (nng-recv req))
+
+#;(define (close-nn-connection req)
+  (nng-close! req))
+  
+;; ;; open connection to server, send message, close connection
+;; ;;
+;; (define (open-send-close-nn host-port msg #!key (timeout 3) ) ;; default timeout is 3 seconds
+;;   (let ((req  (make-req-socket 'req))
+;;         (uri  (conc "tcp://" host-port))
+;;         (res  #f)
+;;         ;; (contacts (alist-ref 'contact attrib))
+;;         ;; (mode (alist-ref 'mode attrib))
+;; 	)
+;;     (socket-set! req 'nng/recvtimeo 2000)
+;;     (handle-exceptions
+;;      exn
+;;      (let ((emsg ((condition-property-accessor 'exn 'message) exn)))
+;;        ;; Send notification       
+;;        (debug:print 0 *default-log-port* "ERROR: Failed to connect / send to " uri " message was \"" emsg "\"" )
+;;        #f)
+;;      (nng-dial req uri)
+;;      ;; (print "Connected to the server " )
+;;      (nng-send req msg)
+;;      ;; (print "Request Sent")  
+;;      (let* ((th1  (make-thread (lambda ()
+;;                                  (let ((resp (nng-recv req)))
+;;                                    (nng-close! req)
+;;                                    (set! res (if (equal? resp "ok")
+;;                                                  #t
+;;                                                  #f))))
+;;                                "recv thread"))
+;;             (th2 (make-thread (lambda ()
+;;                                 (thread-sleep! timeout)
+;;                                 (thread-terminate! th1))
+;; 			      "timer thread")))
+;;        (thread-start! th1)
+;;        (thread-start! th2)
+;;        (thread-join! th1)
+;;        res))))
+;; 
+#;(define (open-send-receive-nn host-port msg #!key (timeout 3) ) ;; default timeout is 3 seconds
+  (let ((req  (make-req-socket))
+        (uri  (conc "tcp://" host-port))
+        (res  #f)) 
+    (handle-exceptions
+     exn
+     (let ((emsg ((condition-property-accessor 'exn 'message) exn)))
+       ;; Send notification      
+       (debug:print 0 *default-log-port* "ERROR: Failed to connect / send to " uri " message was \"" emsg "\", exn=" exn)
+       #f)
+     (nng-dial req uri)
+     (nng-send req msg)
+     (let* ((th1  (make-thread (lambda ()
+                                 (let ((resp (nng-recv req)))
+                                   (nng-close! req)
+                                   ;; (print resp)
+                                   (set! res resp)))
+                               "recv thread"))
+            (th2 (make-thread (lambda ()
+                                (thread-sleep! timeout)
+                                (thread-terminate! th1))
+                             "timer thread")))
+       (thread-start! th1)
+       (thread-start! th2)
+       (thread-join! th1)
+       res))))
+
+;;======================================================================
+;; S E R V E R   U T I L I T I E S 
+;;======================================================================
+
+;; run ping in separate process, safest way in some cases
+;;
+#;(define (server:ping-server ifaceport)
+  (with-input-from-pipe 
+   (conc (common:get-megatest-exe) " -ping " ifaceport)
+   (lambda ()
+     (let loop ((inl (read-line))
+		(res "NOREPLY"))
+       (if (eof-object? inl)
+	   (case (string->symbol res)
+	     ((NOREPLY)  #f)
+	     ((LOGIN_OK) #t)
+	     (else       #f))
+	   (loop (read-line) inl))))))
+
+;; NOT USED (well, ok, reference in rpc-transport but otherwise not used).
+;;
+#;(define (server:login toppath)
+  (lambda (toppath)
+    (set! *db-last-access* (current-seconds)) ;; might not be needed.
+    (if (equal? *toppath* toppath)
+	#t
+	#f)))
+
+;; (define server:sync-lock-token "SERVER_SYNC_LOCK")
+;; (define (server:release-sync-lock)
+;;   (db:no-sync-del! *no-sync-db* server:sync-lock-token))
+;; (define (server:have-sync-lock?)
+;;   (let* ((have-lock-pair (db:no-sync-get-lock *no-sync-db* server:sync-lock-token))
+;;          (have-lock?     (car have-lock-pair))
+;;          (lock-time      (cdr have-lock-pair))
+;;          (lock-age       (- (current-seconds) lock-time)))
+;;     (cond
+;;      (have-lock? #t)
+;;      ((>lock-age
+;;        (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180)))
+;;       (server:release-sync-lock)
+;;       (server:have-sync-lock?))
+;;      (else #f))))
+
+)

Index: ulex/ulex.scm
==================================================================
--- ulex/ulex.scm
+++ ulex/ulex.scm
@@ -1,8 +1,8 @@
 ;; ulex: Distributed sqlite3 db
 ;;;
-;; Copyright (C) 2018 Matt Welland
+;; Copyright (C) 2018-2021 Matt Welland
 ;; Redistribution and use in source and binary forms, with or without
 ;; modification, is permitted.
 ;;
 ;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 ;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
@@ -23,330 +23,521 @@
 ;; NOTES:
 ;;   Why sql-de-lite and not say, dbi?  - performance mostly, then simplicity.
 ;;
 ;;======================================================================
 
-(use mailbox)
-
-(module ulex
- *
-
-(import scheme posix chicken data-structures ports extras files mailbox)
-(import srfi-18 pkts matchable regex
-	typed-records srfi-69 srfi-1
-	srfi-4 regex-case
-	(prefix sqlite3 sqlite3:)
-	foreign
-	tcp6
-	;; ulex-netutil
-	hostinfo
-	)
-
-;; make it a global? Well, it is local to area module
-
-(define *captain-pktspec*
-  `((captain (host     . h)
-	     (port     . p)
-	     (pid      . i)
-	     (ipaddr   . a)
-	     )
-    #;(data   (hostname . h)  ;; sender hostname
-	    (port     . p)  ;; sender port
-	    (ipaddr   . a)  ;; sender ip
-	    (hostkey  . k)  ;; sending host key - store info at server under this key
-	    (servkey  . s)  ;; server key - this needs to match at server end or reject the msg
-	    (format   . f)  ;; sb=serialized-base64, t=text, sx=sexpr, j=json
-	    (data     . d)  ;; base64 encoded slln data
-	    )))
-
-;; struct for keeping track of our world
-
-(defstruct udat
-  ;; captain info
-  (captain-address #f)
-  (captain-host    #f)
-  (captain-port    #f)
-  (captain-pid     #f)
-  (captain-lease   0)    ;; time (unix epoc) seconds when the lease is up
-  (ulex-dir        (conc (get-environment-variable "HOME") "/.ulex"))
-  (cpkts-dir       (conc (get-environment-variable "HOME") "/.ulex/pkts"))
-  (cpkt-spec       *captain-pktspec*)
-  ;; this processes info
-  (my-cpkt-key     #f)   ;; put Z card here when I create a pkt for myself as captain
-  (my-address      #f)
-  (my-hostname     #f)
-  (my-port         #f)
-  (my-pid          (current-process-id))
-  (my-dbs          '())
-  ;; server and handler thread
-  (serv-listener   #f)                 ;; this processes server info
-  (handler-thread  #f)
-  (mboxes          (make-hash-table))  ;; key => mbox
-  ;; other servers
-  (peers           (make-hash-table))  ;; host-port => peer record
-  (dbowners        (make-hash-table))  ;; dbfile => host-port
-  (handlers        (make-hash-table))  ;; dbfile => proc
-  ;; (outgoing-conns  (make-hash-table))  ;; host:port -> conn
-  (work-queue      (make-queue))       ;; most stuff goes here
-  ;; (fast-queue      (make-queue))       ;; super quick stuff goes here (e.g. ping)
-  (busy            #f)                 ;; is either of the queues busy, use to switch between queuing tasks or doing immediately
-  ;; app info
-  (appname         #f)
-  (dbtypes         (make-hash-table))  ;; this should be an alist but hash is easier. dbtype => [ initproc syncproc ]
-  ;; cookies
-  (cnum            0) ;; cookie num
-  )
-
-;;======================================================================
-;; NEW APPROACH
-;;======================================================================
-
-;;  start-server-find-port  ;; gotta have a server port ready from the very begining
-
-;; udata    - all the connection info, captain, server, ulex db etc. MUST BE PASSED IN
-;; dbpath   - full path and filename of the db to talk to or a symbol naming the db?
-;; callname - the remote call to execute
-;; params   - parameters to pass to the remote call
-;;
-(define (remote-call udata dbpath dbtype callname . params)
-  (start-server-find-port udata) ;; ensure we have a local server
-  (find-or-setup-captain udata)
-  ;; look at connect, process-request, send, send-receive
-  (let-values (((cookie-key host-port)(get-db-owner udata dbpath dbtype)))
-    (send-receive udata host-port callname cookie-key params)))
-
-;;======================================================================
-;; KEY FUNCTIONS - THESE ARE TOO BE EXPOSED AND USED
-;;======================================================================
-
-;; connection setup and management functions
-
-;; This is the basic setup command. Must always be
-;; called before connecting to a db using connect.
-;;
-;; find or become the captain
-;; setup and return a ulex object
-;;
-(define (find-or-setup-captain udata)
-  ;; see if we already have a captain and if the lease is ok
-  (if (and (udat-captain-address udata)
-	   (udat-captain-port    udata)
-	   (< (current-seconds) (udat-captain-lease udata)))
-      udata
-      (let* ((cpkts (get-all-captain-pkts udata)) ;; read captain pkts
-	     (captn (get-winning-pkt cpkts)))
-	(if captn
-	    (let* ((port   (alist-ref 'port   captn))
-		   (host   (alist-ref 'host   captn))
-		   (ipaddr (alist-ref 'ipaddr captn))
-		   (pid    (alist-ref 'pid    captn))
-		   (Z      (alist-ref 'Z      captn)))
-	      (udat-captain-address-set! udata ipaddr)
-	      (udat-captain-host-set!    udata host)
-	      (udat-captain-port-set!    udata port)
-	      (udat-captain-pid-set!     udata pid)
-	      (udat-captain-lease-set!   udata (+ (current-seconds) 10))
-	      (let-values (((success pingtime)(ping udata (conc ipaddr ":" port))))
-		(if success
-		    udata
-		    (begin
-		      (print "Found unreachable captain at " ipaddr ":" port ", removing pkt")
-		      (remove-captain-pkt udata captn)
-		      (find-or-setup-captain udata))))
-	      (begin
-		(setup-as-captain udata)  ;; this saves the thread to captain-thread and starts the thread
-		(find-or-setup-captain udata)))))))
-
-;; connect to a specific dbfile
-;;   - if already connected - return the dbowner host-port
-;;   - ask the captain who to talk to for this db
-;;   - put the entry in the dbowners hash as dbfile => host-port
-;;
-(define (connect udata dbfname dbtype)
-  (or (hash-table-ref/default (udat-dbowners udata) dbfname #f)
-      (let-values (((success dbowner-host-port)(get-db-owner udata dbfname dbtype)))
-	(if success
-	    (begin
-	      ;; just clobber the record, this is the new data no matter what
-	      (hash-table-set! (udat-dbowners udata) dbfname dbowner-host-port)
-	      dbowner-host-port)
-	    #f))))
-
-;; returns: success pingtime
-;;
-;; NOTE: causes the callee to store the info on this host along with the dbs this host currently owns
-;;
-(define (ping udata host-port)
-  (let* ((start  (current-milliseconds))
-	 (cookie (make-cookie udata))
-	 (dbs    (udat-my-dbs udata))
-	 (msg    (string-intersperse dbs " "))
-	 (res (send udata host-port 'ping cookie msg retval: #t))
-	 (delta (- (current-milliseconds) start)))
-    (values (equal? res cookie) delta)))
-
-;; returns: success pingtime
-;;
-;; NOTE: causes all references to this worker to be wiped out in the
-;; callee (ususally the captain)
-;;
-(define (goodbye-ping udata host-port)
-  (let* ((start  (current-milliseconds))
-	 (cookie (make-cookie udata))
-	 (dbs    (udat-my-dbs udata))
-	 (res (send udata host-port 'goodbye cookie "nomsg" retval: #t))
-	 (delta (- (current-milliseconds) start)))
-    (values (equal? res cookie) delta)))
-
-(define (goodbye-captain udata)
-  (let* ((host-port (udat-captain-host-port udata)))
-    (if host-port
-	(goodbye-ping udata host-port)
-	(values #f -1))))
-
-(define (get-db-owner udata dbname dbtype)
-  (let* ((host-port (udat-captain-host-port udata)))
-    (if host-port
-	(let* ((cookie (make-cookie udata))
-	       (msg    #f) ;; (conc dbname " " dbtype))
-	       (params `(,dbname ,dbtype))
-	       (res    (send udata host-port 'db-owner cookie msg
-			     params: params retval: #t)))
-	  (match (string-split res)
-	    ((retcookie owner-host-port)
-	     (values (equal? retcookie cookie) owner-host-port))))
-	(values #f -1))))
-
-;; called in ulex-handler to dispatch work, called on the workers side
-;;     calls (proc params data)
-;;     returns result with cookie
-;;
-;; pdat is the info of the caller, used to send the result data
-;; prockey is key into udat-handlers hash dereferencing a proc
-;; procparam is a first param handed to proc - often to do further derefrencing
-;; NOTE: params is intended to be a list of strings, encoding on data
-;;       is up to the user but data must be a single line
-;;
-(define (process-request udata pdat dbname cookie prockey procparam data)
-  (let* ((dbrec (ulex-open-db udata dbname))     ;; this will be a dbconn record, looks for in udata first
-	 (proc  (hash-table-ref udata prockey)))
-    (let* ((result (proc dbrec procparam data)))
-      result)))
-
-;; remote-request - send to remote to process in process-request
-;; uconn comes from a call to connect and can be used instead of calling connect again
-;; uconn is the host-port to call
-;; we send dbname to the worker so they know which file to open
-;; data must be a string with no newlines, it will be handed to the proc
-;; at the remote site unchanged. It is up to the user to encode/decode it's contents
-;;
-;;   rtype: immediate, read-only, normal, low-priority
-;; 
-(define (remote-request udata uconn rtype dbname prockey procparam data)
-  (let* ((cookie    (make-cookie udata)))
-    (send-receive udata uconn rtype cookie data `(,prockey procparam))))
-
-(define (ulex-open-db udata dbname)
-  #f)
-
-
-;;======================================================================
-;; Ulex db
-;;
-;;   - track who is captain, lease expire time
-;;   - track who owns what db, lease
-;;
-;;======================================================================
-
-;;
-;;
-(define (ulex-dbfname)
-  (let ((dbdir (conc (get-environment-variable "HOME") "/.ulex")))
-    (if (not (file-exists? dbdir))
-	(create-directory dbdir #t))
-    (conc dbdir "/network.db")))
-	 
-;; always goes in ~/.ulex/network.db
-;; role is captain, adjutant, node
-;;
-(define (ulexdb-setup)
-  (let* ((dbfname (ulex-dbfname))
-	 (have-db (file-exists? dbfname))
-	 (db      (sqlite3:open-database dbfname)))
-    (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000))
-    (sqlite3:execute db "PRAGMA synchronous = 0;")
-    (if (not have-db)
-	(sqlite3:with-transaction
-	 db
-	 (lambda ()
-	   (for-each
-	    (lambda (stmt)
-	      (if stmt (sqlite3:execute db stmt)))
-	    `("CREATE TABLE IF NOT EXISTS nodes
-                 (id INTEGER PRIMARY KEY,
-                  role  TEXT NOT NULL,
-                  host  TEXT NOT NULL,
-                  port TEXT NOT NULL,
-                  ipadr TEXT NOT NULL,
-                  pid   INTEGER NOT NULL,
-                  zcard TEXT NOT NULL,
-                  regtime INTEGER DEFAULT (strftime('%s','now')),
-                  lease_thru INTEGER DEFAULT (strftime('%s','now')),
-                  last_update INTEGER DEFAULT (strftime('%s','now')));"
-	      "CREATE TRIGGER  IF NOT EXISTS update_nodes_trigger AFTER UPDATE ON nodes
-                             FOR EACH ROW
-                               BEGIN 
-                                 UPDATE nodes SET last_update=(strftime('%s','now'))
-                                   WHERE id=old.id;
-                               END;"
-	      "CREATE TABLE IF NOT EXISTS dbs
-                 (id INTEGER PRIMARY KEY,
-                  dbname TEXT NOT NULL,
-                  dbfile TEXT NOT NULL,
-                  dbtype TEXT NOT NULL,
-                  host_port TEXT NOT NULL,
-                  regtime INTEGER DEFAULT (strftime('%s','now')),
-                  lease_thru INTEGER DEFAULT (strftime('%s','now')),
-                  last_update INTEGER DEFAULT (strftime('%s','now')));"
-	      "CREATE TRIGGER  IF NOT EXISTS update_dbs_trigger AFTER UPDATE ON dbs
-                             FOR EACH ROW
-                               BEGIN 
-                                 UPDATE dbs SET last_update=(strftime('%s','now'))
-                                   WHERE id=old.id;
-                               END;")))))
-    db))
-
-(define (get-host-port-lease db dbfname)
-  (sqlite3:fold-row
-   (lambda (rem host-port lease-thru)
-     (list host-port lease-thru))
-   #f db "SELECT host_port,lease_thru FROM dbs WHERE dbfile = ?" dbfname))
-  
-(define (register-captain db host ipadr port pid zcard #!key (lease 20))
-  (let* ((dbfname (ulex-dbfname))
-	 (host-port  (conc host ":" port)))
-    (sqlite3:with-transaction
-     db
-     (lambda ()
-       (match (get-host-port-lease db dbfname)
-	 ((host-port lease-thru)
-	  (if (> (current-seconds) lease-thru)
-	      (begin
-		(sqlite3:execute db "UPDATE dbs SET host_port=?,lease_thru=? WHERE dbname=?"
-				 (conc host ":" port)
-				 (+ (current-seconds) lease)
-				 dbfname)
-		#t)
-	      #f))
-	 (#f  (sqlite3:execute db "INSERT INTO dbs (dbname,dbfile,dbtype,host_port,lease_thru) VALUES (?,?,?,?,?)"
-			       "captain" dbfname "captain" host-port (+ (current-seconds) lease)))
-	 (else (print "ERROR: Unrecognised result from fold-row")
-	       (exit 1)))))))
-							    
+(module ulex
+	*
+	#;(
+     
+     ;; NOTE: looking for the handler proc - find the run-listener :)
+     
+     run-listener     ;; (run-listener handler-proc [port]) => uconn
+
+     ;; NOTE: handler-proc params;
+     ;;       (handler-proc rem-host-port qrykey cmd params)
+
+     send-receive     ;; (send-receive uconn host-port cmd data)
+
+     ;; NOTE: cmd can be any plain text symbol except for these;
+     ;;         'ping 'ack 'goodbye 'response
+     
+     set-work-handler ;; (set-work-handler proc)
+
+     wait-and-close   ;; (wait-and-close uconn)
+
+     ulex-listener?
+     
+     ;; needed to get the interface:port that was automatically found
+     udat-port
+     udat-host-port
+     
+     ;; for testing only
+     ;; pp-uconn
+     
+     ;; parameters
+     work-method   ;; parameter; 'threads, 'mailbox, 'limited, 'direct
+     return-method ;; parameter; 'mailbox, 'polling, 'direct
+     )
+
+(import scheme
+	chicken.base
+	chicken.file
+	chicken.io
+	chicken.time
+	chicken.condition
+	chicken.string
+	chicken.sort
+	chicken.pretty-print
+	
+	address-info
+	mailbox
+	matchable
+	;; queues
+	regex
+	regex-case
+	simple-exceptions
+	s11n
+	srfi-1
+	srfi-18
+	srfi-4
+	srfi-69
+	system-information
+	tcp6
+	typed-records
+	)
+
+;; udat struct, used by both caller and callee
+;; instantiated as uconn by convention
+;;
+(defstruct udat
+  ;; the listener side
+  (port #f)
+  (host-port #f)
+  (socket #f)
+  ;; the peers
+  (peers  (make-hash-table)) ;; host:port->peer
+  ;; work handling
+  (work-queue (make-mailbox))
+  (work-proc  #f)                ;; set by user
+  (cnum       0)                 ;; cookie number
+  (mboxes     (make-hash-table)) ;; for the replies
+  (avail-cmboxes '())            ;; list of (<cookie> . <mbox>) for re-use
+  ;; threads
+  (numthreads 10)
+  (cmd-thread #f)
+  (work-queue-thread #f)
+  (num-threads-running 0)
+  ) 
+
+;; Parameters
+
+;; work-method:
+(define work-method (make-parameter 'mailbox))
+;;    mailbox - all rdat goes through mailbox
+;;    threads - all rdat immediately executed in new thread
+;;    direct  - no queuing
+;;
+
+;; return-method, return the result to waiting send-receive:
+(define return-method (make-parameter 'mailbox))
+;;    mailbox - create a mailbox and use it for passing returning results to send-receive
+;;    polling - put the result in a hash table keyed by qrykey and send-receive can poll it for result
+;;    direct  - no queuing, result is passed back in single tcp connection
+;;
+
+;; ;; struct for keeping track of others we are talking to
+;; ;;
+;; (defstruct pdat
+;;   (host-port  #f)
+;;   (conns      '()) ;; list of pcon structs, pop one off when calling the peer
+;;   )
+;; 
+;; ;; struct for peer connections, keep track of expiration etc.
+;; ;;
+;; (defstruct pcon
+;;   (inp #f)
+;;   (oup #f)
+;;   (exp (+ (current-seconds) 59)) ;; expires at this time, set to (+ (current-seconds) 59)
+;;   (lifetime (+ (current-seconds) 600)) ;; throw away and create new after five minutes
+;;   )
+
+;;======================================================================
+;; listener
+;;======================================================================
+
+;; is uconn a ulex connector (listener)
+;;
+(define (ulex-listener? uconn)
+  (udat? uconn))
+
+;; create a tcp listener and return a populated udat struct with
+;; my port, address, hostname, pid etc.
+;; return #f if fail to find a port to allocate.
+;;
+;;  if udata-in is #f create the record
+;;  if there is already a serv-listener return the udata
+;;
+(define (setup-listener uconn #!optional (port 4242))
+  (handle-exceptions
+   exn
+   (if (< port 65535)
+       (setup-listener uconn (+ port 1))
+       #f)
+   (connect-listener uconn port)))
+
+(define (connect-listener uconn port)
+  ;; (tcp-listener-socket LISTENER)(socket-name so)
+  ;; sockaddr-address, sockaddr-port, sockaddr->string
+  (let* ((tlsn (tcp-listen port 1000 #f)) ;; (tcp-listen TCPPORT [BACKLOG [HOST]])
+	 (addr (get-my-best-address))) ;; (hostinfo-addresses (host-information (current-hostname)))
+    (udat-port-set!      uconn port)
+    (udat-host-port-set! uconn (conc addr":"port))
+    (udat-socket-set!    uconn tlsn)
+    uconn))
+
+;; run-listener does all the work of starting a listener in a thread
+;; it then returns control
+;;
+(define (run-listener handler-proc #!optional (port-suggestion 4242))
+  (let* ((uconn (make-udat)))
+    (udat-work-proc-set! uconn handler-proc)
+    (if (setup-listener uconn port-suggestion)
+	(let* ((th1 (make-thread (lambda ()(ulex-cmd-loop uconn)) "Ulex command loop"))
+	       (th2 (make-thread (lambda ()
+				   (case (work-method)
+				     ((mailbox limited)
+				      (process-work-queue uconn))))
+				 "Ulex work queue processor")))
+	  ;; (tcp-buffer-size 2048)
+	  (thread-start! th1)
+	  (thread-start! th2)
+	  (udat-cmd-thread-set! uconn th1)
+	  (udat-work-queue-thread-set! uconn th2)
+	  (print "cmd loop and process workers started, listening on "(udat-host-port uconn)".")
+	  uconn)
+	(assert #f "ERROR: run-listener called without proper setup."))))
+
+(define (wait-and-close uconn)
+  (thread-join! (udat-cmd-thread uconn))
+  (tcp-close (udat-socket uconn)))
+
+;;======================================================================
+;; peers and connections
+;;======================================================================
+
+(define *send-mutex* (make-mutex))
+
+;; send structured data to recipient
+;;
+;;  NOTE: qrykey is what was called the "cookie" previously
+;;
+;;     retval tells send to expect and wait for return data (one line) and return it or time out
+;;       this is for ping where we don't want to necessarily have set up our own server yet.
+;;
+;; NOTE: see below for beginnings of code to allow re-use of tcp connections
+;;        - I believe (without substantial evidence) that re-using connections will
+;;          be beneficial ...
+;;
+(define (send udata host-port qrykey cmd params)
+  (let* ((my-host-port (udat-host-port udata))          ;; remote will return to this
+	 (isme         #f #;(equal? host-port my-host-port)) ;; calling myself?
+	 ;; dat is a self-contained work block that can be sent or handled locally
+	 (dat          (list my-host-port qrykey cmd params #;(cons (current-seconds)(current-milliseconds)))))
+    (cond
+     (isme (ulex-handler udata dat)) ;; no transmission needed
+     (else
+      (handle-exceptions ;; TODO - MAKE THIS EXCEPTION CMD SPECIFIC?
+	  exn
+	  (message exn)
+	(begin
+	  ;; (mutex-lock! *send-mutex*) ;; DOESN'T SEEM TO HELP
+	  (let-values (((inp oup)(tcp-connect host-port)))
+	    (let ((res (if (and inp oup)
+			   (begin
+			     (serialize dat oup)
+			     (close-output-port oup)
+			     (deserialize inp)
+			     )
+			   (begin
+			     (print "ERROR: send called but no receiver has been setup. Please call setup first!")
+			     #f))))
+	      (close-input-port inp)
+	      ;; (mutex-unlock! *send-mutex*) ;; DOESN'T SEEM TO HELP
+	      res)))))))) ;; res will always be 'ack unless return-method is direct
+
+(define (send-via-polling uconn host-port cmd data)
+  (let* ((qrykey (make-cookie uconn))
+	 (sres   (send uconn host-port qrykey cmd data)))
+    (case sres
+      ((ack)
+       (let loop ((start-time (current-milliseconds)))
+	 (if (> (current-milliseconds)(+ start-time 10000)) ;; ten seconds timeout
+	     (begin
+	       (print "ULEX ERROR: timed out waiting for response from "host-port", "cmd" "data)
+	       #f)
+	     (let* ((result (hash-table-ref/default (udat-mboxes uconn) qrykey #f))) ;; NOTE: we are re-using mboxes hash
+	       (if result ;; result is '(status . result-data) or #f for nothing yet
+		   (begin
+		     (hash-table-delete! (udat-mboxes uconn) qrykey)
+		     (cdr result))
+		   (begin
+		     (thread-sleep! 0.01)
+		     (loop start-time)))))))
+      (else
+       (print "ULEX ERROR: Communication failed? sres="sres)
+       #f))))
+
+(define (send-via-mailbox uconn host-port cmd data)
+  (let* ((cmbox     (get-cmbox uconn)) ;; would it be better to keep a stack of mboxes to reuse?
+	 (qrykey    (car cmbox))
+	 (mbox      (cdr cmbox))
+	 (mbox-time (current-milliseconds))
+	 (sres      (send uconn host-port qrykey cmd data))) ;; short res
+    (if (eq? sres 'ack) ;; BUG: change to be less than server:expiration-timeout?
+	(let* ((mbox-timeout-secs    120 #;(if (eq? 'primordial (thread-name (current-thread)))
+				     #f
+				     120)) ;; timeout)
+	       (mbox-timeout-result 'MBOX_TIMEOUT)
+	       (res                  (mailbox-receive! mbox mbox-timeout-secs mbox-timeout-result))
+	       (mbox-receive-time    (current-milliseconds)))
+	  ;; (put-cmbox uconn cmbox) ;; reuse mbox and cookie. is it worth it?
+	  (hash-table-delete! (udat-mboxes uconn) qrykey)
+	  (if (eq? res 'MBOX_TIMEOUT)
+	      (begin
+		(print "WARNING: mbox timed out for query "cmd", with data "data
+		       ", waiting for response from "host-port".")
+
+		;; here it might make sense to clean up connection records and force clean start?
+		;; NO. The progam using ulex needs to do the reset. Right thing here is exception
+		
+		#f)  ;; convert to raising exception?
+	      res))
+	(begin
+	  (print "ERROR: Communication failed? Got "sres)
+	  #f))))
+  
+;; send a request to the given host-port and register a mailbox in udata
+;; wait for the mailbox data and return it
+;;
+(define (send-receive uconn host-port cmd data)
+  (let* ((start-time (current-milliseconds))
+	 (result     (cond
+		      ((member cmd '(ping goodbye)) ;; these are immediate
+		       (send uconn host-port 'ping cmd data))
+		      ((eq? (work-method) 'direct)
+		       ;; the result from send will be the actual result, not an 'ack
+		       (send uconn host-port 'direct cmd data))
+		      (else
+		       (case (return-method)
+			 ((polling)
+			  (send-via-polling uconn host-port cmd data))
+			 ((mailbox) 
+			  (send-via-mailbox uconn host-port cmd data))
+			 (else
+			  (print "ULEX ERROR: unrecognised return-method "(return-method)".")
+			  #f)))))
+	 (duration    (- (current-milliseconds) start-time)))
+    ;; this is ONLY for development and debugging. It will be removed once Ulex is stable.
+    (if (< 5000 duration)
+	(print "ULEX WARNING: round-trip took "(inexact->exact (round (/ duration 1000)))
+	       " seconds; "cmd", host-port="host-port", data="data))
+    result))
+    
+
+;;======================================================================
+;; responder side
+;;======================================================================
+
+;; take a request, rdat, and if not immediate put it in the work queue
+;;
+;; Reserved cmds; ack ping goodbye response
+;;
+(define (ulex-handler uconn rdat)
+  (assert (list? rdat) "FATAL: ulex-handler give rdat as not list")
+  (match rdat ;;  (string-split controldat)
+    ((rem-host-port qrykey cmd params);; timedata)
+     ;; (print "ulex-handler got: "rem-host-port" qrykey: "qrykey" cmd: "cmd" params: "params)
+     (case cmd
+       ;; ((ack )(print "Got ack! But why? Should NOT get here.") 'ack)
+       ((ping)
+	;; (print "Got Ping!")
+	;; (add-to-work-queue uconn rdat)
+	'ack)
+       ((goodbye)
+	;; just clear out references to the caller. NOT COMPLETE
+	(add-to-work-queue uconn rdat)
+	'ack)
+       ((response) ;; this is a result from remote processing, send it as mail ...
+	(case (return-method)
+	  ((polling)
+	   (hash-table-set! (udat-mboxes uconn) qrykey (cons 'ok params))
+	   'ack)
+	  ((mailbox)
+	   (let ((mbox (hash-table-ref/default (udat-mboxes uconn) qrykey #f)))
+	     (if mbox
+		 (begin
+		   (mailbox-send! mbox params) ;; params here is our result
+		   'ack)
+		 (begin
+		   (print "ERROR: received result but no associated mbox for cookie "qrykey)
+		   'no-mbox-found))))
+	  (else (print "ULEX ERROR: unrecognised return-method "(return-method))
+		'bad-return-method)))
+       (else ;; generic request - hand it to the work queue
+	(add-to-work-queue uconn rdat)
+	'ack)))
+    (else
+     (print "ULEX ERROR: bad rdat "rdat)
+     'bad-rdat)))
+
+;; given an already set up uconn start the cmd-loop
+;;
+(define (ulex-cmd-loop uconn)
+  (let* ((serv-listener (udat-socket uconn))
+	 (listener      (lambda ()
+			  (let loop ((state 'start))
+			    (let-values (((inp oup)(tcp-accept serv-listener)))
+			      ;; (mutex-lock! *send-mutex*) ;; DOESN'T SEEM TO HELP
+			      (let* ((rdat  (deserialize inp)) ;; '(my-host-port qrykey cmd params)
+				     (resp  (ulex-handler uconn rdat)))
+				(serialize resp oup)
+				(close-input-port inp)
+				(close-output-port oup)
+				;; (mutex-unlock! *send-mutex*) ;; DOESN'T SEEM TO HELP
+				)
+			      (loop state))))))
+    ;; start N of them
+    (let loop ((thnum   0)
+	       (threads '()))
+      (if (< thnum 100)
+	  (let* ((th (make-thread listener (conc "listener" thnum))))
+	    (thread-start! th)
+	    (loop (+ thnum 1)
+		  (cons th threads)))
+	  (map thread-join! threads)))))
+
+;; add a proc to the cmd list, these are done symetrically (i.e. in all instances)
+;; so that the proc can be dereferenced remotely
+;;
+(define (set-work-handler uconn proc)
+  (udat-work-proc-set! uconn proc))
+
+;;======================================================================
+;; work queues - this is all happening on the listener side
+;;======================================================================
+
+;; rdat is (rem-host-port qrykey cmd params)
+					     
+(define (add-to-work-queue uconn rdat)
+  #;(queue-add! (udat-work-queue uconn) rdat)
+  (case (work-method)
+    ((threads)
+     (thread-start! (make-thread (lambda ()
+				   (do-work uconn rdat))
+				 "worker thread")))
+    ((mailbox)
+     (mailbox-send! (udat-work-queue uconn) rdat))
+    ((direct)
+     (do-work uconn rdat))
+    (else
+     (print "ULEX ERROR: work-method "(work-method)" not recognised, using mailbox.")
+     (mailbox-send! (udat-work-queue uconn) rdat))))
+     
+;; move the logic to return the result somewhere else?
+;;
+(define (do-work uconn rdat)
+  (let* ((proc (udat-work-proc uconn))) ;; get it each time - conceivebly it could change
+    ;; put this following into a do-work procedure
+    (match rdat
+      ((rem-host-port qrykey cmd params)
+       (let* ((start-time (current-milliseconds))
+	      (result (proc rem-host-port qrykey cmd params))
+	      (end-time (current-milliseconds))
+	      (run-time (- end-time start-time)))
+	 (case (work-method)
+	   ((direct) result)
+	   (else
+	    (if (> run-time 1000)(print "ULEX: Warning, work "cmd", "params" done in "run-time" ms"))
+	    ;; send 'response as cmd and result as params
+	    (send uconn rem-host-port qrykey 'response result) ;; could check for ack
+	    (let* ((duration (- (current-milliseconds) end-time)))
+	      (if (> duration 500)(print "ULEX: Warning, response sent back to "rem-host-port" for "qrykey" in "duration"ms")))))))
+      (MBOX_TIMEOUT 'do-work-timeout)
+      (else
+       (print "ERROR: rdat "rdat", did not match rem-host-port qrykey cmd params")))))
+
+;; NEW APPROACH:
+;;   
+(define (process-work-queue uconn) 
+  (let ((wqueue (udat-work-queue uconn))
+	(proc   (udat-work-proc  uconn))
+	(numthr (udat-numthreads uconn)))
+    (let loop ((thnum    1)
+	       (threads '()))
+      (let ((thlst (cons (make-thread (lambda ()
+					(let work-loop ()
+					  (let ((rdat (mailbox-receive! wqueue 24000 'MBOX_TIMEOUT)))
+					    (do-work uconn rdat))
+					  (work-loop)))
+				      (conc "work thread " thnum))
+			 threads)))
+	(if (< thnum numthr)
+	    (loop (+ thnum 1)
+		  thlst)
+	    (begin
+	      (print "ULEX: Starting "(length thlst)" worker threads.")
+	      (map thread-start! thlst)
+	      (print "ULEX: Threads started. Joining all.")
+	      (map thread-join! thlst)))))))
+
+;; below was to enable re-use of connections. This seems non-trivial so for
+;; now lets open on each call
+;;
+;; ;; given host-port get or create peer struct
+;; ;;
+;; (define (udat-get-peer uconn host-port)
+;;   (or (hash-table-ref/default (udat-peers uconn) host-port #f)
+;;       ;; no peer, so create pdat and init it
+;;       
+;;       ;; NEED stack of connections, pop and use; inp, oup,
+;;       ;; creation_time (remove and create new if over 24hrs old
+;;       ;; 
+;;       (let ((pdat (make-pdat host-port: host-port)))
+;; 	(hash-table-set! (udat-peers uconn) host-port pdat)
+;; 	pdat)))
+;; 
+;; ;; is pcon alive
+;; 
+;; ;; given host-port and pdat get a pcon
+;; ;;
+;; (define (pdat-get-pcon pdat host-port)
+;;   (let loop ((conns (pdat-conns pdat)))
+;;     (if (null? conns) ;; none? make and return - do NOT add - it will be pushed back on list later
+;; 	(init-pcon (make-pcon))
+;; 	(let* ((conn (pop conns)))
+;; 	  
+;; ;; given host-port get a pcon struct
+;; ;;
+;; (define (udat-get-pcon 
+      
+;;======================================================================
+;; misc utils
+;;======================================================================
+
+(define (make-cookie uconn)
+  (let ((newcnum (+ (udat-cnum uconn) 1)))
+    (udat-cnum-set! uconn newcnum)
+    (conc (udat-host-port uconn) ":"
+	  newcnum)))
+
+;; cookie/mboxes
+
+;; we store each mbox with a cookie (<cookie> . <mbox>)
+;;
+(define (get-cmbox uconn)
+  (if (null? (udat-avail-cmboxes uconn))
+      (let ((cookie (make-cookie uconn))
+	    (mbox   (make-mailbox)))
+	(hash-table-set! (udat-mboxes uconn) cookie mbox)
+	`(,cookie . ,mbox))
+      (let ((cmbox (car (udat-avail-cmboxes uconn))))
+	(udat-avail-cmboxes-set! uconn (cdr (udat-avail-cmboxes uconn)))
+	cmbox)))
+
+(define (put-cmbox uconn cmbox)
+  (udat-avail-cmboxes-set! uconn (cons cmbox (udat-avail-cmboxes uconn))))
+
+(define (pp-uconn uconn)
+  (pp (udat->alist uconn)))
+
+  
 ;;======================================================================
 ;; network utilities
 ;;======================================================================
+
+;; NOTE: Look at address-info egg as alternative to some of this
 
 (define (rate-ip ipaddr)
   (regex-case ipaddr
     ( "^127\\..*" _ 0 )
     ( "^(10\\.0|192\\.168)\\..*" _ 1 )
@@ -354,1899 +545,26 @@
 
 ;; Change this to bias for addresses with a reasonable broadcast value?
 ;;
 (define (ip-pref-less? a b)
   (> (rate-ip a) (rate-ip b)))
-  
 
 (define (get-my-best-address)
-  (let ((all-my-addresses (get-all-ips))
-        ;;(all-my-addresses-old (vector->list (hostinfo-addresses (hostname->hostinfo (get-host-name)))))
-        )
+  (let ((all-my-addresses (get-all-ips)))
     (cond
      ((null? all-my-addresses)
       (get-host-name))                                          ;; no interfaces?
      ((eq? (length all-my-addresses) 1)
       (car all-my-addresses))                      ;; only one to choose from, just go with it
-     
      (else
-      (car (sort all-my-addresses ip-pref-less?)))
-     ;; (else 
-     ;;  (ip->string (car (filter (lambda (x)                      ;; take any but 127.
-     ;;    			 (not (eq? (u8vector-ref x 0) 127)))
-     ;;    		       all-my-addresses))))
-
-     )))
+      (car (sort all-my-addresses ip-pref-less?))))))
 
 (define (get-all-ips-sorted)
   (sort (get-all-ips) ip-pref-less?))
 
 (define (get-all-ips)
-  (map ip->string (vector->list 
-		   (hostinfo-addresses
-		    (host-information (current-hostname))))))
-
-(define (udat-my-host-port udata)
-  (if (and (udat-my-address udata)(udat-my-port udata))
-      (conc (udat-my-address udata) ":" (udat-my-port udata))
-      #f))
-
-(define (udat-captain-host-port udata)
-  (if (and (udat-captain-address udata)(udat-captain-port udata))
-      (conc (udat-captain-address udata) ":" (udat-captain-port udata))
-      #f))
-
-(define (udat-get-peer udata host-port)
-  (hash-table-ref/default (udat-peers udata) host-port #f))
-
-;; struct for keeping track of others we are talking to
-
-(defstruct peer
-  (addr-port       #f)
-  (hostname        #f)
-  (pid             #f)
-  ;; (inp             #f)
-  ;; (oup             #f)
-  (dbs            '()) ;; list of databases this peer is currently handling
-  )
-
-(defstruct work
-  (peer-dat   #f)
-  (handlerkey #f)
-  (qrykey     #f)
-  (data       #f)
-  (start      (current-milliseconds)))
-
-#;(defstruct dbowner
-  (pdat        #f)
-  (last-update (current-seconds)))
-
-;;======================================================================
-;; Captain functions
-;;======================================================================
-
-;; NB// This needs to be started in a thread
-;;
-;; setup to be a captain
-;;   - local server MUST be started already
-;;   - create pkt
-;;   - start server port handler
-;;
-(define (setup-as-captain udata)
-  (if (create-captain-pkt udata)
-      (let* ((my-addr (udat-my-address udata))
-	     (my-port (udat-my-port    udata))
-	     (th (make-thread (lambda ()
-				(ulex-handler-loop udata)) "Captain handler")))
-	(udat-handler-thread-set! udata th)
-	(udat-captain-address-set! udata my-addr)
-	(udat-captain-port-set!    udata my-port)
-	(thread-start! th))
-      (begin
-	(print "ERROR: failed to create captain pkt")
-	#f)))
-
-;; given a pkts dir read 
-;;
-(define (get-all-captain-pkts udata)
-  (let* ((pktsdir       (let ((d (udat-cpkts-dir udata)))
-			  (if (file-exists? d)
-			      d
-			      (begin
-				(create-directory d #t)
-				d))))
-	 (all-pkt-files (glob (conc pktsdir "/*.pkt")))
-	 (pkt-spec      (udat-cpkt-spec udata)))
-    (map (lambda (pkt-file)
-	   (read-pkt->alist pkt-file pktspec: pkt-spec))
-	 all-pkt-files)))
-
-;; sort by D then Z, return one, choose the oldest then
-;; differentiate if needed using the Z key
-;;l
-(define (get-winning-pkt pkts)
-  (if (null? pkts)
-      #f
-      (car (sort pkts (lambda (a b)
-			(let ((ad (string->number (alist-ref 'D a)))
-			      (bd (string->number (alist-ref 'D b))))
-			  (if (eq? a b)
-			      (let ((az (alist-ref 'Z a))
-				    (bz (alist-ref 'Z b)))
-				(string>=? az bz))
-			      (> ad bd))))))))
-
-;; put the host, ip, port and pid into a pkt in
-;; the captain pkts dir
-;;  - assumes user has already fired up a server
-;;    which will be in the udata struct
-;;
-(define (create-captain-pkt udata)
-  (if (not (udat-serv-listener udata))
-      (begin
-	(print "ERROR: create-captain-pkt called with out a listener")
-	#f)
-      (let* ((pktdat `((port   . ,(udat-my-port udata))
-		       (host   . ,(udat-my-hostname udata))
-		       (ipaddr . ,(udat-my-address udata))
-		       (pid    . ,(udat-my-pid     udata))))
-	     (pktdir  (udat-cpkts-dir udata))
-	     (pktspec (udat-cpkt-spec udata))
-	     )
-	(udat-my-cpkt-key-set!
-	 udata
-	 (write-alist->pkt
-	  pktdir
-	  pktdat
-	  pktspec: pktspec
-	  ptype:   'captain))
-	(udat-my-cpkt-key udata))))
-
-;; remove pkt associated with captn (the Z key .pkt)
-;;
-(define (remove-captain-pkt udata captn)
-  (let ((Z       (alist-ref 'Z captn))
-	(cpktdir (udat-cpkts-dir udata)))
-    (delete-file* (conc cpktdir "/" Z ".pkt"))))
-
-;; call all known peers and tell them to delete their info on the captain
-;; thus forcing them to re-read pkts and connect to a new captain
-;; call this when the captain needs to exit and if an older captain is
-;; detected. Due to delays in sending file meta data in NFS multiple
-;; captains can be initiated in a "Storm of Captains", book soon to be
-;; on Amazon
-;;
-(define (drop-captain udata)
-  (let* ((peers (hash-table-keys (udat-peers udata)))
-	 (cookie (make-cookie udata)))
-    (for-each
-     (lambda (host-port)
-       (send udata host-port 'dropcaptain cookie "nomsg" retval: #t))
-     peers)))
-
-;;======================================================================
-;; server primitives
-;;======================================================================
-
-(define (make-cookie udata)
-  (let ((newcnum (+ (udat-cnum udata) 1)))
-    (udat-cnum-set! udata newcnum)
-    (conc (udat-my-address udata) ":"
-	  (udat-my-port    udata) "-"
-	  (udat-my-pid     udata) "-"
-	  newcnum)))
-
-;; create a tcp listener and return a populated udat struct with
-;; my port, address, hostname, pid etc.
-;; return #f if fail to find a port to allocate.
-;;
-;;  if udata-in is #f create the record
-;;  if there is already a serv-listener return the udata
-;;
-(define (start-server-find-port udata-in #!optional (port 4242))
-  (let ((udata (or udata-in (make-udat))))
-    (if (udat-serv-listener udata) ;; TODO - add check that the listener is alive and ready?
-	udata
-	(handle-exceptions
-	    exn
-	  (if (< port 65535)
-	      (start-server-find-port udata (+ port 1))
-	      #f)
-	  (connect-server udata port)))))
-
-(define (connect-server udata port)
-  ;; (tcp-listener-socket LISTENER)(socket-name so)
-  ;; sockaddr-address, sockaddr-port, sockaddr->string
-  (let* ((tlsn (tcp-listen port 1000 #f)) ;; (tcp-listen TCPPORT [BACKLOG [HOST]])
-	 (addr (get-my-best-address))) ;; (hostinfo-addresses (host-information (current-hostname)))
-    (udat-my-address-set!    udata addr)
-    (udat-my-port-set!       udata port)
-    (udat-my-hostname-set!   udata (get-host-name))
-    (udat-serv-listener-set! udata tlsn)
-    udata))
-
-(define (get-peer-dat udata host-port #!optional (hostname #f)(pid #f))
-  (let* ((pdat (or (udat-get-peer udata host-port)
-		   (handle-exceptions ;; ERROR - MAKE THIS EXCEPTION HANDLER MORE SPECIFIC
-		    exn
-		    #f
-		    (let ((npdat (make-peer addr-port: host-port)))
-		      (if hostname (peer-hostname-set! npdat hostname))
-		      (if pid (peer-pid-set! npdat pid))
-		      npdat)))))
-    pdat))
-
-;; send structured data to recipient
-;;
-;;  NOTE: qrykey is what was called the "cookie" previously
-;;
-;;     retval tells send to expect and wait for return data (one line) and return it or time out
-;;       this is for ping where we don't want to necessarily have set up our own server yet.
-;;
-(define (send udata host-port handler qrykey data
-	      #!key (hostname #f)(pid #f)(params '())(retval #f))
-  (let* ((my-host-port (udat-my-host-port udata))
-	 (isme         (equal? host-port my-host-port)) ;; am I calling
-							;; myself?
-	 (dat          (list
-			handler              ;; " "
-			my-host-port         ;; " "
-			(udat-my-pid  udata) ;; " "
-			qrykey
-			params ;;(if (null? params) "" (conc " "
-			       ;;(string-intersperse params " ")))
-			)))
-    ;; (print "send isme is " (if isme "true!" "false!") ",
-    ;; my-host-port: " my-host-port ", host-port: " host-port)
-    (if isme
-	(ulex-handler udata dat data)
-	(handle-exceptions ;; ERROR - MAKE THIS EXCEPTION HANDLER MORE
-			   ;; SPECIFIC
-	    exn
-	    #f 
-	  (let-values (((inp oup)(tcp-connect host-port)))
-	    ;;
-	    ;; CONTROL LINE:
-	    ;;    handlerkey host:port pid qrykey params ...
-	    ;;
-	    (let ((res
-		   (if (and inp oup)
-		       (let* ()
-			 (if my-host-port
-			     (begin
-			       (write dat  oup)
-			       (write data oup) ;; send as sexpr
-			       ;; (print "Sent dat: " dat " data: " data)
-			       (if retval
-				   (read inp)
-				   #t))
-			     (begin
-			       (print "ERROR: send called but no receiver has been setup. Please call setup first!")
-			       #f))
-			 ;; NOTE: DO NOT BE TEMPTED TO LOOK AT ANY DATA ON INP HERE!
-			 ;;       (there is a listener for handling that)
-			 )
-		       #f))) ;; #f means failed to connect and send
-	      (close-input-port inp)
-	      (close-output-port oup)
-	      res))))))
-
-;; send a request to the given host-port and register a mailbox in udata
-;; wait for the mailbox data and return it
-;;
-(define (send-receive udata host-port handler qrykey data #!key (hostname #f)(pid #f)(params '())(timeout 20))
-  (let ((mbox      (make-mailbox))
-	(mbox-time (current-milliseconds))
-	(mboxes    (udat-mboxes udata)))
-    (hash-table-set! mboxes qrykey mbox)
-    (if (send udata host-port handler qrykey data hostname: hostname pid: pid params: params)
-	(let* ((mbox-timeout-secs    timeout)
-	       (mbox-timeout-result 'MBOX_TIMEOUT)
-	       (res                  (mailbox-receive! mbox mbox-timeout-secs mbox-timeout-result))
-	       (mbox-receive-time    (current-milliseconds)))
-	  (hash-table-delete! mboxes qrykey)
-	  (if (eq? res 'MBOX_TIMEOUT)
-	      #f
-	      res))
-	#f))) ;; #f means failed to communicate
-
-;; 
-(define (ulex-handler udata controldat data)
-  (print "controldat: " controldat " data: " data)
-  (match controldat ;;  (string-split controldat)
-    ((handlerkey host-port pid qrykey params ...)
-     ;; (print "handlerkey: " handlerkey " host-port: " host-port " pid: " pid " qrykey: " qrykey " params: " params)
-     (case handlerkey ;; (string->symbol handlerkey)
-       ((ack)(print "Got ack!"))
-       ((ping) ;; special case - return result immediately on the same connection
-	(let* ((proc  (hash-table-ref/default (udat-handlers udata) 'ping #f))
-	       (val   (if proc (proc) "gotping"))
-	       (peer  (make-peer addr-port: host-port pid: pid))
-	       (dbshash (udat-dbowners udata)))
-	  (peer-dbs-set! peer params) ;; params for ping is list of dbs owned by pinger
-	  (for-each (lambda (dbfile)
-		      (hash-table-set! dbshash dbfile host-port)) ;; WRONG?
-		    params) ;; register each db in the dbshash
-	  (if (not (hash-table-exists? (udat-peers udata) host-port))
-	      (hash-table-set! (udat-peers udata) host-port peer)) ;; save the details of this caller in peers
-	  qrykey)) ;; End of ping
-       ((goodbye)
-	;; remove all traces of the caller in db ownership etc.
-	(let* ((peer  (hash-table-ref/default (udat-peers udata) host-port #f))
-	       (dbs   (if peer (peer-dbs peer) '()))
-	       (dbshash (udat-dbowners udata)))
-	  (for-each (lambda (dbfile)(hash-table-delete! dbshash dbfile)) dbs)
-	  (hash-table-delete! (udat-peers udata) host-port)
-	  qrykey))
-       ((dropcaptain)
-	;; remove all traces of the captain
-	(udat-captain-address-set! udata #f)
-	(udat-captain-host-set!    udata #f)
-	(udat-captain-port-set!    udata #f)
-	(udat-captain-pid-set!     udata #f)
-	qrykey)
-       ((rucaptain) ;; remote is asking if I'm the captain
-	(if (udat-my-cpkt-key udata) "yes" "no"))
-       ((db-owner) ;; given a db name who do I send my queries to
-	;; look up the file in handlers, if have an entry ping them to be sure
-	;; they are still alive and then return that host:port.
-	;; if no handler found or if the ping fails pick from peers the oldest that
-	;; is managing the fewest dbs
-	(match params
-	  ((dbfile dbtype)
-	   (let* ((owner-host-port (hash-table-ref/default (udat-dbowners udata) dbfile #f)))
-	     (if owner-host-port
-		 (conc qrykey " " owner-host-port)
-		 (let* ((pdat (or (hash-table-ref/default (udat-peers udata) host-port #f) ;; no owner - caller gets to own it!
-				  (make-peer addr-port: host-port pid: pid dbs: `(,dbfile)))))
-		   (hash-table-set! (udat-peers udata) host-port pdat)
-		   (hash-table-set! (udat-dbowners udata) dbfile host-port)
-		   (conc qrykey " " host-port)))))
-	  (else (conc qrykey " BADDATA"))))
-       ;; for work items:
-       ;;    handler is one of; immediate, read-only, read-write, high-priority
-       ((immediate read-only normal low-priority) ;; do this work immediately
-	;; host-port (caller), pid (caller), qrykey (cookie), params <= all from first line
-	;; data => a single line encoded however you want, or should I build json into it?
-	(print "handlerkey=" handlerkey)
-	(let* ((pdat (get-peer-dat udata host-port)))
-	  (match params ;; dbfile prockey procparam
-	    ((dbfile prockey procparam)
-	     (case handlerkey
-	       ((immediate read-only)
-		(process-request udata pdat dbfile qrykey prockey procparam data))
-	       ((normal low-priority) ;; split off later and add logic to support low priority
-		(add-to-work-queue udata pdat dbfile qrykey prockey procparam data))
-	       (else
-		#f)))
-	    (else
-	     (print "INFO: params=" params " handlerkey=" handlerkey " controldat=" controldat)
-	     #f))))
-       (else
-	;; (add-to-work-queue udata (get-peer-dat udata host-port) handlerkey qrykey data)
-	#f)))
-    (else
-     (print "BAD DATA? controldat=" controldat " data=" data)
-     #f)));; handles the incoming messages and dispatches to queues
-
-;;
-(define (ulex-handler-loop udata)
-  (let* ((serv-listener (udat-serv-listener udata)))
-    ;; data comes as two lines
-    ;;   handlerkey resp-addr:resp-port hostname pid qrykey [dbpath/dbfile.db]
-    ;;   data
-    (let loop ((state 'start))
-      (let-values (((inp oup)(tcp-accept serv-listener)))
-	(let* ((controldat (read inp))
-	       (data       (read inp))
-	       (resp       (ulex-handler udata controldat data)))
-	  (if resp (write resp oup))
-	  (close-input-port inp)
-	  (close-output-port oup))
-	(loop state)))))
-
-;; add a proc to the handler list, these are done symetrically (i.e. in all instances)
-;; so that the proc can be dereferenced remotely
-;;
-(define (register-handler udata key proc)
-  (hash-table-set! (udat-handlers udata) key proc))
-
-
-;;======================================================================
-;; work queues
-;;======================================================================
-
-(define (add-to-work-queue udata peer-dat handlerkey qrykey data)
-  (let ((wdat (make-work peer-dat: peer-dat handlerkey: handlerkey qrykey: qrykey data: data)))
-    (if (udat-busy udata)
-	(queue-add! (udat-work-queue udata) wdat)
-	(process-work udata wdat)) ;; passing in wdat tells process-work to first process the passed in wdat
-    ))
-
-(define (do-work udata wdat)
-  #f)
-
-(define (process-work udata #!optional wdat)
-  (if wdat (do-work udata wdat)) ;; process wdat
-  (let ((wqueue (udat-work-queue udata)))
-    (if (not (queue-empty? wqueue))
-	(let loop ((wd (queue-remove! wqueue)))
-	  (do-work udata wd)
-	  (if (not (queue-empty? wqueue))
-	      (loop (queue-remove! wqueue)))))))
-
-;;======================================================================
-;; Generic db handling
-;;   setup a inmem db instance
-;;   open connection to on-disk db
-;;   sync on-disk db to inmem
-;;   get lock in on-disk db for dbowner of this db
-;;   put sync-proc, init-proc, on-disk handle, inmem handle in dbconn stuct
-;;   return the stuct
-;;======================================================================
-
-(defstruct dbconn
-  (fname  #f)
-  (inmem  #f)
-  (conn   #f)
-  (sync   #f) ;; sync proc
-  (init   #f) ;; init proc
-  (lastsync (current-seconds))
-  )
-
-(defstruct dbinfo
-  (initproc #f)
-  (syncproc #f))
-
-;; open inmem and disk database
-;;   init with initproc
-;;   return db struct
-;;
-;;   appname; megatest, ulex or something else.
-;;
-(define (setup-db-connection udata fname-in appname dbtype)
-  (let* ((is-ulex (eq? appname 'ulex))
-	 (dbinf   (if is-ulex ;; ulex is a built-in special case
-		      (make-dbinfo initproc: ulexdb-init syncproc: ulexdb-sync)
-		      (hash-table-ref/default (udat-dbtypes udata) dbtype #f)))
-	 (initproc (dbinfo-initproc dbinf))
-	 (syncproc (dbinfo-syncproc dbinf))
-	 (fname   (if is-ulex
-		      (conc (udat-ulex-dir udata) "/ulex.db")
-		      fname-in))
-	 (inmem-db (open-and-initdb udata #f 'inmem (dbinfo-initproc dbinf)))
-	 (disk-db  (open-and-initdb udata fname 'disk (dbinfo-initproc dbinf))))
-    (make-dbconn inmem: inmem-db conn: disk-db sync: syncproc init: initproc)))
-
-;; dest='inmem or 'disk
-;;
-(define (open-and-initdb udata filename dest init-proc)
-  (let* ((inmem    (eq? dest 'inmem))
-	 (dbfile   (if inmem
-		       ":INMEM:"
-		       filename))
-	 (dbexists (if inmem #t (file-exists? dbfile)))
-	 (db       (sqlite3:open-database dbfile)))
-    (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000))
-    (if (not dbexists)
-	(init-proc db))
-    db))
-
-
-;;======================================================================
-;; Previous Ulex db stuff
-;;======================================================================
-
-(define (ulexdb-init db inmem)
-  (sqlite3:with-transaction
-   db
-   (lambda ()
-     (for-each
-      (lambda (stmt)
-	(if stmt (sqlite3:execute db stmt)))
-      `("CREATE TABLE IF NOT EXISTS processes 
-                 (id INTEGER PRIMARY KEY,
-                  host  TEXT NOT NULL,
-                  ipadr TEXT NOT NULL,
-                  port  INTEGER NOT NULL,
-                  pid   INTEGER NOT NULL,
-                  regtime INTEGER DEFAULT (strftime('%s','now')),
-                  last_update INTEGER DEFAULT (strftime('%s','now')));"
-	(if inmem
-	    "CREATE TRIGGER  IF NOT EXISTS update_proces_trigger AFTER UPDATE ON processes
-                             FOR EACH ROW
-                               BEGIN 
-                                 UPDATE processes SET last_update=(strftime('%s','now'))
-                                   WHERE id=old.id;
-                               END;"
-	    #f))))))
-
-;; open databases, do initial sync
-(define (ulexdb-sync dbconndat udata)
-  #f)
-
-
-) ;; END OF ULEX
-
-
-;;; ;;======================================================================
-;;; ;; D E B U G   H E L P E R S
-;;; ;;======================================================================
-;;;     
-;;; (define (dbg> . args)
-;;;   (with-output-to-port (current-error-port)
-;;;     (lambda ()
-;;;       (apply print "dbg> " args))))
-;;; 
-;;; (define (debug-pp . args)
-;;;   (if (get-environment-variable "ULEX_DEBUG")
-;;;       (with-output-to-port (current-error-port)
-;;; 	(lambda ()
-;;; 	  (apply pp args)))))
-;;; 
-;;; (define *default-debug-port* (current-error-port))
-;;; 
-;;; (define (sdbg> fn stage-name stage-start stage-end start-time . message)
-;;;   (if (get-environment-variable "ULEX_DEBUG")
-;;;       (with-output-to-port *default-debug-port* 
-;;; 	(lambda ()
-;;; 	  (apply print "ulex:" fn " " stage-name " took " (- (if stage-end stage-end (current-milliseconds)) stage-start) " ms. "
-;;; 		 (if start-time
-;;; 		     (conc "total time " (- (current-milliseconds) start-time)
-;;; 			   " ms.")
-;;; 		     "")
-;;; 		 message
-;;; 		 )))))
-
-;;======================================================================
-;; M A C R O S
-;;======================================================================
-;; iup callbacks are not dumping the stack, this is a work-around
-;;
-
-;; Some of these routines use:
-;;
-;;     http://www.cs.toronto.edu/~gfb/scheme/simple-macros.html
-;;
-;; Syntax for defining macros in a simple style similar to function definiton,
-;;  when there is a single pattern for the argument list and there are no keywords.
-;;
-;; (define-simple-syntax (name arg ...) body ...)
-;;
-;; 
-;; (define-syntax define-simple-syntax
-;;   (syntax-rules ()
-;;     ((_ (name arg ...) body ...)
-;;      (define-syntax name (syntax-rules () ((name arg ...) (begin body ...)))))))
-;; 
-;; (define-simple-syntax (catch-and-dump proc procname)
-;;   (handle-exceptions
-;;    exn
-;;    (begin
-;;      (print-call-chain (current-error-port))
-;;      (with-output-to-port (current-error-port)
-;;        (lambda ()
-;;          (print ((condition-property-accessor 'exn 'message) exn))
-;;          (print "Callback error in " procname)
-;;          (print "Full condition info:\n" (condition->list exn)))))
-;;    (proc)))
-;; 
-;; 
-;;======================================================================
-;;  R E C O R D S
-;;======================================================================
-
-;;; ;; information about me as a server
-;;; ;;
-;;; (defstruct area
-;;;   ;; about this area
-;;;   (useportlogger #f)
-;;;   (lowport       32768)
-;;;   (server-type   'auto)  ;; auto=create up to five servers/pkts, main=create pkts, passive=no pkt (unless there are no pkts at all)
-;;;   (conn          #f)
-;;;   (port          #f)
-;;;   (myaddr        (get-my-best-address))
-;;;   pktid          ;; get pkt from hosts table if needed
-;;;   pktfile
-;;;   pktsdir
-;;;   dbdir
-;;;   (dbhandles     (make-hash-table)) ;; fname => list-of-dbh, NOTE: Should really never need more than one?
-;;;   (mutex         (make-mutex))
-;;;   (rtable        (make-hash-table)) ;; registration table of available actions
-;;;   (dbs           (make-hash-table)) ;; filename => random number, used for choosing what dbs I serve
-;;;   ;; about other servers
-;;;   (hosts         (make-hash-table)) ;; key => hostdat
-;;;   (hoststats     (make-hash-table)) ;; key => alist of fname => ( qcount . qtime )
-;;;   (reqs          (make-hash-table)) ;; uri => queue
-;;;   ;; work queues
-;;;   (wqueues       (make-hash-table)) ;; fname => qdat
-;;;   (stats         (make-hash-table)) ;; fname => totalqueries
-;;;   (last-srvup    (current-seconds)) ;; last time we updated the known servers
-;;;   (cookie2mbox   (make-hash-table)) ;; map cookie for outstanding request to mailbox of awaiting call
-;;;   (ready #f)
-;;;   (health        (make-hash-table)) ;; ipaddr:port => num failed pings since last good ping
-;;;   )
-;;; 
-;;; ;; host stats
-;;; ;;
-;;; (defstruct hostdat
-;;;   (pkt      #f)
-;;;   (dbload   (make-hash-table))  ;; "dbfile.db" => queries/min
-;;;   (hostload #f)                 ;; normalized load ( 5min load / numcpus )
-;;;   )
-;;; 
-;;; ;; dbdat
-;;; ;;
-;;; (defstruct dbdat
-;;;   (dbh    #f)
-;;;   (fname  #f)
-;;;   (write-access #f)
-;;;   (sths   (make-hash-table))  ;; hash mapping query strings to handles
-;;;   )
-;;; 
-;;; ;; qdat
-;;; ;;
-;;; (defstruct qdat
-;;;   (writeq  (make-queue))
-;;;   (readq   (make-queue))
-;;;   (rwq     (make-queue))
-;;;   (logq    (make-queue)) ;; do we need a queue for logging? yes, if we use sqlite3 db for logging
-;;;   (osshort (make-queue))
-;;;   (oslong  (make-queue))
-;;;   (misc    (make-queue)) ;; used for things like ping-full
-;;;   )
-;;; 
-;;; ;; calldat
-;;; ;;
-;;; (defstruct calldat
-;;;   (ctype 'dbwrite)
-;;;   (obj   #f)              ;; this would normally be an SQL statement e.g. SELECT, INSERT etc.
-;;;   (rtime (current-milliseconds)))
-;;; 
-;;; ;; make it a global? Well, it is local to area module
-;;; 
-;;; (define *pktspec*
-;;;   `((server (hostname . h)
-;;; 	    (port     . p)
-;;; 	    (pid      . i)
-;;; 	    (ipaddr   . a)
-;;; 	    )
-;;;     (data   (hostname . h)  ;; sender hostname
-;;; 	    (port     . p)  ;; sender port
-;;; 	    (ipaddr   . a)  ;; sender ip
-;;; 	    (hostkey  . k)  ;; sending host key - store info at server under this key
-;;; 	    (servkey  . s)  ;; server key - this needs to match at server end or reject the msg
-;;; 	    (format   . f)  ;; sb=serialized-base64, t=text, sx=sexpr, j=json
-;;; 	    (data     . d)  ;; base64 encoded slln data
-;;; 	    )))
-;;; 
-;;; ;; work item
-;;; ;;
-;;; (defstruct witem
-;;;   (rhost #f)   ;; return host
-;;;   (ripaddr #f) ;; return ipaddr
-;;;   (rport #f)   ;; return port
-;;;   (servkey #f) ;; the packet representing the client of this workitem, used by final send-message
-;;;   (rdat  #f)   ;; the request - usually an sql query, type is rdat
-;;;   (action #f)  ;; the action: immediate, dbwrite, dbread,oslong, osshort
-;;;   (cookie #f)  ;; cookie id for response
-;;;   (data   #f)  ;; the data payload, i.e. parameters
-;;;   (result #f)  ;; the result from processing the data
-;;;   (caller #f)) ;; the calling peer according to rpc itself
-;;; 
-;;; (define (trim-pktid pktid)
-;;;   (if (string? pktid)
-;;;       (substring pktid 0 4)
-;;;       "nopkt"))
-;;; 
-;;; (define (any->number num)
-;;;   (cond
-;;;    ((number? num) num)
-;;;    ((string? num) (string->number num))
-;;;    (else num)))
-;;; 
-;;; (use trace)
-;;; (trace-call-sites #t)
-;;; 
-;;; ;;======================================================================
-;;; ;; D A T A B A S E   H A N D L I N G 
-;;; ;;======================================================================
-;;; 
-;;; ;; look in dbhandles for a db, return it, else return #f
-;;; ;;
-;;; (define (get-dbh acfg fname)
-;;;   (let ((dbh-lst (hash-table-ref/default (area-dbhandles acfg) fname '())))
-;;;     (if (null? dbh-lst)
-;;; 	(begin
-;;; 	  ;; (print "opening db for " fname)
-;;; 	  (open-db acfg fname)) ;; Note that the handles get put back in the queue in the save-dbh calls
-;;; 	(let ((rem-lst (cdr dbh-lst)))
-;;; 	  ;; (print "re-using saved connection for " fname)
-;;; 	  (hash-table-set! (area-dbhandles acfg) fname rem-lst)
-;;; 	  (car dbh-lst)))))
-;;; 
-;;; (define (save-dbh acfg fname dbdat)
-;;;     ;; (print "saving dbh for " fname)
-;;;     (hash-table-set! (area-dbhandles acfg) fname (cons dbdat (hash-table-ref/default (area-dbhandles acfg) fname '()))))
-;;; 
-;;; ;; open the database, if never before opened init it. put the handle in the
-;;; ;; open db's hash table
-;;; ;; returns: the dbdat
-;;; ;;
-;;; (define (open-db acfg fname)
-;;;   (let* ((fullname     (conc (area-dbdir acfg) "/" fname))
-;;; 	 (exists       (file-exists? fullname))
-;;; 	 (write-access (if exists
-;;; 			   (file-write-access? fullname)
-;;; 			   (file-write-access? (area-dbdir acfg))))
-;;; 	 (db           (sqlite3:open-database fullname))
-;;; 	 (handler      (sqlite3:make-busy-timeout 136000))
-;;; 	 )
-;;;     (sqlite3:set-busy-handler! db handler)
-;;;     (sqlite3:execute db "PRAGMA synchronous = 0;")
-;;;     (if (not exists) ;; need to init the db
-;;; 	(if write-access
-;;; 	    (let ((isql (get-rsql acfg 'dbinitsql))) ;; get the init sql statements
-;;; 	      ;; (sqlite3:with-transaction
-;;; 	      ;;  db
-;;; 	      ;;  (lambda ()
-;;; 		 (if isql
-;;; 		     (for-each
-;;; 		      (lambda (sql)
-;;; 			(sqlite3:execute db sql))
-;;; 		      isql)))
-;;; 	    (print "ERROR: no write access to " (area-dbdir acfg))))
-;;;     (make-dbdat dbh: db fname: fname write-access: write-access)))
-;;; 
-;;; ;; This is a low-level command to retrieve or to prepare, save and return a prepared statment
-;;; ;; you must extract the db handle
-;;; ;;
-;;; (define (get-sth db cache stmt)
-;;;   (if (hash-table-exists? cache stmt)
-;;;       (begin
-;;; 	;; (print "Reusing cached stmt for " stmt)
-;;; 	(hash-table-ref/default cache stmt #f))
-;;;       (let ((sth (sqlite3:prepare db stmt)))
-;;; 	(hash-table-set! cache stmt sth)
-;;; 	;; (print "prepared stmt for " stmt)
-;;; 	sth)))
-;;; 
-;;; ;; a little more expensive but does all the tedious deferencing - only use if you don't already
-;;; ;; have dbdat and db sitting around
-;;; ;;
-;;; (define (full-get-sth acfg fname stmt)
-;;;   (let* ((dbdat  (get-dbh acfg fname))
-;;; 	 (db     (dbdat-dbh dbdat))
-;;; 	 (sths   (dbdat-sths dbdat)))
-;;;     (get-sth db sths stmt)))
-;;; 
-;;; ;; write to a db
-;;; ;; acfg: area data
-;;; ;; rdat: request data
-;;; ;; hdat: (host . port)
-;;; ;;
-;;; ;; (define (dbwrite acfg rdat hdat data-in)
-;;; ;;   (let* ((dbname (car data-in))
-;;; ;; 	 (dbdat  (get-dbh acfg dbname))
-;;; ;; 	 (db     (dbdat-dbh dbdat))
-;;; ;; 	 (sths   (dbdat-sths dbdat))
-;;; ;; 	 (stmt   (calldat-obj rdat))
-;;; ;; 	 (sth    (get-sth db sths stmt))
-;;; ;; 	 (data   (cdr data-in)))
-;;; ;;     (print "dbname: " dbname " acfg: " acfg " rdat: " (calldat->alist rdat) " hdat: " hdat " data: " data)
-;;; ;;     (print "dbdat: " (dbdat->alist dbdat))
-;;; ;;     (apply sqlite3:execute sth data)
-;;; ;;     (save-dbh acfg dbname dbdat)
-;;; ;;     #t
-;;; ;;     ))
-;;; 
-;;; (define (finalize-all-db-handles acfg)
-;;;   (let* ((dbhandles (area-dbhandles acfg))  ;; dbhandles is hash of fname ==> dbdat
-;;; 	 (num       0))
-;;;     (for-each
-;;;      (lambda (area-name)
-;;;        (print "Closing handles for " area-name)
-;;;        (let ((dbdats (hash-table-ref/default dbhandles area-name '())))
-;;; 	 (for-each
-;;; 	  (lambda (dbdat)
-;;; 	    ;; first close all statement handles
-;;; 	    (for-each
-;;; 	     (lambda (sth)
-;;; 	       (sqlite3:finalize! sth)
-;;; 	       (set! num (+ num 1)))
-;;; 	     (hash-table-values (dbdat-sths dbdat)))
-;;; 	    ;; now close the dbh
-;;; 	    (set! num (+ num 1))
-;;; 	    (sqlite3:finalize! (dbdat-dbh dbdat)))
-;;; 	  dbdats)))
-;;;      (hash-table-keys dbhandles))
-;;;     (print "FINALIZED " num " dbhandles")))
-;;; 
-;;; ;;======================================================================
-;;; ;; W O R K   Q U E U E   H A N D L I N G 
-;;; ;;======================================================================
-;;; 
-;;; (define (register-db-as-mine acfg dbname)
-;;;   (let ((ht (area-dbs acfg)))
-;;;     (if (not (hash-table-ref/default ht dbname #f))
-;;; 	(hash-table-set! ht dbname (random 10000)))))
-;;; 	
-;;; (define (work-queue-add acfg fname witem)
-;;;   (let* ((work-queue-start (current-milliseconds))
-;;; 	 (action           (witem-action witem)) ;; NB the action is the index into the rdat actions
-;;; 	 (qdat             (or (hash-table-ref/default (area-wqueues acfg) fname #f)
-;;; 			       (let ((newqdat (make-qdat)))
-;;; 				 (hash-table-set! (area-wqueues acfg) fname newqdat)
-;;; 				 newqdat)))
-;;; 	 (rdat             (hash-table-ref/default (area-rtable acfg) action #f)))
-;;;     (if rdat
-;;; 	(queue-add!
-;;; 	 (case (calldat-ctype rdat)
-;;; 	   ((dbwrite)   (register-db-as-mine acfg fname)(qdat-writeq qdat))
-;;; 	   ((dbread)    (register-db-as-mine acfg fname)(qdat-readq  qdat))
-;;; 	   ((dbrw)      (register-db-as-mine acfg fname)(qdat-rwq    qdat))
-;;; 	   ((oslong)    (qdat-oslong qdat))
-;;; 	   ((osshort)   (qdat-osshort qdat))
-;;; 	   ((full-ping) (qdat-misc  qdat))
-;;; 	   (else
-;;; 	    (print "ERROR: no queue for " action ". Adding to dbwrite queue.")
-;;; 	    (qdat-writeq qdat)))
-;;; 	 witem)
-;;; 	(case action
-;;; 	  ((full-ping)(qdat-misc qdat))
-;;; 	  (else
-;;; 	   (print "ERROR: No action " action " was registered"))))
-;;;     (sdbg> "work-queue-add" "queue-add" work-queue-start #f #f)
-;;;     #t)) ;; for now, simply return #t to indicate request got to the queue
-;;; 
-;;; (define (doqueue acfg q fname dbdat dbh)
-;;;   ;; (print "doqueue: " fname)
-;;;   (let* ((start-time (current-milliseconds))
-;;; 	 (qlen       (queue-length q)))
-;;;     (if (> qlen 1)
-;;; 	(print "Processing queue of length " qlen))
-;;;     (let loop ((count      0)
-;;; 	       (responses '()))
-;;;       (let ((delta (- (current-milliseconds) start-time)))
-;;; 	(if (or (queue-empty? q)
-;;; 		(> delta 400)) ;; stop working on this queue after 400ms have passed
-;;; 	    (list count delta responses) ;; return count, delta and responses list
-;;; 	    (let* ((witem  (queue-remove! q))
-;;; 		   (action (witem-action witem))
-;;; 		   (rdat   (witem-rdat   witem))
-;;; 		   (stmt   (calldat-obj rdat))
-;;; 		   (sth    (full-get-sth acfg fname stmt))
-;;; 		   (ctype  (calldat-ctype rdat))
-;;; 		   (data   (witem-data   witem))
-;;; 		   (cookie (witem-cookie witem)))
-;;; 	      ;; do the processing and save the result in witem-result
-;;; 	      (witem-result-set!
-;;; 	       witem
-;;; 	       (case ctype ;; action
-;;; 		 ((noblockwrite) ;; blind write, no ack of success returned
-;;; 		  (apply sqlite3:execute sth data)
-;;; 		  (sqlite3:last-insert-rowid dbh))
-;;; 		 ((dbwrite)      ;; blocking write   
-;;; 		  (apply sqlite3:execute sth data)
-;;; 		  #t)
-;;; 		 ((dbread) ;; TODO: consider breaking this up and shipping in pieces for large query
-;;; 		  (apply sqlite3:map-row (lambda x x) sth data))
-;;; 		 ((full-ping)  'full-ping)
-;;; 		 (else (print "Not ready for action " action) #f)))
-;;; 	      (loop (add1 count)
-;;; 		    (if cookie
-;;; 			(cons witem responses)
-;;; 			responses))))))))
-;;; 
-;;; ;; do up to 400ms of processing on each queue
-;;; ;; - the work-queue-processor will allow the max 1200ms of work to complete but it will flag as overloaded
-;;; ;; 
-;;; (define (process-db-queries acfg fname)
-;;;   (if (hash-table-exists? (area-wqueues acfg) fname)
-;;;       (let* ((process-db-queries-start-time (current-milliseconds))
-;;; 	     (qdat             (hash-table-ref/default (area-wqueues acfg) fname #f))
-;;; 	     (queue-sym->queue (lambda (queue-sym)
-;;; 				 (case queue-sym  ;; lookup the queue from qdat given a name (symbol)
-;;; 				   ((wqueue)  (qdat-writeq qdat))
-;;; 				   ((rqueue)  (qdat-readq  qdat))
-;;; 				   ((rwqueue) (qdat-rwq    qdat))
-;;; 				   ((misc)    (qdat-misc   qdat))
-;;; 				   (else #f))))
-;;; 	     (dbdat   (get-dbh acfg fname))
-;;; 	     (dbh     (if (dbdat? dbdat)(dbdat-dbh dbdat) #f))
-;;; 	     (nowtime (current-seconds)))
-;;; 	;; handle the queues that require a transaction
-;;; 	;;
-;;; 	(map ;; 
-;;; 	 (lambda (queue-sym)
-;;; 	   ;; (print "processing queue " queue-sym)
-;;; 	   (let* ((queue (queue-sym->queue queue-sym)))
-;;; 	     (if (not (queue-empty? queue))
-;;; 		 (let ((responses
-;;; 			(sqlite3:with-transaction ;; todo - catch exceptions...
-;;; 			 dbh
-;;; 			 (lambda ()
-;;; 			   (let* ((res (doqueue acfg queue fname dbdat dbh))) ;; this does the work!
-;;; 			     ;; (print "res=" res)
-;;; 			     (match res
-;;; 			      ((count delta responses)
-;;; 			       (update-stats acfg fname queue-sym delta count)
-;;; 			       (sdbg> "process-db-queries" "sqlite3-transaction" process-db-queries-start-time #f #f)
-;;; 			       responses) ;; return responses
-;;; 			      (else
-;;; 			       (print "ERROR: bad return data from doqueue " res)))
-;;; 			     )))))
-;;; 		   ;; having completed the transaction, send the responses.
-;;; 		   ;; (print "INFO: sending " (length responses) " responses.")
-;;; 		   (let loop ((responses-left responses))
-;;; 		     (cond
-;;; 		      ((null? responses-left)  #t)
-;;; 		      (else
-;;; 		       (let* ((witem    (car responses-left))
-;;; 			      (response (cdr responses-left)))  
-;;; 			 (call-deliver-response acfg (witem-ripaddr witem)(witem-rport witem)
-;;; 						(witem-cookie witem)(witem-result witem)))
-;;; 		       (loop (cdr responses-left))))))
-;;; 		 )))
-;;; 	 '(wqueue rwqueue rqueue))
-;;; 	
-;;; 	;; handle misc queue
-;;; 	;;
-;;; 	;; (print "processing misc queue")
-;;; 	(let ((queue (queue-sym->queue 'misc)))
-;;; 	  (doqueue acfg queue fname dbdat dbh))
-;;; 	;; ....
-;;; 	(save-dbh acfg fname dbdat)
-;;; 	#t ;; just to let the tests know we got here
-;;; 	)
-;;;       #f ;; nothing processed
-;;;       ))
-;;; 
-;;; ;; run all queues in parallel per db but sequentially per queue for that db.
-;;; ;;  - process the queues every 500 or so ms
-;;; ;;  - allow for long running queries to continue but all other activities for that
-;;; ;;    db will be blocked.
-;;; ;;
-;;; (define (work-queue-processor acfg)
-;;;   (let* ((threads (make-hash-table))) ;; fname => thread
-;;;     (let loop ((fnames      (hash-table-keys (area-wqueues acfg)))
-;;; 	       (target-time (+ (current-milliseconds) 50)))
-;;;       ;;(if (not (null? fnames))(print "Processing for these databases: " fnames))
-;;;       (for-each
-;;;        (lambda (fname)
-;;; 	 ;; (print "processing for " fname)
-;;; 	 ;;(process-db-queries acfg fname))
-;;; 	 (let ((th (hash-table-ref/default threads fname #f)))
-;;; 	   (if (and th (not (member (thread-state th) '(dead terminated))))
-;;; 	       (begin
-;;; 		 (print "WARNING: worker thread for " fname " is taking a long time.")
-;;; 		 (print "Thread is in state " (thread-state th)))
-;;; 	       (let ((th1 (make-thread (lambda ()
-;;; 					 ;; (catch-and-dump
-;;; 					 ;;  (lambda ()
-;;; 					    ;; (print "Process queries for " fname)
-;;; 					    (let ((start-time (current-milliseconds)))
-;;; 					      (process-db-queries acfg fname)
-;;; 					      ;; (thread-sleep! 0.01) ;; need the thread to take at least some time
-;;; 					      (hash-table-delete! threads fname)) ;; no mutexes?
-;;; 					    fname)
-;;; 					  "th1"))) ;; ))
-;;; 		 (hash-table-set! threads fname th1)
-;;; 		 (thread-start! th1)))))
-;;;        fnames)
-;;;       ;; (thread-sleep! 0.1) ;; give the threads some time to process requests
-;;;       ;; burn time until 400ms is up
-;;;       (let ((now-time (current-milliseconds)))
-;;; 	(if (< now-time target-time)
-;;; 	    (let ((delta (- target-time now-time)))
-;;; 	      (thread-sleep! (/ delta 1000)))))
-;;;       (loop (hash-table-keys (area-wqueues acfg))
-;;; 	    (+ (current-milliseconds) 50)))))
-;;; 
-;;; ;;======================================================================
-;;; ;; S T A T S   G A T H E R I N G
-;;; ;;======================================================================
-;;; 
-;;; (defstruct stat
-;;;   (qcount-avg  0)                  ;; coarse running average
-;;;   (qtime-avg   0)                  ;; coarse running average
-;;;   (qcount      0)                  ;; total
-;;;   (qtime       0)                  ;; total
-;;;   (last-qcount 0)                  ;; last 
-;;;   (last-qtime  0)                  ;; last
-;;;   (dbs        '())                 ;; list of db files handled by this node
-;;;   (when        0))                 ;; when the last query happened - seconds
-;;; 
-;;; 
-;;; (define (update-stats acfg fname bucket duration numqueries)
-;;;   (let* ((key   fname) ;; for now do not use bucket. Was: (conc fname "-" bucket)) ;; lazy but good enough
-;;; 	 (stats (or (hash-table-ref/default (area-stats acfg) key #f)
-;;; 		    (let ((newstats (make-stat)))
-;;; 		      (hash-table-set! (area-stats acfg) key newstats)
-;;; 		      newstats))))
-;;;     ;; when the last query happended (used to remove the fname from the active list)
-;;;     (stat-when-set! stats (current-seconds))
-;;;     ;; last values
-;;;     (stat-last-qcount-set! stats numqueries)
-;;;     (stat-last-qtime-set!  stats duration)
-;;;     ;; total over process lifetime
-;;;     (stat-qcount-set! stats (+ (stat-qcount stats) numqueries))
-;;;     (stat-qtime-set!  stats (+ (stat-qtime  stats) duration))
-;;;     ;; coarse average
-;;;     (stat-qcount-avg-set! stats (/ (+ (stat-qcount-avg stats) numqueries) 2))
-;;;     (stat-qtime-avg-set!  stats (/ (+ (stat-qtime-avg  stats) duration)   2))
-;;; 
-;;;     ;; here is where we add the stats for a given dbfile
-;;;     (if (not (member fname (stat-dbs stats)))
-;;; 	(stat-dbs-set! stats (cons fname (stat-dbs stats))))
-;;; 
-;;;     ))
-;;; 
-;;; ;;======================================================================
-;;; ;; S E R V E R   S T U F F 
-;;; ;;======================================================================
-;;; 
-;;; ;; this does NOT return!
-;;; ;;
-;;; (define (find-free-port-and-open acfg)
-;;;   (let ((port (or (area-port acfg) 3200)))
-;;;     (handle-exceptions
-;;; 	exn
-;;; 	(begin
-;;; 	  (print "INFO: cannot bind to port " (rpc:default-server-port) ", trying next port")
-;;; 	  (area-port-set! acfg (+ port 1))
-;;; 	  (find-free-port-and-open acfg))
-;;;       (rpc:default-server-port port)
-;;;       (area-port-set! acfg port)
-;;;       (tcp-read-timeout 120000)
-;;;       ;; ((rpc:make-server (tcp-listen port)) #t)
-;;;       (tcp-listen (rpc:default-server-port)
-;;;       ))))
-;;; 
-;;; ;; register this node by putting a packet into the pkts dir.
-;;; ;; look for other servers
-;;; ;; contact other servers and compile list of servers
-;;; ;; there are two types of server
-;;; ;;     main servers - dashboards, runners and dedicated servers - need pkt
-;;; ;;     passive servers - test executers, step calls, list-runs - no pkt
-;;; ;;
-;;; (define (register-node acfg hostip port-num)
-;;;   ;;(mutex-lock! (area-mutex acfg))
-;;;   (let* ((server-type  (area-server-type acfg)) ;; auto, main, passive (no pkt created)
-;;; 	 (best-ip      (or hostip (get-my-best-address)))
-;;; 	 (mtdir        (area-dbdir acfg))
-;;; 	 (pktdir       (area-pktsdir acfg))) ;; conc mtdir "/.server-pkts")))
-;;;     (print "Registering node " best-ip ":" port-num)
-;;;     (if (not mtdir) ;; require a home for this node to put or find databases
-;;; 	#f
-;;; 	(begin
-;;; 	  (if  (not (directory? pktdir))(create-directory pktdir))
-;;; 	  ;; server is started, now create pkt if needed
-;;; 	  (print "Starting server in " server-type " mode with port " port-num)
-;;; 	  (if (member server-type '(auto main)) ;; TODO: if auto, count number of servers registers, if > 3 then don't put out a pkt
-;;; 	      (begin
-;;; 		(area-pktid-set! acfg
-;;; 				 (write-alist->pkt
-;;; 				  pktdir 
-;;; 				  `((hostname . ,(get-host-name))
-;;; 				    (ipaddr   . ,best-ip)
-;;; 				    (port     . ,port-num)
-;;; 				    (pid      . ,(current-process-id)))
-;;; 				  pktspec: *pktspec*
-;;; 				  ptype:   'server))
-;;; 		(area-pktfile-set! acfg (conc pktdir "/" (area-pktid acfg) ".pkt"))))
-;;; 	  (area-port-set!    acfg port-num)
-;;; 	  #;(mutex-unlock! (area-mutex acfg))))))
-;;; 
-;;; (define *cookie-seqnum* 0)
-;;; (define (make-cookie key)
-;;;   (set! *cookie-seqnum* (add1 *cookie-seqnum*))
-;;;   ;;(print "MAKE COOKIE CALLED -- on "servkey"-"*cookie-seqnum*)
-;;;   (conc key "-" *cookie-seqnum*)
-;;;   )
-;;; 
-;;; ;; dispatch locally if possible
-;;; ;;
-;;; (define (call-deliver-response acfg ipaddr port cookie data)
-;;;   (if (and (equal? (area-myaddr acfg) ipaddr)
-;;; 	   (equal? (area-port     acfg) port))
-;;;       (deliver-response acfg cookie data)
-;;;       ((rpc:procedure 'response ipaddr port) cookie data)))
-;;; 
-;;; (define (deliver-response acfg cookie data)
-;;;   (let ((deliver-response-start (current-milliseconds)))
-;;;     (thread-start! (make-thread
-;;; 		    (lambda ()
-;;; 		      (let loop ((tries-left 5))
-;;; 			;;(print "TOP OF DELIVER_RESPONSE LOOP; triesleft="tries-left)
-;;; 			;;(pp (hash-table->alist (area-cookie2mbox acfg)))
-;;; 			(let* ((mbox (hash-table-ref/default (area-cookie2mbox acfg) cookie #f)))
-;;; 			  (cond
-;;; 			   ((eq? 0 tries-left)
-;;; 			    (print "ulex:deliver-response: I give up. Mailbox never appeared. cookie="cookie)
-;;; 			    )
-;;; 			   (mbox
-;;; 			    ;;(print "got mbox="mbox"  got data="data"  send.")
-;;; 			    (mailbox-send! mbox data))
-;;; 			   (else
-;;; 			    ;;(print "no mbox yet.  look for "cookie)
-;;; 			    (thread-sleep! (/ (- 6 tries-left) 10))
-;;; 			    (loop (sub1 tries-left))))))
-;;; 		      ;; (debug-pp (list (conc "ulex:deliver-response took " (- (current-milliseconds) deliver-response-start) " ms, cookie=" cookie " data=") data))
-;;; 		      (sdbg> "deliver-response" "mailbox-send" deliver-response-start #f #f cookie)
-;;; 		      )
-;;; 		    (conc "deliver-response thread for cookie="cookie))))
-;;;   #t)
-;;; 
-;;; ;; action:
-;;; ;;   immediate - quick actions, no need to put in queues
-;;; ;;   dbwrite   - put in dbwrite queue
-;;; ;;   dbread    - put in dbread queue
-;;; ;;   oslong    - os actions, e.g. du, that could take a long time
-;;; ;;   osshort   - os actions that should be quick, e.g. df
-;;; ;;
-;;; (define (request acfg from-ipaddr from-port servkey action cookie fname params) ;; std-peer-handler
-;;;   ;; NOTE: Use rpc:current-peer for getting return address
-;;;   (let* ((std-peer-handler-start (current-milliseconds))
-;;; 	 ;; (raw-data               (alist-ref 'data     dat))
-;;; 	 (rdat                   (hash-table-ref/default
-;;; 				  (area-rtable acfg) action #f)) ;; this looks up the sql query or other details indexed by the action
-;;; 	 (witem                  (make-witem ripaddr: from-ipaddr ;; rhost:   from-host   
-;;; 					     rport:   from-port   action:  action
-;;; 					     rdat:    rdat        cookie:  cookie
-;;; 					     servkey: servkey     data:    params ;; TODO - rename data to params
-;;; 					     caller:  (rpc:current-peer))))
-;;;     (if (not (equal? servkey (area-pktid acfg)))
-;;; 	`(#f . ,(conc "I don't know you servkey=" servkey ", pktid=" (area-pktid acfg))) ;; immediately return this
-;;; 	(let* ((ctype (if rdat 
-;;; 			  (calldat-ctype rdat) ;; is this necessary? these should be identical
-;;; 			  action)))
-;;; 	  (sdbg> "std-peer-handler" "immediate" std-peer-handler-start #f #f)
-;;; 	  (case ctype
-;;; 	    ;; (dbwrite acfg rdat (cons from-ipaddr from-port) data)))
-;;; 	    ((full-ping)  `(#t  "ack to full ping"        ,(work-queue-add acfg fname witem) ,cookie))
-;;; 	    ((response)   `(#t  "ack from requestor"      ,(deliver-response acfg fname params)))
-;;; 	    ((dbwrite)    `(#t  "db write submitted"      ,(work-queue-add acfg fname witem) ,cookie))
-;;; 	    ((dbread)     `(#t  "db read submitted"       ,(work-queue-add acfg fname witem) ,cookie  ))
-;;; 	    ((dbrw)       `(#t  "db read/write submitted" ,cookie))
-;;; 	    ((osshort)    `(#t  "os short submitted"      ,cookie))
-;;; 	    ((oslong)     `(#t  "os long submitted"       ,cookie))
-;;; 	    (else         `(#f  "unrecognised action"     ,ctype)))))))
-;;; 
-;;; ;; Call this to start the actual server
-;;; ;;
-;;; ;; start_server
-;;; ;;
-;;; ;;   mode: '
-;;; ;;   handler: proc which takes pktrecieved as argument
-;;; ;;
-;;; 
-;;; (define (start-server acfg)
-;;;   (let* ((conn (find-free-port-and-open acfg))
-;;; 	 (port (area-port acfg)))
-;;;     (rpc:publish-procedure!
-;;;      'delist-db
-;;;      (lambda (fname)
-;;;        (hash-table-delete! (area-dbs acfg) fname)))
-;;;     (rpc:publish-procedure!
-;;;      'calling-addr
-;;;      (lambda ()
-;;;        (rpc:current-peer)))
-;;;     (rpc:publish-procedure!
-;;;      'ping
-;;;      (lambda ()(real-ping acfg)))
-;;;     (rpc:publish-procedure!
-;;;      'request
-;;;      (lambda (from-addr from-port servkey action cookie dbname params)
-;;;        (request acfg from-addr from-port servkey action cookie dbname params)))
-;;;     (rpc:publish-procedure!
-;;;      'response
-;;;      (lambda (cookie res-dat)
-;;;        (deliver-response acfg cookie res-dat)))
-;;;     (area-ready-set! acfg #t)
-;;;     (area-conn-set! acfg conn)
-;;;     ((rpc:make-server conn) #f)));; ((tcp-listen (rpc:default-server-port)) #t)
-;;; 
-;;; 
-;;; (define (launch acfg) ;;  #!optional (proc std-peer-handler))
-;;;   (print "starting launch")
-;;;   (update-known-servers acfg) ;; gotta do this on every start (thus why limit number of publicised servers)
-;;;   #;(let ((original-handler (current-exception-handler))) ;; is th
-;;;     (lambda (exception)
-;;;       (server-exit-procedure)
-;;;       (original-handler exception)))
-;;;   (on-exit (lambda ()
-;;; 	     (shutdown acfg))) ;; (finalize-all-db-handles acfg)))
-;;;   ;; set up the rpc handler
-;;;   (let* ((th1  (make-thread
-;;; 		(lambda ()(start-server acfg))
-;;; 		"server thread"))
-;;; 	 (th2   (make-thread
-;;; 		 (lambda ()
-;;; 		   (print "th2 starting")
-;;; 		   (let loop ()
-;;; 		     (work-queue-processor acfg)
-;;; 		     (print "work-queue-processor crashed!")
-;;; 		     (loop)))
-;;; 		 "work queue thread")))
-;;;     (thread-start! th1)
-;;;     (thread-start! th2)
-;;;     (let loop ()
-;;;       (thread-sleep! 0.025)
-;;;       (if (area-ready acfg)
-;;; 	  #t
-;;; 	  (loop)))
-;;;     ;; attempt to fix my address
-;;;     (let* ((all-addr (get-all-ips-sorted)))	     ;; could use (tcp-addresses conn)?
-;;;       (let loop ((rem-addrs all-addr))
-;;; 	(if (null? rem-addrs)
-;;; 	    (begin
-;;; 	      (print "ERROR: Failed to figure out the ip address of myself as a server. Giving up.")
-;;; 	      (exit 1)) ;; BUG Changeme to raising an exception
-;;; 		
-;;; 	    (let* ((addr      (car rem-addrs))
-;;; 		   (good-addr (handle-exceptions
-;;; 				  exn
-;;; 				  #f
-;;; 				((rpc:procedure 'calling-addr addr (area-port acfg))))))
-;;; 	      (if good-addr
-;;; 		  (begin
-;;; 		    (print "Got good-addr of " good-addr)
-;;; 		    (area-myaddr-set! acfg good-addr))
-;;; 		  (loop (cdr rem-addrs)))))))
-;;;     (register-node acfg (area-myaddr acfg)(area-port acfg))
-;;;     (print "INFO: Server started on " (area-myaddr acfg) ":" (area-port acfg))
-;;;     ;; (update-known-servers acfg) ;; gotta do this on every start (thus why limit number of publicised servers)
-;;;     ))
-;;; 
-;;; (define (clear-server-pkt acfg)
-;;;   (let ((pktf (area-pktfile acfg)))
-;;;     (if pktf (delete-file* pktf))))
-;;; 
-;;; (define (shutdown acfg)
-;;;   (let (;;(conn (area-conn    acfg))
-;;; 	(pktf (area-pktfile acfg))
-;;; 	(port (area-port    acfg)))
-;;;     (if pktf (delete-file* pktf))
-;;;     (send-all "imshuttingdown")
-;;;     ;; (rpc:close-all-connections!) ;; don't know if this is actually needed
-;;;     (finalize-all-db-handles acfg)))
-;;; 
-;;; (define (send-all msg)
-;;;   #f)
-;;; 
-;;; ;; given a area record look up all the packets
-;;; ;;
-;;; (define (get-all-server-pkts acfg)
-;;;   (let ((all-pkt-files (glob (conc (area-pktsdir acfg) "/*.pkt"))))
-;;;     (map (lambda (pkt-file)
-;;; 	   (read-pkt->alist pkt-file pktspec: *pktspec*))
-;;; 	 all-pkt-files)))
-;;; 
-;;; #;((Z . "9a0212302295a19610d5796fce0370fa130758e9")
-;;;   (port . "34827")
-;;;   (pid . "28748")
-;;;   (hostname . "zeus")
-;;;   (T . "server")
-;;;   (D . "1549427032.0"))
-;;; 
-;;; #;(define (get-my-best-address)
-;;;   (let ((all-my-addresses (get-all-ips))) ;; (vector->list (hostinfo-addresses (hostname->hostinfo (get-host-name))))))
-;;;     (cond
-;;;      ((null? all-my-addresses)
-;;;       (get-host-name))                                          ;; no interfaces?
-;;;      ((eq? (length all-my-addresses) 1)
-;;;       (ip->string (car all-my-addresses)))                      ;; only one to choose from, just go with it
-;;;      (else 
-;;;       (ip->string (car (filter (lambda (x)                      ;; take any but 127.
-;;; 				 (not (eq? (u8vector-ref x 0) 127)))
-;;; 			       all-my-addresses)))))))
-;;; 
-;;; ;; whoami? I am my pkt
-;;; ;;
-;;; (define (whoami? acfg)
-;;;   (hash-table-ref/default (area-hosts acfg)(area-pktid acfg) #f))
-;;; 
-;;; ;;======================================================================
-;;; ;; "Client side" operations
-;;; ;;======================================================================
-;;; 
-;;; (define (safe-call call-key host port . params)
-;;;   (handle-exceptions
-;;;    exn
-;;;    (begin
-;;;      (print "Call " call-key " to " host ":" port " failed")
-;;;      #f)
-;;;    (apply (rpc:procedure call-key host port) params)))
-;;; 
-;;; ;; ;; convert to/from string / sexpr
-;;; ;; 
-;;; ;; (define (string->sexpr str)
-;;; ;;   (if (string? str)
-;;; ;;       (with-input-from-string str read)
-;;; ;;       str))
-;;; ;; 
-;;; ;; (define (sexpr->string s)
-;;; ;;   (with-output-to-string (lambda ()(write s))))
-;;; 
-;;; ;; is the server alive?
-;;; ;;
-;;; (define (ping acfg host port)
-;;;   (let* ((myaddr     (area-myaddr acfg))
-;;; 	 (myport     (area-port   acfg))
-;;; 	 (start-time (current-milliseconds))
-;;; 	 (res        (if (and (equal? myaddr host)
-;;; 			      (equal? myport port))
-;;; 			 (real-ping acfg)
-;;; 			 ((rpc:procedure 'ping host port)))))
-;;;     (cons (- (current-milliseconds) start-time)
-;;; 	  res)))
-;;; 
-;;; ;; returns ( ipaddr port alist-fname=>randnum )
-;;; (define (real-ping acfg)
-;;;   `(,(area-myaddr acfg) ,(area-port acfg) ,(get-host-stats acfg)))
-;;; 
-;;; ;; is the server alive AND the queues processing?
-;;; ;;
-;;; #;(define (full-ping acfg servpkt)
-;;;   (let* ((start-time (current-milliseconds))
-;;; 	 (res        (send-message acfg servpkt '(full-ping) 'full-ping)))
-;;;     (cons (- (current-milliseconds) start-time)
-;;; 	  res))) ;; (equal? res "got ping"))))
-;;; 
-;;; 
-;;; ;; look up all pkts and get the server id (the hash), port, host/ip
-;;; ;; store this info in acfg
-;;; ;; return the number of responsive servers found
-;;; ;;
-;;; ;; DO NOT VERIFY THAT THE SERVER IS ALIVE HERE. This is called at times where the current server is not yet alive and cannot ping itself
-;;; ;;
-;;; (define (update-known-servers acfg)
-;;;   ;; readll all pkts
-;;;   ;; foreach pkt; if it isn't me ping the server; if alive, add to hosts hash, else rm the pkt
-;;;   (let* ((start-time (current-milliseconds))
-;;; 	 (all-pkts  (delete-duplicates
-;;; 		     (append (get-all-server-pkts acfg)
-;;; 			     (hash-table-values (area-hosts acfg)))))
-;;; 	 (hostshash (area-hosts acfg))
-;;; 	 (my-id     (area-pktid acfg))
-;;; 	 (pktsdir   (area-pktsdir acfg)) ;; needed to remove pkts from non-responsive servers
-;;; 	 (numsrvs   0)
-;;; 	 (delpkt    (lambda (pktsdir sid)
-;;; 		      (print "clearing out server " sid)
-;;; 		      (delete-file* (conc pktsdir "/" sid ".pkt"))
-;;; 		      (hash-table-delete! hostshash sid))))
-;;;     (area-last-srvup-set! acfg (current-seconds))
-;;;     (for-each
-;;;      (lambda (servpkt)
-;;;        (if (list? servpkt)
-;;; 	   ;; (pp servpkt)
-;;; 	   (let* ((shost (alist-ref 'ipaddr servpkt))
-;;; 		  (sport (any->number (alist-ref 'port servpkt)))
-;;; 		  (res   (handle-exceptions
-;;; 			  exn
-;;; 			  (begin
-;;; 			    ;; (print "INFO: bad server on " shost ":" sport)
-;;; 			    #f)
-;;; 			  (ping acfg shost sport)))
-;;; 		  (sid   (alist-ref 'Z servpkt)) ;; Z code is our name for the server
-;;; 		  (url   (conc shost ":" sport))
-;;; 		  )
-;;; 	     #;(if (or (not res)
-;;; 		     (null? res))
-;;; 		 (begin
-;;; 		   (print "STRANGE: ping of " url " gave " res)))
-;;; 	     
-;;; 	     ;; (print "Got " res " from " shost ":" sport)
-;;; 	     (match res
-;;; 		    ((qduration . payload)
-;;; 		     ;; (print "Server pkt:" (alist-ref 'ipaddr servpkt) ":" (alist-ref 'port servpkt)
-;;; 		     ;;        (if payload
-;;; 		     ;;            "Success" "Fail"))
-;;; 		     (match payload
-;;; 			    ((host port stats)
-;;; 			     ;; (print "From " host ":" port " got stats: " stats)
-;;; 			     (if (and host port stats)
-;;; 				 (let ((url (conc host ":" port)))
-;;; 				   (hash-table-set! hostshash sid servpkt)
-;;; 				   ;; store based on host:port
-;;; 				   (hash-table-set! (area-hoststats acfg) sid stats))
-;;; 				 (print "missing data from the server, not sure what that means!"))
-;;; 			     (set! numsrvs (+ numsrvs 1)))
-;;; 			    (#f
-;;; 			     (print "Removing pkt " sid " due to #f from server or failed ping")
-;;; 			     (delpkt pktsdir sid))
-;;; 			    (else
-;;; 			     (print "Got ")(pp res)(print " from server ")(pp servpkt) " but response did not match (#f/#t . msg)")))
-;;; 		    (else
-;;; 		     ;; here we delete the pkt - can't reach the server, remove it
-;;; 		     ;; however this logic is inadequate. we should mark the server as checked
-;;; 		     ;; and not good, if it happens a second time - then remove the pkt
-;;; 		     ;; or something similar. I.e. don't be too quick to assume the server is wedged or dead
-;;; 		     ;; could be it is simply too busy to reply
-;;; 		     (let ((bad-pings (hash-table-ref/default (area-health acfg) url 0)))
-;;; 		       (if (> bad-pings 1) ;; two bad pings - remove pkt
-;;; 			   (begin
-;;; 			     (print "INFO: " bad-pings " bad responses from " url ", deleting pkt " sid)
-;;; 			     (delpkt pktsdir sid))
-;;; 			   (begin
-;;; 			     (print "INFO: " bad-pings " bad responses from " shost ":" sport " not deleting pkt yet")
-;;; 			     (hash-table-set! (area-health acfg)
-;;; 					      url
-;;; 					      (+ (hash-table-ref/default (area-health acfg) url 0) 1))
-;;; 			     ))
-;;; 		       ))))
-;;; 	   ;; servpkt is not actually a pkt?
-;;; 	   (begin
-;;; 	     (print "Bad pkt " servpkt))))
-;;;      all-pkts)
-;;;     (sdbg> "update-known-servers" "end" start-time #f #f " found " numsrvs
-;;; 	   " servers, pkts: " (map (lambda (p)
-;;; 				     (alist-ref 'Z p))
-;;; 				   all-pkts))
-;;;     numsrvs))
-;;; 
-;;; (defstruct srvstat
-;;;   (numfiles 0)   ;; number of db files handled by this server - subtract 1 for the db being currently looked at
-;;;   (randnum  #f)  ;; tie breaker number assigned to by the server itself - applies only to the db under consideration
-;;;   (pkt      #f)) ;; the server pkt
-;;; 
-;;; ;;(define (srv->srvstat srvpkt)
-;;;   
-;;; ;; Get the server best for given dbname and key
-;;; ;;
-;;; ;;   NOTE: key is not currently used. The key points to the kind of query, this may be useful for directing read-only queries.
-;;; ;;
-;;; (define (get-best-server acfg dbname key)
-;;;   (let* (;; (servers (hash-table-values (area-hosts acfg)))
-;;; 	 (servers     (area-hosts acfg))
-;;; 	 (skeys       (sort (hash-table-keys servers) string>=?)) ;; a stable listing
-;;; 	 (start-time  (current-milliseconds))
-;;; 	 (srvstats    (make-hash-table))  ;; srvid => srvstat
-;;; 	 (url         (conc (area-myaddr acfg) ":" (area-port acfg))))
-;;;     ;; (print "scores for " dbname ": " (map (lambda (k)(cons k (calc-server-score acfg dbname k))) skeys))
-;;;     (if (null? skeys)
-;;; 	(if (> (update-known-servers acfg) 0)
-;;; 	    (get-best-server acfg dbname key) ;; some risk of infinite loop here, TODO add try counter
-;;; 	    (begin
-;;; 	      (print "ERROR: no server found!") ;; since this process is also a server this should never happen
-;;; 	      #f))
-;;; 	(begin
-;;; 	  ;; (print "in get-best-server with skeys=" skeys)
-;;; 	  (if (> (- (current-seconds) (area-last-srvup acfg)) 10)
-;;; 	      (begin
-;;; 		(update-known-servers acfg)
-;;; 		(sdbg> "get-best-server" "update-known-servers" start-time #f #f)))
-;;; 
-;;; 	  ;; for each server look at the list of dbfiles, total number of dbs being handled
-;;; 	  ;; and the rand number, save the best host
-;;; 	  ;; also do a delist-db for each server dbfile not used
-;;; 	  (let* ((best-server       #f)
-;;; 		 (servers-to-delist (make-hash-table)))
-;;; 	    (for-each
-;;; 	     (lambda (srvid)
-;;; 	       (let* ((server    (hash-table-ref/default servers srvid #f))
-;;; 		      (stats     (hash-table-ref/default (area-hoststats acfg) srvid '(()))))
-;;; 		 ;; (print "stats: " stats)
-;;;  		 (if server
-;;; 		     (let* ((dbweights (car stats))
-;;; 			    (srvload   (length (filter (lambda (x)(not (equal? dbname (car x)))) dbweights)))
-;;; 			    (dbrec     (alist-ref dbname dbweights equal?))  ;; get the pair with fname . randscore
-;;; 			    (randnum   (if dbrec
-;;; 					   dbrec ;; (cdr dbrec)
-;;; 					   0)))
-;;; 		       (hash-table-set! srvstats srvid (make-srvstat numfiles: srvload randnum: randnum pkt: server))))))
-;;; 	     skeys)
-;;; 	    
-;;; 	    (let* ((sorted    (sort (hash-table-values srvstats) 
-;;; 				    (lambda (a b)
-;;; 				      (let ((numfiles-a (srvstat-numfiles a))
-;;; 					    (numfiles-b (srvstat-numfiles b))
-;;; 					    (randnum-a  (srvstat-randnum a))
-;;; 					    (randnum-b  (srvstat-randnum b)))
-;;; 					(if (< numfiles-a numfiles-b) ;; Note, I don't think adding an offset works here. Goal was only move file handling to a different server if it has 2 less
-;;; 					    #t
-;;; 					    (if (and (equal? numfiles-a numfiles-b)
-;;; 						     (< randnum-a randnum-b))
-;;; 						#t
-;;; 						#f))))))
-;;; 		   (best      (if (null? sorted)
-;;; 				  (begin
-;;; 				    (print "ERROR: should never be null due to self as server.")
-;;; 				    #f)
-;;; 				  (srvstat-pkt (car sorted)))))
-;;; 	      #;(print "SERVER(" url "): " dbname ": " (map (lambda (srv)
-;;; 							    (let ((p (srvstat-pkt srv)))
-;;; 							      (conc (alist-ref 'ipaddr p) ":" (alist-ref 'port p)
-;;; 								    "(" (srvstat-numfiles srv)","(srvstat-randnum srv)")")))
-;;; 							    sorted))
-;;; 	      best))))))
-;;;     
-;;;     ;; send out an "I'm about to exit notice to all known servers"
-;;;     ;;
-;;; (define (death-imminent acfg)
-;;;   '())
-;;; 
-;;; ;;======================================================================
-;;; ;; U L E X  -  T H E   I N T E R E S T I N G   S T U F F ! !
-;;; ;;======================================================================
-;;; 
-;;; ;; register a handler
-;;; ;;   NOTES:
-;;; ;;     dbinitsql   is reserved for a list of sql statements for initializing the db
-;;; ;;     dbinitfn    is reserved for a db init function, if exists called after dbinitsql
-;;; ;;     
-;;; (define (register acfg key obj #!optional (ctype 'dbwrite))
-;;;   (let ((ht (area-rtable acfg)))
-;;;     (if (hash-table-exists? ht key)
-;;; 	(print "WARNING: redefinition of entry " key))
-;;;     (hash-table-set! ht key (make-calldat obj: obj ctype: ctype))))
-;;; 
-;;; ;; usage: register-batch acfg '((key1 . sql1) (key2 . sql2) ... )
-;;; ;; NB// obj is often an sql query
-;;; ;;
-;;; (define (register-batch acfg ctype data)
-;;;   (let ((ht (area-rtable acfg)))
-;;;     (map (lambda (dat)
-;;; 	   (hash-table-set! ht (car dat)(make-calldat obj: (cdr dat) ctype: ctype)))
-;;; 	 data)))
-;;; 
-;;; (define (initialize-area-calls-from-specfile area specfile)
-;;;   (let* ((callspec (with-input-from-file specfile read )))
-;;;     (for-each (lambda (group)
-;;;                 (register-batch
-;;;                  area
-;;;                  (car group)
-;;;                  (cdr group)))
-;;;               callspec)))
-;;; 
-;;; ;; get-rentry
-;;; ;;
-;;; (define (get-rentry acfg key)
-;;;   (hash-table-ref/default (area-rtable acfg) key #f))
-;;; 
-;;; (define (get-rsql acfg key)
-;;;   (let ((cdat (get-rentry acfg key)))
-;;;     (if cdat
-;;; 	(calldat-obj cdat)
-;;; 	#f)))
-;;; 
-;;; 
-;;; 
-;;; ;; blocking call:
-;;; ;;    client                         server
-;;; ;;    ------                         ------
-;;; ;;    call()
-;;; ;;    send-message()
-;;; ;;    nmsg-send()
-;;; ;;                                   nmsg-receive()
-;;; ;;                                   nmsg-respond(ack,cookie)
-;;; ;;    ack, cookie
-;;; ;;    mbox-thread-wait(cookie)
-;;; ;;                                   nmsg-send(client,cookie,result)
-;;; ;;        nmsg-respond(ack)
-;;; ;;        return result
-;;; ;;
-;;; ;; reserved action:
-;;; ;;    'immediate
-;;; ;;    'dbinitsql
-;;; ;;
-;;; (define (call acfg dbname action params #!optional (count 0))
-;;;   (let* ((call-start-time     (current-milliseconds))
-;;; 	 (srv                 (get-best-server acfg dbname action))
-;;; 	 (post-get-start-time (current-milliseconds))
-;;; 	 (rdat                (hash-table-ref/default (area-rtable acfg) action #f))
-;;; 	 (myid                (trim-pktid (area-pktid acfg)))
-;;; 	 (srvid               (trim-pktid (alist-ref 'Z srv)))
-;;; 	 (cookie              (make-cookie myid)))
-;;;     (sdbg> "call" "get-best-server" call-start-time #f call-start-time " from: " myid " to server: " srvid " for " dbname " action: " action " params: " params " rdat: " rdat)
-;;;     (print "INFO: call to " (alist-ref 'ipaddr srv) ":" (alist-ref 'port srv) " from " (area-myaddr acfg) ":" (area-port acfg) " for " dbname)
-;;;     (if (and srv rdat) ;; need both to dispatch a request
-;;; 	(let* ((ripaddr  (alist-ref 'ipaddr srv))
-;;; 	       (rsrvid   (alist-ref 'Z srv))
-;;; 	       (rport    (any->number (alist-ref 'port   srv)))
-;;; 	       (res-full (if (and (equal? ripaddr (area-myaddr acfg))
-;;; 				  (equal? rport   (area-port acfg)))
-;;; 			     (request acfg ripaddr rport (area-pktid acfg) action cookie dbname params)
-;;; 			     (safe-call 'request ripaddr rport
-;;; 					(area-myaddr acfg)
-;;; 					(area-port   acfg)
-;;; 					#;(area-pktid acfg)
-;;; 					rsrvid
-;;; 					action cookie dbname params))))
-;;; 	  ;; (print "res-full: " res-full)
-;;; 	  (match res-full
-;;; 	    ((response-ok response-msg rem ...)
-;;; 	     (let* ((send-message-time (current-milliseconds))
-;;; 		    ;; (match res-full
-;;; 		    ;;  ((response-ok response-msg)
-;;; 		    ;; (response-ok  (car res-full))
-;;; 		    ;; (response-msg (cadr res-full)
-;;; 		    )
-;;; 	       ;; (res (take res-full 3))) ;; ctype == action, TODO: converge on one term <<=== what was this? BUG 
-;;; 	       ;; (print "ulex:call: send-message took " (- send-message-time post-get-start-time) " ms params=" params)
-;;; 	       (sdbg> "call" "send-message" post-get-start-time #f call-start-time)
-;;; 	       (cond
-;;; 		((not response-ok) #f)
-;;; 		((member response-msg '("db read submitted" "db write submitted"))
-;;; 		 (let* ((cookie-id   (cadddr res-full))
-;;; 			(mbox        (make-mailbox))
-;;; 			(mbox-time   (current-milliseconds)))
-;;; 		   (hash-table-set! (area-cookie2mbox acfg) cookie-id mbox)
-;;; 		   (let* ((mbox-timeout-secs    20)
-;;; 			  (mbox-timeout-result 'MBOX_TIMEOUT)
-;;; 			  (res                  (mailbox-receive! mbox mbox-timeout-secs mbox-timeout-result))
-;;; 			  (mbox-receive-time    (current-milliseconds)))
-;;; 		     (hash-table-delete! (area-cookie2mbox acfg) cookie-id)
-;;; 		     (sdbg> "call" "mailbox-receive" mbox-time #f call-start-time " from: " myid " to server: " srvid " for " dbname)
-;;; 		     ;; (print "ulex:call mailbox-receive took " (- mbox-receive-time mbox-time) "ms params=" params)
-;;; 		     res)))
-;;; 		(else
-;;; 		 (print "Unhandled response \""response-msg"\"")
-;;; 		 #f))
-;;; 	       ;; depending on what action (i.e. ctype) is we will block here waiting for
-;;; 	       ;; all the data (mechanism to be determined)
-;;; 	       ;;
-;;; 	       ;; if res is a "working on it" then wait
-;;; 	       ;;    wait for result
-;;; 	       ;; mailbox thread wait on 
-;;; 	       
-;;; 	       ;; if res is a "can't help you" then try a different server
-;;; 	       ;; if res is a "ack" (e.g. for one-shot requests) then return res
-;;; 	       ))
-;;; 	    (else
-;;; 	     (if (< count 10)
-;;; 		 (let* ((url (conc (alist-ref 'ipaddr srv) ":" (alist-ref 'port srv))))
-;;; 		   (thread-sleep! 1)
-;;; 		   (print "ERROR: Bad result from " url ", dbname: " dbname ", action: " action ", params: " params ". Trying again in 1 second.")
-;;; 		   (call acfg dbname action params (+ count 1)))
-;;; 		 (begin
-;;; 		   (error (conc "ERROR: " count " tries, still have improper response res-full=" res-full)))))))
-;;; 	(begin
-;;; 	  (if (not rdat)
-;;; 	      (print "ERROR: action " action " not registered.")
-;;; 	      (if (< count 10)
-;;; 		 (begin
-;;; 		   (thread-sleep! 1)
-;;; 		   (area-hosts-set! acfg (make-hash-table)) ;; clear out all known hosts
-;;; 		   (print "ERROR: no server found, srv=" srv ", trying again in 1 seconds")
-;;; 		   (call acfg dbname action params (+ count 1)))
-;;; 		 (begin
-;;; 		   (error (conc "ERROR: no server found after 10 tries, srv=" srv ", giving up."))
-;;; 		   #;(error "No server available"))))))))
-;;; 
-;;; 
-;;; ;;======================================================================
-;;; ;; U T I L I T I E S 
-;;; ;;======================================================================
-;;; 
-;;; ;; get a signature for identifing this process
-;;; ;;
-;;; (define (get-process-signature)
-;;;   (cons (get-host-name)(current-process-id)))
-;;; 
-;;; ;;======================================================================
-;;; ;; S Y S T E M   S T U F F
-;;; ;;======================================================================
-;;; 
-;;; ;; get normalized cpu load by reading from /proc/loadavg and
-;;; ;; /proc/cpuinfo return all three values and the number of real cpus
-;;; ;; and the number of threads returns alist '((adj-cpu-load
-;;; ;; . normalized-proc-load) ... etc.  keys: adj-proc-load,
-;;; ;; adj-core-load, 1m-load, 5m-load, 15m-load
-;;; ;;
-;;; (define (get-normalized-cpu-load)
-;;;   (let ((res (get-normalized-cpu-load-raw))
-;;; 	(default `((adj-proc-load . 2) ;; there is no right answer
-;;; 		   (adj-core-load . 2)
-;;; 		   (1m-load       . 2)
-;;; 		   (5m-load       . 0) ;; causes a large delta - thus causing default of throttling if stuff goes wrong
-;;; 		   (15m-load      . 0)
-;;; 		   (proc          . 1)
-;;; 		   (core          . 1)
-;;; 		   (phys          . 1)
-;;; 		   (error         . #t))))
-;;;     (cond
-;;;      ((and (list? res)
-;;; 	   (> (length res) 2))
-;;;       res)
-;;;      ((eq? res #f)   default) ;; add messages?
-;;;      ((eq? res #f) default)   ;; this would be the #eof
-;;;      (else default))))
-;;; 
-;;; (define (get-normalized-cpu-load-raw)
-;;;   (let* ((actual-host           (get-host-name))) ;; #f is localhost
-;;;     (let ((data  (append 
-;;; 		  (with-input-from-file "/proc/loadavg" read-lines)
-;;; 		  (with-input-from-file "/proc/cpuinfo" read-lines)
-;;; 		  (list "end")))
-;;; 	  (load-rx  (regexp "^([\\d\\.]+)\\s+([\\d\\.]+)\\s+([\\d\\.]+)\\s+.*$"))
-;;; 	  (proc-rx  (regexp "^processor\\s+:\\s+(\\d+)\\s*$"))
-;;; 	  (core-rx  (regexp "^core id\\s+:\\s+(\\d+)\\s*$"))
-;;; 	  (phys-rx  (regexp "^physical id\\s+:\\s+(\\d+)\\s*$"))
-;;; 	  (max-num  (lambda (p n)(max (string->number p) n))))
-;;;       ;; (print "data=" data)
-;;;       (if (null? data) ;; something went wrong
-;;; 	  #f
-;;; 	  (let loop ((hed      (car data))
-;;; 		     (tal      (cdr data))
-;;; 		     (loads    #f)
-;;; 		     (proc-num 0)  ;; processor includes threads
-;;; 		     (phys-num 0)  ;; physical chip on motherboard
-;;; 		     (core-num 0)) ;; core
-;;; 	    ;; (print hed ", " loads ", " proc-num ", " phys-num ", " core-num)
-;;; 	    (if (null? tal) ;; have all our data, calculate normalized load and return result
-;;; 		(let* ((act-proc (+ proc-num 1))
-;;; 		       (act-phys (+ phys-num 1))
-;;; 		       (act-core (+ core-num 1))
-;;; 		       (adj-proc-load (/ (car loads) act-proc))
-;;; 		       (adj-core-load (/ (car loads) act-core))
-;;; 		       (result
-;;; 			(append (list (cons 'adj-proc-load adj-proc-load)
-;;; 				      (cons 'adj-core-load adj-core-load))
-;;; 				(list (cons '1m-load (car loads))
-;;; 				      (cons '5m-load (cadr loads))
-;;; 				      (cons '15m-load (caddr loads)))
-;;; 				(list (cons 'proc act-proc)
-;;; 				      (cons 'core act-core)
-;;; 				      (cons 'phys act-phys)))))
-;;; 		  result)
-;;; 		(regex-case
-;;; 		    hed
-;;; 		  (load-rx  ( x l1 l5 l15 ) (loop (car tal)(cdr tal)(map string->number (list l1 l5 l15)) proc-num phys-num core-num))
-;;; 		  (proc-rx  ( x p         ) (loop (car tal)(cdr tal) loads           (max-num p proc-num) phys-num core-num))
-;;; 		  (phys-rx  ( x p         ) (loop (car tal)(cdr tal) loads           proc-num (max-num p phys-num) core-num))
-;;; 		  (core-rx  ( x c         ) (loop (car tal)(cdr tal) loads           proc-num phys-num (max-num c core-num)))
-;;; 		  (else 
-;;; 		   (begin
-;;; 		     ;; (print "NO MATCH: " hed)
-;;; 		     (loop (car tal)(cdr tal) loads proc-num phys-num core-num))))))))))
-;;; 
-;;; (define (get-host-stats acfg)
-;;;   (let ((stats-hash (area-stats acfg)))
-;;;     ;; use this opportunity to remove references to dbfiles which have not been accessed in a while
-;;;     (for-each
-;;;      (lambda (dbname)
-;;;        (let* ((stats       (hash-table-ref stats-hash dbname))
-;;; 	      (last-access (stat-when stats)))
-;;; 	 (if (and (> last-access 0)                             ;; if zero then there has been no access
-;;; 		  (> (- (current-seconds) last-access) 10))     ;; not used in ten seconds
-;;; 	     (begin
-;;; 	       (print "Removing " dbname " from stats list")
-;;; 	       (hash-table-delete! stats-hash dbname) ;; remove from stats hash
-;;; 	       (stat-dbs-set! stats (hash-table-keys stats))))))
-;;;      (hash-table-keys stats-hash))
-;;;     
-;;;     `(,(hash-table->alist (area-dbs acfg)) ;; dbname => randnum
-;;;       ,(map (lambda (dbname)  ;; dbname is the db name
-;;; 	      (cons dbname (stat-when (hash-table-ref stats-hash dbname))))
-;;; 	    (hash-table-keys stats-hash))
-;;;       (cpuload . ,(get-normalized-cpu-load)))))
-;;;     #;(stats   . ,(map (lambda (k) ;; create an alist from the stats data
-;;; 		       (cons k (stat->alist (hash-table-ref (area-stats acfg) k))))
-;;; 		     (hash-table-keys (area-stats acfg))))
-;;; 
-;;; #;(trace
-;;;  ;; assv
-;;;  ;; cdr
-;;;  ;; caar
-;;;  ;; ;; cdr
-;;;  ;; call
-;;;  ;; finalize-all-db-handles
-;;;  ;; get-all-server-pkts
-;;;  ;; get-normalized-cpu-load
-;;;  ;; get-normalized-cpu-load-raw
-;;;  ;; launch
-;;;  ;; nmsg-send
-;;;  ;; process-db-queries
-;;;  ;; receive-message
-;;;  ;; std-peer-handler
-;;;  ;; update-known-servers
-;;;  ;; work-queue-processor
-;;;  )
-;;; 
-;;; ;;======================================================================
-;;; ;; netutil
-;;; ;;   move this back to ulex-netutil.scm someday?
-;;; ;;======================================================================
-;;; 
-;;; ;; #include <stdio.h>
-;;; ;; #include <netinet/in.h>
-;;; ;; #include <string.h>
-;;; ;; #include <arpa/inet.h>
-;;; 
-;;; (foreign-declare "#include \"sys/types.h\"")
-;;; (foreign-declare "#include \"sys/socket.h\"")
-;;; (foreign-declare "#include \"ifaddrs.h\"")
-;;; (foreign-declare "#include \"arpa/inet.h\"")
-;;; 
-;;; ;; get IP addresses from ALL interfaces
-;;; (define get-all-ips
-;;;   (foreign-safe-lambda* scheme-object ()
-;;;     "
-;;; 
-;;; // from https://stackoverflow.com/questions/17909401/linux-c-get-default-interfaces-ip-address :
-;;; 
-;;; 
-;;;     C_word lst = C_SCHEME_END_OF_LIST, len, str, *a;
-;;; //    struct ifaddrs *ifa, *i;
-;;; //    struct sockaddr *sa;
-;;; 
-;;;     struct ifaddrs * ifAddrStruct = NULL;
-;;;     struct ifaddrs * ifa = NULL;
-;;;     void * tmpAddrPtr = NULL;
-;;; 
-;;;     if ( getifaddrs(&ifAddrStruct) != 0)
-;;;       C_return(C_SCHEME_FALSE);
-;;; 
-;;; //    for (i = ifa; i != NULL; i = i->ifa_next) {
-;;;     for (ifa = ifAddrStruct; ifa != NULL; ifa = ifa->ifa_next) {
-;;;         if (ifa->ifa_addr->sa_family==AF_INET) { // Check it is
-;;;             // a valid IPv4 address
-;;;             tmpAddrPtr = &((struct sockaddr_in *)ifa->ifa_addr)->sin_addr;
-;;;             char addressBuffer[INET_ADDRSTRLEN];
-;;;             inet_ntop(AF_INET, tmpAddrPtr, addressBuffer, INET_ADDRSTRLEN);
-;;; //            printf(\"%s IP Address %s\\n\", ifa->ifa_name, addressBuffer);
-;;;             len = strlen(addressBuffer);
-;;;             a = C_alloc(C_SIZEOF_PAIR + C_SIZEOF_STRING(len));
-;;;             str = C_string(&a, len, addressBuffer);
-;;;             lst = C_a_pair(&a, str, lst);
-;;;         } 
-;;; 
-;;; //        else if (ifa->ifa_addr->sa_family==AF_INET6) { // Check it is
-;;; //            // a valid IPv6 address
-;;; //            tmpAddrPtr = &((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr;
-;;; //            char addressBuffer[INET6_ADDRSTRLEN];
-;;; //            inet_ntop(AF_INET6, tmpAddrPtr, addressBuffer, INET6_ADDRSTRLEN);
-;;; ////            printf(\"%s IP Address %s\\n\", ifa->ifa_name, addressBuffer);
-;;; //            len = strlen(addressBuffer);
-;;; //            a = C_alloc(C_SIZEOF_PAIR + C_SIZEOF_STRING(len));
-;;; //            str = C_string(&a, len, addressBuffer);
-;;; //            lst = C_a_pair(&a, str, lst);
-;;; //       }
-;;; 
-;;; //       else {
-;;; //         printf(\" not an IPv4 address\\n\");
-;;; //       }
-;;; 
-;;;     }
-;;; 
-;;;     freeifaddrs(ifa);
-;;;     C_return(lst);
-;;; 
-;;; "))
-;;; 
-;;; ;; Change this to bias for addresses with a reasonable broadcast value?
-;;; ;;
-;;; (define (ip-pref-less? a b)
-;;;   (let* ((rate (lambda (ipstr)
-;;;                  (regex-case ipstr
-;;;                              ( "^127\\." _ 0 )
-;;;                              ( "^(10\\.0|192\\.168\\.)\\..*" _ 1 )
-;;;                              ( else 2 ) ))))
-;;;     (< (rate a) (rate b))))
-;;;   
-;;; 
-;;; (define (get-my-best-address)
-;;;   (let ((all-my-addresses (get-all-ips))
-;;;         ;;(all-my-addresses-old (vector->list (hostinfo-addresses (hostname->hostinfo (get-host-name)))))
-;;;         )
-;;;     (cond
-;;;      ((null? all-my-addresses)
-;;;       (get-host-name))                                          ;; no interfaces?
-;;;      ((eq? (length all-my-addresses) 1)
-;;;       (car all-my-addresses))                      ;; only one to choose from, just go with it
-;;;      
-;;;      (else
-;;;       (car (sort all-my-addresses ip-pref-less?)))
-;;;      ;; (else 
-;;;      ;;  (ip->string (car (filter (lambda (x)                      ;; take any but 127.
-;;;      ;;    			 (not (eq? (u8vector-ref x 0) 127)))
-;;;      ;;    		       all-my-addresses))))
-;;; 
-;;;      )))
-;;; 
-;;; (define (get-all-ips-sorted)
-;;;   (sort (get-all-ips) ip-pref-less?))
-;;; 
-;;; 
-
+  (map address-info-host
+       (filter (lambda (x)
+		 (equal? (address-info-type x) "tcp"))
+	       (address-infos (get-host-name)))))
+
+)

Index: utils/nbfake
==================================================================
--- utils/nbfake
+++ utils/nbfake
@@ -39,10 +39,11 @@
 nbfake behavior can be changed by setting the following env vars:
    NBFAKE_HOST       SSH to \$NBFAKE_HOST and run command
    NBFAKE_LOG        Logfile for nbfake output
    NB_WASH_GROUPS    comma-separated list of groups to wash into
    NB_WASH_ENABLED   must be set in order to enable wash groups
+   NBFAKE_QUIET      set to suppress informational output
 
 __EOF
   exit
 fi
 
@@ -87,19 +88,21 @@
 
 #==============================================================================
 # Run and log
 #==============================================================================
 
+if [[ -z "$NBFAKE_QUIET" ]];then
 cat <<__EOF >&2
 #======================================================================
 # NBFAKE logging command to: $MY_NBFAKE_LOG
 #     $WASHCMD $*
 #======================================================================
 __EOF
+fi
 
 if [[ -z "$MY_NBFAKE_HOST" ]]; then
   # Run locally
   sh -c "cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=\"$PATH\"; nohup $WASHCMD $* >> $MY_NBFAKE_LOG 2>&1 &"
 else
   # run remotely
   ssh -X -n -f $MY_NBFAKE_HOST "sh -c \"cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=\"$PATH\"; nohup $WASHCMD $* >> $MY_NBFAKE_LOG 2>&1 &\""
 fi