Index: Makefile
==================================================================
--- Makefile
+++ Makefile
@@ -15,31 +15,54 @@
 #     You should have received a copy of the GNU General Public License
 #     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 # make install CSCOPTS='-accumulate-profile -profile-name $(PWD)/profile-ww$(shell date +%V.%u)'
 # rm <files>.o ; make install CSCOPTS='-profile' ; ... ;  chicken-profile | less
+
+all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut 
+
+recent-commits.csv : .fslckout
+	fossil timeline -n 350 -t ci -F "%h,%a,%b,%t,\"%c\"" > recent-commits.csv
+
+
 SHELL=/bin/bash
 PREFIX=$(PWD)
 CSCOPTS=
 INSTALL=install
 SRCFILES = common.scm items.scm launch.scm ods.scm runconfig.scm	\
            server.scm configf.scm db.scm keys.scm margs.scm		\
            process.scm runs.scm tasks.scm tests.scm genexample.scm	\
-           http-transport.scm tdb.scm client.scm mt.scm	\
-           ezsteps.scm lock-queue.scm rmt.scm api.scm		\
-           subrun.scm portlogger.scm archive.scm env.scm		\
+           tdb.scm mt.scm	\
+           ezsteps.scm rmt.scm api.scm		\
+           subrun.scm archive.scm env.scm		\
            diff-report.scm cgisetup/models/pgdb.scm
 
 # module source files
-MSRCFILES = dbfile.scm debugprint.scm mtargs.scm commonmod.scm dbmod.scm
+MSRCFILES = dbfile.scm debugprint.scm mtargs.scm commonmod.scm dbmod.scm \
+            tcp-transportmod.scm rmtmod.scm portlogger.scm
 
-all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut tcmt
+transport-mode.scm : transport-mode.scm.template
+	cp transport-mode.scm.template transport-mode.scm
+
+dashboard-transport-mode.scm : dashboard-transport-mode.scm.template
+	cp dashboard-transport-mode.scm.template dashboard-transport-mode.scm
+
+megatest.scm : transport-mode.scm
+dashboard.scm : dashboard-transport-mode.scm
 
 # dbmod.import.o is just a hack here
-mofiles/dbfile.o     : mofiles/debugprint.o mofiles/commonmod.o dbmod.import.o
-db.o : dbmod.import.o
+mofiles/portlogger.o : mofiles/dbmod.o
+
+mofiles/dbfile.o     : \
+       mofiles/debugprint.o mofiles/commonmod.o
+
+configf.o : commonmod.import.o
+mofiles/dbfile.o : mofiles/debugprint.o
+mofiles/rmtmod.o mofiles/dbmod.o : mofiles/dbfile.o mofiles/commonmod.o mofiles/debugprint.o
+db.o : mofiles/dbmod.o mofiles/dbfile.o
 mofiles/debugprint.o : mofiles/mtargs.o
+mofiles/tcp-transportmod.o : mofiles/portlogger.o
 
 # ftail.scm rmtmod.scm commonmod.scm removed
 # MSRCFILES = ducttape-lib.scm pkts.scm stml2.scm cookie.scm mutils.scm	\
 #             mtargs.scm commonmod.scm dbmod.scm adjutant.scm ulex.scm	\
 #             rmtmod.scm apimod.scm
@@ -63,13 +86,14 @@
 #	@[ -e mofiles ] || mkdir -p mofiles
 #	csc $(CSCOPTS) -I mofiles -I $* -J -c $< -o $*.o
 #	cp $*.o mofiles/$*.o
 #	@touch $*.import.scm # ensure it is touched after the .o is made
 
-mofiles/%.o : %.scm
-	mkdir -p mofiles
+%.import.scm mofiles/%.o : %.scm
+	@mkdir -p mofiles
 	csc $(CSCOPTS) -J -c $< -o mofiles/$*.o
+	@if [[ -e $*.import.scm ]];then touch $*.import.scm;fi # ensure it is touched after the .o is made
 
 ADTLSCR=mt_laststep mt_runstep mt_ezstep
 HELPERS=$(addprefix $(PREFIX)/bin/,$(ADTLSCR))
 DEPLOYHELPERS=$(addprefix deploytarg/,$(ADTLSCR))
 MTESTHASH=$(shell fossil info|grep checkout:| awk '{print $$2}')
@@ -96,49 +120,45 @@
 	@echo $(MTESTHASH)
 
 dboard : $(OFILES) $(GOFILES) dashboard.scm $(MOFILES) $(MOIMPFILES) megatest-version.scm megatest-fossil-hash.scm
 	csc $(CSCOPTS) $(OFILES) dashboard.scm $(GOFILES) $(MOFILES) $(MOIMPFILES) -o dboard
 
-mtut: $(OFILES) $(MOFILES) megatest-fossil-hash.scm mtut.scm megatest-version.scm
-	csc $(CSCOPTS) $(OFILES) $(MOFILES) mtut.scm -o mtut
+mtut: $(OFILES) $(MOFILES) $(MOIMPFILES) megatest-fossil-hash.scm mtut.scm megatest-version.scm 
+	csc $(CSCOPTS) $(OFILES) $(MOFILES)  $(MOIMPFILES) mtut.scm -o mtut
 
 # include makefile.inc
 
-TCMTOBJS = \
-	api.o \
-	archive.o \
-	cgisetup/models/pgdb.o \
-	client.o \
-	common.o \
-	configf.o \
-	db.o \
-	env.o \
-	http-transport.o \
-	items.o \
-	keys.o \
-	launch.o \
-	lock-queue.o \
-	margs.o \
-	mt.o \
-	ods.o \
-	portlogger.o \
-	process.o \
-	rmt.o \
-	runconfig.o \
-	runs.o \
-	server.o \
-	tasks.o \
-	tdb.o \
-	tests.o \
-	subrun.o \
-        ezsteps.o
-
-#        mofiles/rmtmod.o \
-#        mofiles/commonmod.o \
-
-tcmt : $(TCMTOBJS) tcmt.scm megatest-version.scm $(MOFILES) $(MOIMPFILES)
-	csc $(CSCOPTS) $(TCMTOBJS) $(MOFILES) $(MOIMPFILES) tcmt.scm -o tcmt
+# TCMTOBJS = \
+# 	api.o \
+# 	archive.o \
+# 	cgisetup/models/pgdb.o \
+# 	common.o \
+# 	configf.o \
+# 	db.o \
+# 	env.o \
+# 	items.o \
+# 	keys.o \
+# 	launch.o \
+# 	margs.o \
+# 	mt.o \
+# 	ods.o \
+# 	process.o \
+# 	rmt.o \
+# 	runconfig.o \
+# 	runs.o \
+# 	server.o \
+# 	tasks.o \
+# 	tdb.o \
+# 	tests.o \
+# 	subrun.o \
+#         ezsteps.o
+# 
+# #        mofiles/rmtmod.o \
+# #        mofiles/commonmod.o \
+# 
+# tcmt : $(TCMTOBJS) tcmt.scm megatest-version.scm $(MOFILES) $(MOIMPFILES)
+# 	csc $(CSCOPTS) $(TCMTOBJS) $(MOFILES) $(MOIMPFILES) tcmt.scm -o tcmt
 
 # install documentation to $(PREFIX)/docs
 # DOES NOT REBUILD DOCS
 #
 $(PREFIX)/share/docs/megatest_manual.html : docs/manual/megatest_manual.html
@@ -156,11 +176,11 @@
 	$(INSTALL) mt-pg.sql $(PREFIX)/share/db/mt-pg.sql
 
 # Special dependencies for the includes
 $(MOFILE) $(MOIMPFILES) : megatest-fossil-hash.scm
 
-mofiles/commonmod.o : megatest-fossil-hash.scm
+mofiles/commonmod.o : mofiles/debugprint.o megatest-fossil-hash.scm
 common.o : mofiles/commonmod.o
 
 # mofiles/dbmod.o : mofiles/configfmod.o
 
 # commonmod.o dashboard.o megatest.o tcmt.o apimod.o : megatest-fossil-hash.scm
@@ -176,16 +196,17 @@
 
 tests.o tasks.o dashboard-tasks.o : task_records.scm
 
 runs.o : test_records.scm
 
-mofiles-made : $(MOFILES)
-	make $(MOIMPFILES)
+# mofiles-made : $(MOFILES)
+# 	make $(MOIMPFILES)
+# 	touch mofiles-made
 
-megatest.o : mofiles-made megatest-fossil-hash.scm megatest-version.scm $(MOFILES) $(MOIMPFILES)
+megatest.o : megatest-fossil-hash.scm megatest-version.scm $(MOFILES) $(MOIMPFILES)
 
-rmt.scm client.scm common.scm configf.scm dashboard-guimonitor.scm dashboard-tests.scm dashboard.scm db.scm dcommon.scm ezsteps.scm http-transport.scm index-tree.scm items.scm keys.scm launch.scm megatest.scm monitor.scm mt.scm newdashboard.scm runconfig.scm runs.scm server.scm tdb.scm tests.scm tree.scm : common_records.scm megatest-version.scm
+rmt.scm common.scm configf.scm dashboard-guimonitor.scm dashboard-tests.scm dashboard.scm db.scm dcommon.scm ezsteps.scm index-tree.scm items.scm keys.scm launch.scm megatest.scm monitor.scm mt.scm newdashboard.scm runconfig.scm runs.scm server.scm tdb.scm tests.scm tree.scm : common_records.scm megatest-version.scm
 
 common_records.scm : altdb.scm
 
 mofiles/dbfile.o : mofiles/commonmod.o
 
@@ -260,16 +281,16 @@
 	utils/mk_wrapper $(PREFIX) mtexec $(PREFIX)/bin/mtexec
 	chmod a+x $(PREFIX)/bin/mtexec
 
 # tcmt
 
-$(PREFIX)/bin/.$(ARCHSTR)/tcmt : tcmt
-	$(INSTALL) tcmt $(PREFIX)/bin/.$(ARCHSTR)/tcmt
-
-$(PREFIX)/bin/tcmt : $(PREFIX)/bin/.$(ARCHSTR)/tcmt utils/mk_wrapper
-	utils/mk_wrapper $(PREFIX) tcmt $(PREFIX)/bin/tcmt
-	chmod a+x $(PREFIX)/bin/tcmt
+# $(PREFIX)/bin/.$(ARCHSTR)/tcmt : tcmt
+# 	$(INSTALL) tcmt $(PREFIX)/bin/.$(ARCHSTR)/tcmt
+# 
+# $(PREFIX)/bin/tcmt : $(PREFIX)/bin/.$(ARCHSTR)/tcmt utils/mk_wrapper
+# 	utils/mk_wrapper $(PREFIX) tcmt $(PREFIX)/bin/tcmt
+# 	chmod a+x $(PREFIX)/bin/tcmt
 
 $(PREFIX)/bin/mt_laststep : utils/mt_laststep
 	$(INSTALL) $< $@
 	chmod a+x $@
 
@@ -359,22 +380,22 @@
 	$(INSTALL) lib/libxcb-xlib.so.0 $(PREFIX)/bin/.$(ARCHSTR)/lib/libxcb-xlib.so.0; \
         fi
 
 install : $(PREFIX)/bin/.$(ARCHSTR) $(PREFIX)/bin/.$(ARCHSTR)/mtest $(PREFIX)/bin/megatest \
           $(PREFIX)/bin/.$(ARCHSTR)/dboard $(PREFIX)/bin/dashboard $(HELPERS) $(PREFIX)/bin/nbfake \
-          $(PREFIX)/bin/.$(ARCHSTR)/mtexec $(PREFIX)/bin/mtexec $(PREFIX)/bin/serialize-env \
 	  $(PREFIX)/bin/nbfind $(PREFIX)/bin/mtrunner $(PREFIX)/bin/viewscreen $(PREFIX)/bin/mt_xterm \
           $(PREFIX)/bin/mt-old-to-new.sh $(PREFIX)/bin/mt-new-to-old.sh \
 	  $(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun \
 	  $(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun $(PREFIX)/bin/mtutil \
-	  $(PREFIX)/bin/tcmt $(PREFIX)/share/db/mt-pg.sql \
+	  $(PREFIX)/share/db/mt-pg.sql \
           $(PREFIX)/share/js/jquery-3.1.0.slim.min.js \
           $(PREFIX)/bin/.$(ARCHSTR)/lib/libpangox-1.0.so \
           $(PREFIX)/bin/.$(ARCHSTR)/lib/libpangox-1.0.so.0 \
-          $(PREFIX)/bin/.$(ARCHSTR)/lib/libxcb-xlib.so.0
-#         $(PREFIX)/bin/.$(ARCHSTR)/ndboard
+          $(PREFIX)/bin/.$(ARCHSTR)/lib/libxcb-xlib.so.0 $(PREFIX)/bin/serialize-env
 
+#         $(PREFIX)/bin/.$(ARCHSTR)/ndboard $(PREFIX)/bin/tcmt 
+#         $(PREFIX)/bin/.$(ARCHSTR)/mtexec $(PREFIX)/bin/mtexec $(PREFIX)/bin/serialize-env \
 # $(PREFIX)/bin/newdashboard
 
 $(PREFIX)/bin/.$(ARCHSTR) : 
 	mkdir -p $(PREFIX)/bin/.$(ARCHSTR)
 	mkdir -p $(PREFIX)/bin/.$(ARCHSTR)/lib
@@ -471,35 +492,42 @@
 	fi
 
 altdb.scm :
 	echo ";; optional alternate db setup" > altdb.scm
 	echo "(define *available-db* (make-hash-table))" >> altdb.scm
-	if  csi -ne '(use mysql-client)';then \
+	if  csi -ne '(use mysql-client)' &> /dev/null;then \
            echo "(use mysql-client)(hash-table-set! *available-db* 'mysql #t)" >> altdb.scm; \
 	fi
-	if csi -ne '(use postgresql)';then \
+	if csi -ne '(use postgresql)'&> /dev/null;then \
 	   echo "(use postgresql)(hash-table-set! *available-db* 'postgresql #t)" >> altdb.scm;\
 	fi
+	if  csi -ne '(import mysql-client)'&> /dev/null;then \
+           echo "(import mysql-client)(hash-table-set! *available-db* 'mysql #t)" >> altdb.scm; \
+	fi
+	if csi -ne '(import postgresql)'&> /dev/null;then \
+	   echo "(import postgresql)(hash-table-set! *available-db* 'postgresql #t)" >> altdb.scm;\
+	fi
+
+# portlogger-example : portlogger-example.scm api.o archive.o common.o configf.o dashboard-tests.o dashboard-context-menu.o db.o dcommon.o ezsteps.o genexample.o gutils.o items.o keys.o launch.o lock-queue.o margs.o mt.o ods.o portlogger.o process.o rmt.o runconfig.o runs.o server.o synchash.o tasks.o tdb.o tests.o tree.o
+#	csc $(CSCOPTS) portlogger-example.scm api.o archive.o common.o configf.o dashboard-tests.o dashboard-context-menu.o db.o dcommon.o ezsteps.o genexample.o gutils.o items.o keys.o launch.o lock-queue.o margs.o mt.o ods.o portlogger.o process.o rmt.o runconfig.o runs.o server.o sync-hash.o tasks.o tdb.o tests.o tree.o
+
+unitdeps.dot : *scm ./utils/plot-uses Makefile
+	./utils/plot-uses todot commonmod.import,mtargs.import,mtargs,debugprint *.scm > unitdeps.dot
+
+unitdeps.pdf : unitdeps.dot
+	dot unitdeps.dot -Tpdf -o unitdeps.pdf
 
-portlogger-example : portlogger-example.scm api.o archive.o client.o common.o configf.o dashboard-tests.o dashboard-context-menu.o db.o dcommon.o ezsteps.o genexample.o gutils.o http-transport.o items.o keys.o launch.o lock-queue.o margs.o mt.o ods.o portlogger.o process.o rmt.o runconfig.o runs.o server.o synchash.o tasks.o tdb.o tests.o tree.o
-	csc $(CSCOPTS) portlogger-example.scm api.o archive.o client.o common.o configf.o dashboard-tests.o dashboard-context-menu.o db.o dcommon.o ezsteps.o genexample.o gutils.o http-transport.o items.o keys.o launch.o lock-queue.o margs.o mt.o ods.o portlogger.o process.o rmt.o runconfig.o runs.o server.o synchash.o tasks.o tdb.o tests.o tree.o
+./utils/plot-uses : utils/plot-uses.scm
+	csc utils/plot-uses.scm
 
 # create a pdf dot graphviz diagram from notations in rmt.scm
 rmt.pdf : rmt.scm
 	grep ';;DOT' rmt.scm | sed -e 's/.*;;DOT //' > rmt.dot;dot -Tpdf rmt.dot -o rmt.pdf
 
 buildmanual:
 	cd docs/manual && make
 
-wikipage=plan
-editwiki:
-	cd docs/manual && ../../utils/editwiki $(wikipage)
-
-viewmanual:
-	arora docs/manual/megatest_manual.html
-
 targets:
 	@grep : Makefile | perl -ne '/^([A-Za-z0-9_-]+):/ && print "$$1\n"'
-
 
 unit :
 	cd tests;make unit

Index: TODO
==================================================================
--- TODO
+++ TODO
@@ -16,10 +16,13 @@
 #     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 TODO
 ====
 
+23WW07
+. Remove use of *dbstruct-dbs*
+
 WW15
 . fill newview matrix with data, filter pipeline gui elements
 . improve [script], especially indent handling
 
 WW16

Index: api.scm
==================================================================
--- api.scm
+++ api.scm
@@ -1,7 +1,5 @@
-
-
 ;;======================================================================
 ;; Copyright 2006-2013, Matthew Welland.
 ;; 
 ;; This file is part of Megatest.
 ;; 
@@ -18,21 +16,30 @@
 ;;     You should have received a copy of the GNU General Public License
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 ;;
 ;;======================================================================
 
-(use srfi-69 posix)
-
 (declare (unit api))
-(declare (uses rmt))
 (declare (uses db))
+(declare (uses debugprint))
+(declare (uses commonmod))
 (declare (uses dbmod))
 (declare (uses dbfile))
 (declare (uses tasks))
+(declare (uses tcp-transportmod))
 
+(import commonmod)
 (import dbmod)
 (import dbfile)
+(import debugprint)
+(import tcp-transportmod)
+
+(use srfi-69
+     srfi-18
+     posix
+     matchable
+     s11n)
 
 ;; allow these queries through without starting a server
 ;;
 (define api:read-only-queries
   '(get-key-val-pairs
@@ -39,10 +46,11 @@
     get-var
     get-keys
     get-key-vals
     test-toplevel-num-items
     get-test-info-by-id
+    get-test-state-status-by-id
     get-steps-info-by-id
     get-data-info-by-id
     test-get-rundir-from-test-id
     get-count-tests-running-for-testname
     get-count-tests-running
@@ -59,12 +67,12 @@
     get-run-info
     get-run-status
     get-run-state
     get-run-stats
     get-run-times
-    get-targets
     get-target
+    get-targets
     ;; register-run
     get-tests-tags
     get-test-times
     get-tests-for-run
     get-tests-for-run-state-status
@@ -86,11 +94,10 @@
     read-test-data-varpatt
     login
     tasks-get-last
     testmeta-get-record
     have-incompletes?
-    ;; synchash-get
     get-changed-record-ids
     get-run-record-ids 
     get-not-completed-cnt))
 
 (define api:write-queries
@@ -126,11 +133,11 @@
     ;; TEST DATA
     test-data-rollup
     csv->test-data
 
     ;; MISC
-    sync-inmem->db
+    sync-cachedb->db
     drop-all-triggers
     create-all-triggers
     update-tesdata-on-repilcate-db 
 
     ;; TESTMETA
@@ -141,262 +148,335 @@
     tasks-add
     tasks-set-state-given-param-key
     ))
 
 (define *db-write-mutexes* (make-hash-table))
-
-;; These are called by the server on recipt of /api calls
-;;    - keep it simple, only return the actual result of the call, i.e. no meta info here
-;;
-;;    - returns #( flag result )
-;;
-(define (api:execute-requests dbstruct dat)
-  (db:open-no-sync-db) ;; sets *no-sync-db*
-;;   (handle-exceptions
-;;    exn
-;;    (let ((call-chain (get-call-chain)))
-;;      (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn)
-;;      (print-call-chain (current-error-port))
-;;      (debug:print 0 *default-log-port* " message: "  ((condition-property-accessor 'exn 'message) exn))       
-  ;;      (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
-  (if (> *api-process-request-count* 200)
-      (begin
-	(if (common:low-noise-print 30 "too many threads")
-	    (debug:print 0 *default-log-port* "WARNING: "*api-process-request-count*" threads, potential overload, adding 0.5 sec delay."))
-	(thread-sleep! 0.5) ;; take a nap
-	))
-   (cond
-    ((not (vector? dat))                    ;; it is an error to not receive a vector
-     (vector #f (vector #f "remote must be called with a vector")))
-    #;((> *api-process-request-count* 200) ;; 20)
-     (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.")
-     (set! *server-overloaded* #t)
-     (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
-    (else  
-     (let* ((cmd-in            (vector-ref dat 0))
-            (cmd               (if (symbol? cmd-in)
-				   cmd-in
-				   (string->symbol cmd-in)))
-            (params            (vector-ref dat 1))
-	    (run-id            (if (null? params)
-				   0
-				   (car params)))
-	    (write-mutex       (if (hash-table-exists? *db-write-mutexes* run-id)
-				   (hash-table-ref *db-write-mutexes* run-id)
-				   (let* ((newmutex (make-mutex)))
-				     (hash-table-set! *db-write-mutexes* run-id newmutex)
-				     newmutex)))
-            (start-t           (current-milliseconds))
-            (readonly-mode     (dbr:dbstruct-read-only dbstruct))
-            (readonly-command  (member cmd api:read-only-queries))
-            (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))))
-       (if (not readonly-command)
-	   (mutex-lock! write-mutex))
-       (let* ((res    
-               (if writecmd-in-readonly-mode
-                   (conc "attempt to run write command "cmd" on a read-only database")
-                   (case cmd
-                     ;;===============================================
-                     ;; READ/WRITE QUERIES
-                     ;;===============================================
-
-                     ((get-keys-write)                        (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl
-                     
-                     ;; SERVERS
-                     ((start-server)                    (apply server:kind-run params))
-                     ((kill-server)                     (set! *server-run* #f))
-
-                     ;; TESTS
-
-                     ;;((test-set-state-status-by-id)     (apply mt:test-set-state-status-by-id dbstruct params))
-                     ;;BB - commented out above because it was calling below, eventually, incorrectly (dbstruct passed to mt:test-set-state-status-by-id, which previosly did more, but now only passes thru to db:set-state-status-and-roll-up-items.
-                     ((test-set-state-status-by-id)
-
-                      ;; (define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment)
-                      (db:set-state-status-and-roll-up-items
-                       dbstruct
-                       (list-ref params 0) ; run-id
-                       (list-ref params 1) ; test-name
-                       #f                  ; item-path
-                       (list-ref params 2) ; state
-                       (list-ref params 3) ; status
-                       (list-ref params 4) ; comment
-                       ))
-                     
-                     ((delete-test-records)             (apply db:delete-test-records dbstruct params))
-                     ((delete-old-deleted-test-records) (apply db:delete-old-deleted-test-records dbstruct params))
-                     ((test-set-state-status)           (apply db:test-set-state-status dbstruct params))
-                     ((test-set-top-process-pid)        (apply db:test-set-top-process-pid dbstruct params))
-                     ((set-state-status-and-roll-up-items) (apply db:set-state-status-and-roll-up-items dbstruct params))
-                     ((set-state-status-and-roll-up-run) (apply db:set-state-status-and-roll-up-run dbstruct params)) 
-                     ((top-test-set-per-pf-counts)      (apply db:top-test-set-per-pf-counts dbstruct params))
-                     ((test-set-archive-block-id)       (apply db:test-set-archive-block-id dbstruct params))
-
-                     ;; RUNS
-                     ((register-run)                 (apply db:register-run dbstruct params))
-                     ((set-tests-state-status)       (apply db:set-tests-state-status dbstruct params))
-                     ((delete-run)                   (apply db:delete-run dbstruct params))
-                     ((lock/unlock-run)              (apply db:lock/unlock-run dbstruct params))
-                     ((update-run-event_time)        (apply db:update-run-event_time dbstruct params))
-                     ((update-run-stats)             (apply db:update-run-stats dbstruct params))
-                     ((set-var)                      (apply db:set-var dbstruct params))
-                     ((inc-var)                      (apply db:inc-var dbstruct params))
-		     ((dec-var)                      (apply db:dec-var dbstruct params))
-                     ((del-var)                      (apply db:del-var dbstruct params))
-		     ((add-var)                      (apply db:add-var dbstruct params))
-
-                     ;; STEPS
-                     ((teststep-set-status!)         (apply db:teststep-set-status! dbstruct params))
-                     ((delete-steps-for-test!)       (apply db:delete-steps-for-test! dbstruct params))
-                     
-                     ;; TEST DATA
-                     ((test-data-rollup)             (apply db:test-data-rollup dbstruct params))
-                     ((csv->test-data)               (apply db:csv->test-data dbstruct params))
-
-                     ;; MISC
-                     ((sync-inmem->db)               (let ((run-id (car params)))
-                                                       (db:sync-touched dbstruct run-id db:initialize-main-db force-sync: #t)))
-                     ((mark-incomplete)              (apply db:find-and-mark-incomplete dbstruct params))
-                     ((create-all-triggers)          (db:create-all-triggers dbstruct))
-                     ((drop-all-triggers)            (db:drop-all-triggers dbstruct)) 
-
-                     ;; TESTMETA
-                     ((testmeta-add-record)       (apply db:testmeta-add-record dbstruct params))
-                     ((testmeta-update-field)     (apply db:testmeta-update-field dbstruct params))
-                     ((get-tests-tags)            (db:get-tests-tags dbstruct))
-
-                     ;; TASKS
-                     ((tasks-add)                 (apply tasks:add dbstruct params))   
-                     ((tasks-set-state-given-param-key) (apply tasks:set-state-given-param-key dbstruct params))
-                     ((tasks-get-last)            (apply tasks:get-last dbstruct params))
-
-		     ;; NO SYNC DB
-		     ((no-sync-set)               (apply db:no-sync-set         *no-sync-db* params))
-		     ((no-sync-get/default)       (apply db:no-sync-get/default *no-sync-db* params))
-		     ((no-sync-del!)              (apply db:no-sync-del!        *no-sync-db* params))
-		     ((no-sync-get-lock)          (apply db:no-sync-get-lock    *no-sync-db* params))
-		     
-                     ;; ARCHIVES
-                     ;; ((archive-get-allocations)   
-                     ((archive-register-disk)     (apply db:archive-register-disk dbstruct params))
-                     ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params))
-                     ;; ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey))
-
-                     ;;======================================================================
-                     ;; READ ONLY QUERIES
-                     ;;======================================================================
-
-                     ;; KEYS
-                     ((get-key-val-pairs)               (apply db:get-key-val-pairs dbstruct params))
-                     ((get-keys)                        (db:get-keys dbstruct))
-                     ((get-key-vals)                    (apply db:get-key-vals dbstruct params))
-                     ((get-target)                      (apply db:get-target dbstruct params))
-                     ((get-targets)                     (db:get-targets dbstruct))
-
-                     ;; ARCHIVES
-                     ((test-get-archive-block-info)     (apply db:test-get-archive-block-info dbstruct params))
-                     
-                     ;; TESTS
-                     ((test-toplevel-num-items)         (apply db:test-toplevel-num-items dbstruct params))
-                     ((get-test-info-by-id)	       (apply db:get-test-info-by-id dbstruct params))
-                     ((test-get-rundir-from-test-id)    (apply db:test-get-rundir-from-test-id dbstruct params))
-                     ((get-count-tests-running-for-testname) (apply db:get-count-tests-running-for-testname dbstruct params))
-                     ((get-count-tests-running)         (apply db:get-count-tests-running dbstruct params))
-                     ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params))
-                     ;; ((delete-test-step-records)        (apply db:delete-test-step-records dbstruct params))
-                     ;; ((get-previous-test-run-record)    (apply db:get-previous-test-run-record dbstruct params))
-                     ((get-matching-previous-test-run-records)(apply db:get-matching-previous-test-run-records dbstruct params))
-                     ((test-get-logfile-info)           (apply db:test-get-logfile-info dbstruct params))
-                     ((test-get-records-for-index-file)  (apply db:test-get-records-for-index-file dbstruct params))
-                     ((get-testinfo-state-status)       (apply db:get-testinfo-state-status dbstruct params))
-                     ((test-get-top-process-pid)        (apply db:test-get-top-process-pid dbstruct params))
-                     ((test-get-paths-matching-keynames-target-new) (apply db:test-get-paths-matching-keynames-target-new dbstruct params))
-                     ((get-prereqs-not-met)             (apply db:get-prereqs-not-met dbstruct params))
-                     ((get-count-tests-running-for-run-id) (apply db:get-count-tests-running-for-run-id dbstruct params))
-                     ((get-not-completed-cnt)           (apply db:get-not-completed-cnt  dbstruct params)) 
-                     ;; ((synchash-get)                    (apply synchash:server-get dbstruct params))
-                     ((get-raw-run-stats)               (apply db:get-raw-run-stats dbstruct params))
-		     ((get-test-times)                  (apply db:get-test-times dbstruct params))
-
-                     ;; RUNS
-                     ((get-run-info)                 (apply db:get-run-info dbstruct params))
-                     ((get-run-status)               (apply db:get-run-status dbstruct params))
-                     ((get-run-state)                (apply db:get-run-state dbstruct params))
-                     ((set-run-status)               (apply db:set-run-status dbstruct params))
-                     ((set-run-state-status)  			 (apply db:set-run-state-status dbstruct params))
-                     ((update-tesdata-on-repilcate-db) (apply db:update-tesdata-on-repilcate-db  dbstruct params)) 
-                     ((get-tests-for-run)            (apply db:get-tests-for-run dbstruct params))
-                     ((get-tests-for-run-state-status) (apply db:get-tests-for-run-state-status dbstruct params))
-                     ((get-test-id)                  (apply db:get-test-id dbstruct params))
-                     ((get-tests-for-run-mindata)    (apply db:get-tests-for-run-mindata dbstruct params))
-                     ;; ((get-tests-for-runs-mindata)   (apply db:get-tests-for-runs-mindata dbstruct params))
-                     ((get-runs)                     (apply db:get-runs dbstruct params))
-                     ((simple-get-runs)              (apply db:simple-get-runs dbstruct params))
-                     ((get-num-runs)                 (apply db:get-num-runs dbstruct params))
-                     ((get-runs-cnt-by-patt)         (apply db:get-runs-cnt-by-patt dbstruct params))
-                     ((get-all-run-ids)              (db:get-all-run-ids dbstruct))
-                     ((get-prev-run-ids)             (apply db:get-prev-run-ids dbstruct params))
-                     ((get-run-ids-matching-target)  (apply db:get-run-ids-matching-target dbstruct params))
-                     ((get-runs-by-patt)             (apply db:get-runs-by-patt dbstruct params))
-                     ((get-run-name-from-id)         (apply db:get-run-name-from-id dbstruct params))
-                     ((get-main-run-stats)           (apply db:get-main-run-stats dbstruct params))
-                     ((get-var)                      (apply db:get-var dbstruct params))
-                     ((get-run-stats)                (apply db:get-run-stats dbstruct params))
-                     ((get-run-times)                (apply db:get-run-times dbstruct params)) 
-
-                     ;; STEPS
-                     ((get-steps-data)               (apply db:get-steps-data dbstruct params))
-                     ((get-steps-for-test)           (apply db:get-steps-for-test dbstruct params))
-		     ((get-steps-info-by-id)         (apply db:get-steps-info-by-id dbstruct params))
-
-                     ;; TEST DATA
-                     ((read-test-data)               (apply db:read-test-data dbstruct params))
-                     ((read-test-data-varpatt)       (apply db:read-test-data-varpatt dbstruct params))
-                     ((get-data-info-by-id)          (apply db:get-data-info-by-id dbstruct params)) 
-
-                     ;; MISC
-                     ((get-latest-host-load)         (apply db:get-latest-host-load dbstruct params))
-                     ((have-incompletes?)            (apply db:have-incompletes? dbstruct params))
-                     ((login)                        (apply db:login dbstruct params))
-                     ((general-call)                 (let ((stmtname   (car params))
-                                                           (run-id     (cadr params))
-                                                           (realparams (cddr params)))
-                                                       (db:general-call dbstruct run-id stmtname realparams)))
-                     ((sdb-qry)                      (apply sdb:qry params))
-                     ((ping)                         (current-process-id))
-		     ((get-changed-record-ids)       (apply db:get-changed-record-ids dbstruct params))
-		     ((get-run-record-ids) 	   (apply db:get-run-record-ids dbstruct params))	
-                     ;; TESTMETA
-                     ((testmeta-get-record)       (apply db:testmeta-get-record dbstruct params))
-
-                     ;; TASKS 
-                     ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))
-		     (else
-		      (debug:print 0 *default-log-port* "ERROR: bad api call " cmd)
-		      (conc "ERROR: BAD api call " cmd))))))
-	 (if (not readonly-command)
-	     (mutex-unlock! write-mutex))
-	 
-	 ;; save all stats
-	 (let ((delta-t (- (current-milliseconds)
-			   start-t))
-	       (modified-cmd (if (eq? cmd 'general-call)
-				 (string->symbol (conc "general-call-" (car params)))
-				 cmd)))
-	   (hash-table-set! *db-api-call-time* modified-cmd
-			    (cons delta-t (hash-table-ref/default *db-api-call-time* modified-cmd '()))))
-	 (if writecmd-in-readonly-mode
-             (begin
-               #;(common:telemetry-log (conc "api-out:"(->string cmd))
-               payload: `((params . ,params)
-               (ok-res . #t)))
-	       (vector #f res))
-             (begin
-               #;(common:telemetry-log (conc "api-out:"(->string cmd))
-               payload: `((params . ,params)
-               (ok-res . #f)))
-               (vector #t res))))))))
+(define *server-signature* #f)
+;; ;; These are called by the server on recipt of /api calls
+;; ;;    - keep it simple, only return the actual result of the call, i.e. no meta info here
+;; ;;
+;; ;;    - returns #( flag result )
+;; ;;
+;; (define (api:execute-requests dbstruct dat)
+;;   (if (> *api-process-request-count* 50)
+;;       (begin
+;; 	(if (common:low-noise-print 30 "too many threads")
+;; 	    (debug:print 0 *default-log-port* "WARNING: "*api-process-request-count*" threads, potential overload, adding 0.5 sec delay."))
+;; 	;; (thread-sleep! 0.5) ;; take a nap - no, the napping is moved to the clients via tt:backoff-incr
+;; 	))
+;;   (cond
+;;    ((not (vector? dat))                    ;; it is an error to not receive a vector
+;;     (vector #f (vector #f "remote must be called with a vector")))
+;;    (else  
+;;     (let* ((cmd-in            (vector-ref dat 0))
+;;            (cmd               (if (symbol? cmd-in)
+;; 				  cmd-in
+;; 				  (string->symbol cmd-in)))
+;;            (params            (vector-ref dat 1))
+;; 	   (run-id            (if (null? params)
+;; 				  0
+;; 				  (car params)))
+;; 	   (write-mutex       (if (hash-table-exists? *db-write-mutexes* run-id)
+;; 				  (hash-table-ref *db-write-mutexes* run-id)
+;; 				  (let* ((newmutex (make-mutex)))
+;; 				    (hash-table-set! *db-write-mutexes* run-id newmutex)
+;; 				    newmutex)))
+;;            (start-t           (current-milliseconds))
+;;            (readonly-mode     (dbr:dbstruct-read-only dbstruct))
+;;            (readonly-command  (member cmd api:read-only-queries))
+;;            (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))))
+;;       (if (not readonly-command)
+;; 	  (mutex-lock! write-mutex))
+;;       (let* ((tmppath    (dbr:dbstruct-tmppath  dbstruct))
+;; 	     (clean-run-id (cond
+;; 			    ((number? run-id)   run-id)
+;; 			    ((equal? run-id #f) "main")
+;; 			    (else               "other")))
+;; 	     (crumbfile  (dbfile:wait-for-qif tmppath clean-run-id (cons cmd params)))
+;; 	     (res    
+;;               (if writecmd-in-readonly-mode
+;;                   (conc "attempt to run write command "cmd" on a read-only database")
+;; 		  (api:dispatch-request dbstruct cmd run-id params))))
+;; 	(delete-file* crumbfile)
+;; 	(if (not readonly-command)
+;; 	    (mutex-unlock! write-mutex))
+;; 	
+;; 	;; save all stats
+;; 	(let ((delta-t (- (current-milliseconds)
+;; 			  start-t))
+;; 	      (modified-cmd (if (eq? cmd 'general-call)
+;; 				(string->symbol (conc "general-call-" (car params)))
+;; 				cmd)))
+;; 	  (hash-table-set! *db-api-call-time* modified-cmd
+;; 			   (cons delta-t (hash-table-ref/default *db-api-call-time* modified-cmd '()))))
+;; 	(if writecmd-in-readonly-mode
+;;             (begin
+;;               #;(common:telemetry-log (conc "api-out:"(->string cmd))
+;;               payload: `((params . ,params)
+;;               (ok-res . #t)))
+;; 	      (vector #f res))
+;;             (begin
+;;               #;(common:telemetry-log (conc "api-out:"(->string cmd))
+;;               payload: `((params . ,params)
+;;               (ok-res . #f)))
+;;               (vector #t res))))))))
+
+;; indat is (cmd run-id params meta)
+;;
+;; WARNING: Do not print anything in the lambda of this function as it
+;;          reads/writes to current in/out port
+;;
+(define (api:tcp-dispatch-request-make-handler dbstruct) ;; cmd run-id params)
+  (assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.")
+  (if (not *server-signature*)
+      (set! *server-signature* (tt:mk-signature *toppath*)))
+  (lambda (indat)
+    (let* (;; (indat      (deserialize))
+	   (newcount   (+ *api-process-request-count* 1))
+	   (delay-wait (if (> newcount 10)
+			   (- newcount 10)
+			   0))
+	   (normal-proc (lambda (cmd run-id params)
+			  (case cmd
+			    ((ping) *server-signature*)
+			    (else
+			     (api:dispatch-request dbstruct cmd run-id params))))))
+      (set! *api-process-request-count* newcount)
+      (set! *db-last-access* (current-seconds))
+      (match indat
+	((cmd run-id params meta)
+	 (let* ((db-ok  (let* ((dbfname (dbmod:run-id->dbfname run-id))
+			       (ok      (equal? dbfname (dbr:dbstruct-dbfname dbstruct))))
+			  (case cmd
+			    ((ping) #t) ;; we are fine
+			    (else
+			     (if (not ok)(debug:print 0 *default-log-port* "ERROR: "cmd", run-id "run-id", not correct for dbfname "(dbr:dbstruct-dbfname dbstruct)))
+			     (assert ok "FATAL: database file and run-id not aligned.")))))
+		(ttdat   *server-info*)
+		(server-state (tt-state ttdat))
+		(status  (cond
+			  ((> newcount 5) 'busy)
+			  ;; ((> newcount 5) 'loaded) ;; this gets transmitted to the client which calls tt:backoff-incr to slow stuff down.
+			  (else 'ok)))
+		(errmsg  (case status
+			   ((busy)   (conc "Server overloaded, "newcount" threads in flight"))
+			   ((loaded) (conc "Server loaded, "newcount" threads in flight"))
+			   (else     #f)))
+		(result  (case status
+			   ((busy)
+			    (if (eq? cmd 'ping)
+				(normal-proc cmd run-id params)
+				;; newcount must be greater than 5 for busy
+				(- newcount 4) ;; was 15
+				)) ;; (- newcount 29)) ;; call back in as many seconds
+			   ((loaded)
+;; 			    (if (eq? (rmt:transport-mode) 'tcp)
+;; 				(thread-sleep! 0.5))
+			    (normal-proc cmd run-id params))
+			   (else
+			    (normal-proc cmd run-id params))))
+		(meta   (case cmd
+			  ((ping) `((sstate . ,server-state)))
+			  (else   `((wait . ,delay-wait)))))
+		(payload (list status errmsg result meta)))
+	   (set! *api-process-request-count* (- *api-process-request-count* 1))
+	   ;; (serialize payload)
+	   payload))
+	(else
+	 (assert #f "FATAL: failed to deserialize indat "indat))))))
+       
+
+(define (api:dispatch-request dbstruct cmd run-id params)
+  (if (not *no-sync-db*)
+      (db:open-no-sync-db))
+  (case cmd
+    ;;===============================================
+    ;; READ/WRITE QUERIES
+    ;;===============================================
+
+    ((get-keys-write)                        (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl
+    
+    ;; SERVERS
+    ((start-server)                    (apply server:kind-run params))
+    ((kill-server)                     (set! *server-run* #f))
+
+    ;; TESTS
+
+    ;;((test-set-state-status-by-id)     (apply mt:test-set-state-status-by-id dbstruct params))
+    ;;BB - commented out above because it was calling below, eventually, incorrectly (dbstruct passed to mt:test-set-state-status-by-id, which previosly did more, but now only passes thru to db:set-state-status-and-roll-up-items.
+    ((test-set-state-status-by-id)
+
+     ;; (define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment)
+     (db:set-state-status-and-roll-up-items
+      dbstruct
+      (list-ref params 0) ; run-id
+      (list-ref params 1) ; test-name
+      #f                  ; item-path
+      (list-ref params 2) ; state
+      (list-ref params 3) ; status
+      (list-ref params 4) ; comment
+      ))
+    
+    ((delete-test-records)             (apply db:delete-test-records dbstruct params))
+    ((delete-old-deleted-test-records) (apply db:delete-old-deleted-test-records dbstruct params))
+    ((test-set-state-status)           (apply db:test-set-state-status dbstruct params))
+    ((test-set-top-process-pid)        (apply db:test-set-top-process-pid dbstruct params))
+    ((set-state-status-and-roll-up-items) (apply db:set-state-status-and-roll-up-items dbstruct params))
+    ((set-state-status-and-roll-up-run) (apply db:set-state-status-and-roll-up-run dbstruct params)) 
+    ((top-test-set-per-pf-counts)       (apply db:top-test-set-per-pf-counts dbstruct params))
+    ((test-set-archive-block-id)        (apply db:test-set-archive-block-id dbstruct params))
+
+    ((insert-test)                      (db:insert-test dbstruct run-id params))
+
+    ;; RUNS
+    ((register-run)                 (apply db:register-run dbstruct params))
+    ((set-tests-state-status)       (apply db:set-tests-state-status dbstruct params))
+    ((delete-run)                   (apply db:delete-run dbstruct params))
+    ((lock/unlock-run)              (apply db:lock/unlock-run dbstruct params))
+    ((update-run-event_time)        (apply db:update-run-event_time dbstruct params))
+    ((update-run-stats)             (apply db:update-run-stats dbstruct params))
+    ((set-var)                      (apply db:set-var dbstruct params))
+    ((inc-var)                      (apply db:inc-var dbstruct params))
+    ((dec-var)                      (apply db:dec-var dbstruct params))
+    ((del-var)                      (apply db:del-var dbstruct params))
+    ((add-var)                      (apply db:add-var dbstruct params))
+
+    ((insert-run)                   (apply db:insert-run dbstruct params))
+
+    ;; STEPS
+    ((teststep-set-status!)         (apply db:teststep-set-status! dbstruct params))
+    ((delete-steps-for-test!)       (apply db:delete-steps-for-test! dbstruct params))
+    
+    ;; TEST DATA
+    ((test-data-rollup)             (apply db:test-data-rollup dbstruct params))
+    ((csv->test-data)               (apply db:csv->test-data dbstruct params))
+
+    ;; MISC
+    ((sync-cachedb->db)               (let ((run-id (car params)))
+                                      (db:sync-touched dbstruct run-id db:initialize-main-db force-sync: #t)))
+    ((get-toplevels-and-incompletes) (apply db:get-toplevels-and-incompletes dbstruct params))
+    ((mark-incomplete)              #f);;(thread-start! (make-thread (lambda () ;; no need to block on this one
+					;;			  (apply db:find-and-mark-incomplete dbstruct params)
+					;;			  #t)))) 
+    ((create-all-triggers)          (db:create-all-triggers dbstruct))
+    ((drop-all-triggers)            (db:drop-all-triggers dbstruct)) 
+
+    ;; TESTMETA
+    ((testmeta-add-record)       (apply db:testmeta-add-record dbstruct params))
+    ((testmeta-update-field)     (apply db:testmeta-update-field dbstruct params))
+    ((get-tests-tags)            (db:get-tests-tags dbstruct))
+
+    ;; TASKS
+    ((tasks-add)                 (apply tasks:add dbstruct params))   
+    ((tasks-set-state-given-param-key) (apply tasks:set-state-given-param-key dbstruct params))
+    ((tasks-get-last)            (apply tasks:get-last dbstruct params))
+
+    ;; NO SYNC DB
+    ((no-sync-set)               (apply db:no-sync-set         *no-sync-db* params))
+    ((no-sync-get/default)       (apply db:no-sync-get/default *no-sync-db* params))
+    ((no-sync-del!)              (apply db:no-sync-del!        *no-sync-db* params))
+    ((no-sync-get-lock)          (apply db:no-sync-get-lock    *no-sync-db* params))
+    
+    ;; ARCHIVES
+    ;; ((archive-get-allocations)   
+    ((archive-register-disk)     (apply db:archive-register-disk dbstruct params))
+    ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params))
+    ;; ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey))
+
+    ;;======================================================================
+    ;; READ ONLY QUERIES
+    ;;======================================================================
+
+    ;; KEYS
+    ((get-key-val-pairs)               (apply db:get-key-val-pairs dbstruct params))
+    ((get-keys)                        (db:get-keys dbstruct))
+    ((get-key-vals)                    (apply db:get-key-vals dbstruct params))
+    ((get-target)                      (apply db:get-target dbstruct params))
+    ((get-targets)                     (db:get-targets dbstruct))
+
+    ;; ARCHIVES
+    ((test-get-archive-block-info)     (apply db:test-get-archive-block-info dbstruct params))
+    
+    ;; TESTS
+    ((test-toplevel-num-items)         (apply db:test-toplevel-num-items dbstruct params))
+    ((get-test-info-by-id)	       (apply db:get-test-info-by-id dbstruct params))
+    ((get-test-state-status-by-id)     (apply db:get-test-state-status-by-id dbstruct params))
+    ((test-get-rundir-from-test-id)    (apply db:test-get-rundir-from-test-id dbstruct params))
+    ((get-count-tests-running-for-testname) (apply db:get-count-tests-running-for-testname dbstruct params))
+    ((get-count-tests-running)         (apply db:get-count-tests-running dbstruct params))
+    ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params))
+    ;; ((delete-test-step-records)        (apply db:delete-test-step-records dbstruct params))
+    ;; ((get-previous-test-run-record)    (apply db:get-previous-test-run-record dbstruct params))
+    ((get-matching-previous-test-run-records)(apply db:get-matching-previous-test-run-records dbstruct params))
+    ((test-get-logfile-info)           (apply db:test-get-logfile-info dbstruct params))
+    ((test-get-records-for-index-file)  (apply db:test-get-records-for-index-file dbstruct params))
+    ((get-testinfo-state-status)       (apply db:get-testinfo-state-status dbstruct params))
+    ((test-get-top-process-pid)        (apply db:test-get-top-process-pid dbstruct params))
+    ((test-get-paths-matching-keynames-target-new) (apply db:test-get-paths-matching-keynames-target-new dbstruct params))
+    ((get-prereqs-not-met)             (apply db:get-prereqs-not-met dbstruct params))
+    ((get-count-tests-running-for-run-id) (apply db:get-count-tests-running-for-run-id dbstruct params))
+    ((get-not-completed-cnt)           (apply db:get-not-completed-cnt  dbstruct params)) 
+    ((get-raw-run-stats)               (apply db:get-raw-run-stats dbstruct params))
+    ((get-test-times)                  (apply db:get-test-times dbstruct params))
+
+    ;; RUNS
+    ((get-run-info)                 (apply db:get-run-info dbstruct params))
+    ((get-run-status)               (apply db:get-run-status dbstruct params))
+    ((get-run-state)                (apply db:get-run-state dbstruct params))
+    ((get-run-state-status)         (apply db:get-run-state-status dbstruct params))
+    ((set-run-status)               (apply db:set-run-status dbstruct params))
+    ((set-run-state-status)  			 (apply db:set-run-state-status dbstruct params))
+    ((update-tesdata-on-repilcate-db) (apply db:update-tesdata-on-repilcate-db  dbstruct params)) 
+    ((get-tests-for-run)            (apply db:get-tests-for-run dbstruct params))
+    ((get-tests-for-run-state-status) (apply db:get-tests-for-run-state-status dbstruct params))
+    ((get-test-id)                  (apply db:get-test-id dbstruct params))
+    ((get-tests-for-run-mindata)    (apply db:get-tests-for-run-mindata dbstruct params))
+    ;; ((get-tests-for-runs-mindata)   (apply db:get-tests-for-runs-mindata dbstruct params))
+    ((get-runs)                     (apply db:get-runs dbstruct params))
+    ((simple-get-runs)              (apply db:simple-get-runs dbstruct params))
+    ((get-num-runs)                 (apply db:get-num-runs dbstruct params))
+    ((get-runs-cnt-by-patt)         (apply db:get-runs-cnt-by-patt dbstruct params))
+    ((get-all-run-ids)              (db:get-all-run-ids dbstruct))
+    ((get-prev-run-ids)             (apply db:get-prev-run-ids dbstruct params))
+    ((get-run-ids-matching-target)  (apply db:get-run-ids-matching-target dbstruct params))
+    ((get-runs-by-patt)             (apply db:get-runs-by-patt dbstruct params))
+    ((get-run-name-from-id)         (apply db:get-run-name-from-id dbstruct params))
+    ((get-main-run-stats)           (apply db:get-main-run-stats dbstruct params))
+    ((get-var)                      (apply db:get-var dbstruct params))
+    ((get-run-stats)                (apply db:get-run-stats dbstruct params))
+    ((get-run-times)                (apply db:get-run-times dbstruct params)) 
+
+    ;; STEPS
+    ((get-steps-data)               (apply db:get-steps-data dbstruct params))
+    ((get-steps-for-test)           (apply db:get-steps-for-test dbstruct params))
+    ((get-steps-info-by-id)         (apply db:get-steps-info-by-id dbstruct params))
+
+    ;; TEST DATA
+    ((read-test-data)               (apply db:read-test-data dbstruct params))
+    ((read-test-data-varpatt)       (apply db:read-test-data-varpatt dbstruct params))
+    ((get-data-info-by-id)          (apply db:get-data-info-by-id dbstruct params)) 
+
+    ;; MISC
+    ((get-latest-host-load)         (apply db:get-latest-host-load dbstruct params))
+    ((have-incompletes?)            (apply db:have-incompletes? dbstruct params))
+    ((login)                        (apply db:login dbstruct params))
+    ((general-call)                 (let ((stmtname   (car params))
+                                          (run-id     (cadr params))
+                                          (realparams (cddr params)))
+                                      (db:general-call dbstruct run-id stmtname realparams)))
+    ((sdb-qry)                      (apply sdb:qry params))
+    ((ping)                         (current-process-id))
+    ((get-changed-record-ids)       (apply db:get-changed-record-ids dbstruct params))
+    ((get-run-record-ids) 	   (apply db:get-run-record-ids dbstruct params))	
+    ;; TESTMETA
+    ((testmeta-get-record)       (apply db:testmeta-get-record dbstruct params))
+
+    ;; TASKS 
+    ((find-task-queue-records)   (apply tasks:find-task-queue-records dbstruct params))
+    (else
+     (debug:print 0 *default-log-port* "ERROR: bad api call " cmd)
+     (conc "ERROR: BAD api call " cmd))))
 
 ;; http-server  send-response
 ;;                 api:process-request
 ;;                    db:*
 ;;

Index: apimod.scm
==================================================================
--- apimod.scm
+++ apimod.scm
@@ -18,17 +18,15 @@
 
 ;;======================================================================
 
 (declare (unit apimod))
 (declare (uses commonmod))
-(declare (uses ulex))
 
 (module apimod
 	*
 	
 (import scheme chicken data-structures extras)
 (import (prefix sqlite3 sqlite3:) posix typed-records srfi-18)
 (import commonmod)
-(import (prefix ulex ulex:))
 
 
 )

Index: archive.scm
==================================================================
--- archive.scm
+++ archive.scm
@@ -16,15 +16,25 @@
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 ;;
 
 ;;  strftime('%m/%d/%Y %H:%M:%S','now','localtime')
 
-(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 format md5 message-digest srfi-18)
-
 (declare (unit archive))
 (declare (uses db))
+(declare (uses debugprint))
+(declare (uses mtargs))
 (declare (uses common))
+(declare (uses commonmod))
+(declare (uses rmtmod))
+
+(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69
+     format md5 message-digest srfi-18)
+
+(import commonmod
+	debugprint
+	rmtmod
+	(prefix mtargs args:))
 
 (include "common_records.scm")
 (include "db_records.scm")
 
 ;;======================================================================

ADDED   artifacts.scm
Index: artifacts.scm
==================================================================
--- /dev/null
+++ artifacts.scm
@@ -0,0 +1,24 @@
+;;======================================================================
+;; Copyright 2019, Matthew Welland.
+;; 
+;; This file is part of Megatest.
+;; 
+;;     Megatest is free software: you can redistribute it and/or modify
+;;     it under the terms of the GNU General Public License as published by
+;;     the Free Software Foundation, either version 3 of the License, or
+;;     (at your option) any later version.
+;; 
+;;     Megatest is distributed in the hope that it will be useful,
+;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;     GNU General Public License for more details.
+;; 
+;;     You should have received a copy of the GNU General Public License
+;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+
+;;======================================================================
+
+(declare (unit artifacts))
+
+(include "artifacts/artifacts.scm")
+

ADDED   artifacts/README
Index: artifacts/README
==================================================================
--- /dev/null
+++ artifacts/README
@@ -0,0 +1,1 @@
+NOTE: keep megatest/artifacts/ in sync with datastore/artifacts

ADDED   artifacts/artifacts.meta
Index: artifacts/artifacts.meta
==================================================================
--- /dev/null
+++ artifacts/artifacts.meta
@@ -0,0 +1,21 @@
+;; -*- scheme -*-
+(
+; Your egg's license:
+(license "BSD")
+
+; Pick one from the list of categories (see below) for your egg and enter it
+; here.
+(category db)
+
+; A list of eggs pkts depends on.  If none, you can omit this declaration
+; altogether. If you are making an egg for chicken 3 and you need to use
+; procedures from the `files' unit, be sure to include the `files' egg in the
+; `needs' section (chicken versions < 3.4.0 don't provide the `files' unit).
+; `depends' is an alias to `needs'.
+;; (needs (autoload "3.0"))
+
+; A list of eggs required for TESTING ONLY.  See the `Tests' section.
+(test-depends test)
+
+(author "Matt Welland")
+(synopsis "A sha1-chain based datastore similar to the data format in fossil scm, consisting of artifacts of single line cards."))

ADDED   artifacts/artifacts.release-info
Index: artifacts/artifacts.release-info
==================================================================
--- /dev/null
+++ artifacts/artifacts.release-info
@@ -0,0 +1,3 @@
+(repo fossil "http://www.kiatoa.com/cgi-bin/fossils/{egg-name}")
+(uri zip "http://www.kiatoa.com/cgi-bin/fossils/{egg-name}/zip/{egg-name}.zip?uuid={egg-release}")
+(release "1.0")

ADDED   artifacts/artifacts.scm
Index: artifacts/artifacts.scm
==================================================================
--- /dev/null
+++ artifacts/artifacts.scm
@@ -0,0 +1,1624 @@
+;; Copyright 2006-2017, Matthew Welland.
+;; 
+;; This file is part of artifacts
+;; 
+;;     Pkts is free software: you can redistribute it and/or modify
+;;     it under the terms of the GNU General Public License as published by
+;;     the Free Software Foundation, either version 3 of the License, or
+;;     (at your option) any later version.
+;; 
+;;     Pkts is distributed in the hope that it will be useful,
+;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;     GNU General Public License for more details.
+;; 
+;;     You should have received a copy of the GNU General Public License
+;;     along with Pkts.  If not, see <http://www.gnu.org/licenses/>.
+;;
+
+;; CARDS:
+;;
+;; A card is a line of text, the first two characters are a letter followed by a
+;;   space. The letter is the card type.
+;;
+;; artifact:
+;;
+;; An artifact is a sorted list of cards with a final card Z that contains the shar1 hash
+;;   of all of the preceding cards.
+;;
+;; AARTIFACT:
+;;
+;;  An alist mapping card types to card data
+;;      '((T . "artifacttype")
+;;        (a . "some content"))
+;;
+;; EARTIFACT:
+;;
+;;  Extended packet using friendly keys. Must use a artifactspec to convert to/from eartifacts
+;;    '((ptype . "artifacttype")
+;;      (adata . "some content))
+;;
+;; DARTIFACT:
+;;
+;; artifacts pulled from the database have this format:
+;;
+;;((aartifact (Z . "7de89c09ac024b3832c93e16cd78d11e2e28733b")     <= this is a the alist
+;;       (t . "v1.63/tip/dev")
+;;       (c . "QUICKPATT")
+;;       (T . "runstart")
+;;       (P . "354eeb67120a921e3e33155ecab1b522ac10b6bd")
+;;       (D . "1488995096.0"))
+;;  (id . 8)
+;;  (group-id . 0)
+;;  (uuid . "7de89c09ac024b3832c93e16cd78d11e2e28733b")
+;;  (parent . "")
+;;  (artifact-type . "runstart")
+;;  (artifact . "D 1488995096.0\nP 354eeb67120a921e3e33155ecab1b522ac10b6bd\nT runstart\nc QUICKPATT\nt v1.63/tip/dev\nZ 7de89c09ac024b3832c93e16cd78d11e2e28733b"))
+;;
+;; artifactspec is alist of alists mapping types and nicekeys to keys
+;;
+;; '((posting . ((title . t)
+;;               (url   . u)
+;;               (blurb . b)))
+;;   (comment . ((comment . c)
+;;               (score   . s))))
+
+;; Reserved cards:
+;;   P      : artifact parent
+;;   R      : reference artifact containing mapping of short string -> sha1sum strings
+;;   T      : artifact type
+;;   D      : current time from (current-time), unless provided
+;;   Z      : shar1 hash of the packet
+
+;; Example usage:
+;;
+;; Create a artifact:
+;;
+;; (use artifacts)
+;; (define-values (uuid artifact)
+;;     (alist->artifact
+;;       '((fruit . "apple") (meat . "beef"))  ;; this is the data to convert
+;;       '((foods (fruit . f) (meat . m)))     ;; this is the artifact spec
+;;       ptype:
+;;       'foods))
+;;
+;; Add to artifact queue:
+;;
+;; (define db (open-queue-db "/tmp/artifacts" "artifacts.db"))
+;; (add-to-queue db artifact uuid 'foods #f 0) ;; no parent and use group_id of 0
+;;
+;; Retrieve the packet from the db and extract a value:
+;;
+;; (alist-ref
+;;    'meat
+;;    (dartifact->alist
+;;         (car (get-dartifacts db #f 0 #f))
+;;        '((foods (fruit . f)
+;;                 (meat . m)))))
+;; => "beef"
+;;
+
+(module artifacts
+(
+;; cards, util and misc
+;; sort-cards
+;; calc-sha1
+;;
+;; low-level constructor procs, exposed only for development/testing, will be removed
+construct-sdat
+construct-artifact     
+card->type/value  
+add-z-card
+
+;; queue database procs
+open-queue-db
+add-to-queue
+create-and-queue
+;; lookup-by-uuid
+lookup-by-id
+get-dartifacts
+get-not-processed-artifacts
+get-related
+find-artifacts
+process-artifacts
+get-descendents
+get-ancestors
+get-artifacts
+;; get-last-descendent
+;; with-queue-db
+;; load-artifacts-to-db
+
+;; procs that operate directly on artifacts, sdat, aartifacts, dartifacts etc.
+artifact->alist    ;; artifact -> aartifact (i.e. alist)
+artifact->sdat     ;; artifact -> '("a aval" "b bval" ...)
+sdat->alist   ;; '("a aval" "b bval"...) -> ((a . "aval")(b . "bval") ...)
+dblst->dartifacts  ;; convert list of tuples from queue db into dartifacts
+dartifact->alist   ;; flatten a dartifact into an alist containing all db fields and the artifact alist
+dartifacts->alists ;; apply dartifact->alist to a list of alists using a artifact-spec
+alist->artifact    ;; returns two values uuid, artifact
+get-value     ;; looks up a value given a key in a dartifact
+flatten-all   ;; merge the list of values from a query which includes a artifact into a flat alist <== really useful!
+check-artifact
+
+;; artifact alists
+write-alist->artifact
+read-artifact->alist
+
+;; archive database
+;; archive-open-db
+;; write-archive-artifacts
+;; archive-artifacts
+;; mark-processed
+
+;; artifactsdb
+artifactdb-conn     ;; useful
+artifactdb-fname
+artifactsdb-open
+artifactsdb-close
+artifactsdb-add-record
+;; temporary
+artifactdb-artifactspec
+
+;; utility procs
+increment-string ;; used to get indexes for strings in ref artifacts
+make-report      ;; make a .dot file
+calc-sha1
+uuid-first-two-letters
+uuid-remaining-letters
+
+;; file and directory utils
+multi-glob
+capture-dir
+file-get-sha1
+check-same
+link-or-copy
+same-partition?
+link-if-same-partition
+archive-copy
+write-to-archive
+artifact-rollup
+read-artifacts-into-hash
+hash-of-artifacts->bundle
+archive-dest
+
+;; pathname-full-filename
+
+;; minimal artifact functions
+minimal-artifact-read
+minimal-artifact->alist
+afact-get-D
+afact-get-Z
+afact-get-T
+afact-get
+afact-get-number/default
+
+
+;; bundles
+write-bundle
+read-bundle
+
+;; new artifacts db
+with-todays-adb
+get-all-artifacts
+refresh-artifacts-db
+
+)
+
+(import (chicken base) scheme (chicken process) (chicken time posix)
+	(chicken io) (chicken file) (chicken pathname)
+        chicken.process-context.posix (chicken string)
+	(chicken time) (chicken sort) (chicken file posix) (chicken condition) srfi-1
+	regex srfi-13 srfi-69 (chicken port) (chicken process-context)
+	crypt sha1 matchable message-digest sqlite3 typed-records
+	directory-utils
+	scsh-process)
+
+;;======================================================================
+;; DATA MANIPULATION UTILS
+;;======================================================================
+
+(define-inline (unescape-data data)
+  (string-translate* data '(("\\n" . "\n") ("\\\\" . "\\"))))
+
+(define-inline (escape-data data)
+  (string-translate* data '(("\n" . "\\n") ("\\" . "\\\\"))))
+
+(define-inline (make-card type data)
+  (conc type " " (escape-data (->string data))))
+
+;; reverse an alist for doing artifactkey -> external key conversions
+;;
+(define-inline (reverse-aspec aspec)
+  (map (lambda (dat)
+	 (cons (cdr dat)(car dat)))
+       aspec))
+
+;; add a card to the list of cards, sdat
+;; if type is #f return only sdat
+;; if data is #f return only sdat
+;;
+(define-inline (add-card sdat type data)
+  (if (and type data)
+      (cons (make-card type data) sdat)
+      sdat))
+
+;;======================================================================
+;; STRING AS FUNKY NUMBER
+;;======================================================================
+
+;; NOTE: PTDZ are removed as they are reserved. NB// the R card is not used in a
+;;       ref, instead the P parent card is used.
+;;       Question: Why does it matter to remove PTDZ?
+;;                 To make the ref easier to use the ref strings will be the keys
+;;                 so we cannot have overlap with any actual keys. But this is a
+;;                 bit silly. What we need to do instead is reject keys of length
+;;                 one where the char is in PTDZ
+;;
+;; This is basically base92
+;;
+(define string-num-chars (string->list "!#$%&'()*+,-./0123456789:;<=>?@ABCEFGHIJKLMNOQRSUVWXY[\\]^_abcdefghijklmnopqrstuvwxyz{|}~"))
+;; "0123456789abcdefghijklmnopqrstuvwxyzABCEFGHIJKLMNOQSUVWXY!#$%&'()*+,-./[]:;<=>?\\^_{}|"))
+
+(define (char-incr inchar)
+  (let* ((carry     #f)
+	 (next-char (let ((rem (member inchar string-num-chars)))
+		      (if (eq? (length rem) 1) ;; we are at the last character in our string-num-chars list
+			  (begin
+			    (set! carry #t)
+			    (car string-num-chars))
+			  (cadr rem)))))
+    (values next-char carry)))
+    
+(define (increment-string str)
+  (if (string-null? str)
+      "0"
+      (let ((strlst (reverse (string->list str)))) ;; need to process the string from the lsd
+	(list->string
+	 (let loop ((hed (car strlst))
+		    (tal (cdr strlst))
+		    (res '()))
+	   (let-values (((newhed carry)(char-incr hed)))
+	     ;; (print "newhed: " newhed " carry: " carry " tal: " tal)
+	     (let ((newres (cons newhed res)))
+	       (if carry ;; we'll have to propagate the carry
+		   (if (null? tal) ;; at the end, tack on "0" (which is really a "1")
+		       (cons (car string-num-chars) newres)
+		       (loop (car tal)(cdr tal) newres))
+		   (append (reverse tal) newres)))))))))
+    
+;;======================================================================
+;; P K T S D B   I N T E R F A C E
+;;
+;; INTEGER, REAL, TEXT
+;;======================================================================
+;;
+;; spec
+;;  ( (tablename1 . (field1name L1 TYPE)
+;;                  (field2name L2 TYPE) ... )
+;;    (tablename2 ... ))
+;;
+;;  Example: (tests (testname n TEXT)
+;;                  (rundir   r TEXT)
+;;                   ... )
+;;
+;; artifact keys are taken from the first letter, if that is not unique
+;; then look at the next letter and so on
+;;
+
+;; simplify frequent need to get one result with default
+;;
+(define (get-one db default qry . params)
+  (apply fold-row
+   car
+   default
+   db
+   qry
+   params))
+
+(define (get-rows db qry . params)
+  (apply fold-row
+   cons
+   db
+   qry
+   params))
+  
+;; use this struct to hold the artifactspec and the db handle
+;;
+(defstruct artifactdb
+  (fname       #f)
+  (artifactsdb-spec #f)
+  (artifactspec     #f)  ;; cache the artifactspec
+  (field-keys  #f)  ;; cache the field->key mapping (field1 . k1) ...
+  (key-fields  #f)  ;; cache the key->field mapping
+  (conn        #f)
+  )
+
+;; WARNING: There is a simplification in the artifactsdb spec w.r.t. artifactspec.
+;;          The field specs are the cdr of the table list - not a full
+;;          list. The extra list level in artifactspec is gratuitous and should
+;;          be removed.
+;;
+(define (artifactsdb-spec->artifactspec tables-spec)
+  (map (lambda (tablespec)
+	 (list (car tablespec)
+	       (map (lambda (field-spec)
+		      (cons (car field-spec)(cadr field-spec)))
+		    (cdr tablespec))))
+       tables-spec))
+
+(define (artifactsdb-open dbfname artifactsdb-spec)
+  (let* ((pdb      (make-artifactdb))
+	 (dbexists (file-exists? dbfname))
+	 (db       (open-database dbfname)))
+    (artifactdb-artifactsdb-spec-set! pdb artifactsdb-spec)
+    (artifactdb-artifactspec-set!     pdb (artifactsdb-spec->artifactspec artifactsdb-spec))
+    (artifactdb-fname-set!       pdb dbfname)
+    (artifactdb-conn-set!        pdb db)
+    (if (not dbexists)
+	(artifactsdb-init pdb))
+    pdb))
+
+(define (artifactsdb-init artifactsdb)
+  (let* ((db          (artifactdb-conn artifactsdb))
+	 (artifactsdb-spec (artifactdb-artifactsdb-spec artifactsdb)))
+    ;; create a table for the artifacts themselves
+    (execute db "CREATE TABLE IF NOT EXISTS artifacts (id INTEGER PRIMARY KEY, zkey TEXT, record_id INTEGER, artifact TEXT);")
+    (for-each
+     (lambda (table)
+       (let* ((table-name (car table))
+	      (fields     (cdr table))
+	      (stmt (conc "CREATE TABLE IF NOT EXISTS "
+			  table-name
+			  " (id INTEGER PRIMARY KEY,"
+			  (string-intersperse
+			   (map (lambda (fieldspec)
+				  (conc (car fieldspec) " "
+					(caddr fieldspec)))
+				fields)
+			   ",")
+			  ");")))
+	 (execute db stmt)))
+     artifactsdb-spec)))
+
+;; create artifact from the data and insert into artifacts table
+;; 
+;; data is assoc list of (field . value) ...
+;; tablename is a symbol matching the table name
+;;
+(define (artifactsdb-add-record artifactsdb tablename data #!optional (parent #f))
+  (let*-values (((zkey artifact) (alist->artifact data (artifactdb-artifactspec artifactsdb) ptype: tablename)))
+    ;; have the data as alist so insert it into appropriate table also
+    (let* ((db        (artifactdb-conn artifactsdb)))
+      ;; TODO: Address collisions
+      (execute db "INSERT INTO artifacts (zkey,artifact,record_id) VALUES (?,?,?);"
+		zkey artifact -1)
+      (let* (;; (artifactid     (artifactsdb-artifactkey->artifactid artifactsdb artifactkey))
+	     (record-id (artifactsdb-insert artifactsdb tablename data)))
+	(execute db "UPDATE artifacts SET record_id=? WHERE zkey=?;"
+		  record-id zkey)
+      ))))
+
+;; 
+(define (artifactsdb-insert artifactsdb tablename data)
+  (let* ((db (artifactdb-conn artifactsdb))
+	 (stmt (conc "INSERT INTO " tablename
+		     " (" (string-intersperse (map conc (map car data)) ",")
+		     ") VALUES ('"
+		     ;; TODO: Add lookup of data type and do not
+		     ;;       wrap integers with quotes
+		     (string-intersperse (map conc (map cdr data)) "','")
+		     "');")))
+    (print "stmt: " stmt)
+    (execute db stmt)
+    ;; lookup the record-id and return it
+    
+    ))
+    
+(define (artifactsdb-close artifactsdb)
+  (finalize! (artifactdb-conn artifactsdb)))
+
+;; (let loop ((s "0")(n 0))(print s)(if (< n 5000)(loop (increment-string s)(+ n 1))))
+
+;;======================================================================
+;; CARDS, MISC and UTIL
+;;======================================================================
+
+;; given string (likely multi-line) "dat" return shar1 hash
+;;
+(define (calc-sha1 instr)
+  (message-digest-string
+   (sha1-primitive)
+   instr))
+
+;; given a single card return its type and value
+;;
+(define (card->type/value card)
+  (let ((ctype (substring card 0 1))
+	(cval  (substring card 2 (string-length card))))
+    (values (string->symbol ctype) cval)))
+
+;;======================================================================
+;; SDAT procs
+;;  sdat is legacy/internal usage. Intention is to remove sdat calls from
+;;  the exposed calls.
+;;======================================================================
+
+;; sort list of cards
+;;
+(define-inline (sort-cards sdat)
+  (sort sdat string<=?))
+
+;; artifact rules
+;; 1. one card per line
+;; 2. at least one card
+;; 3. no blank lines
+
+;; given sdat, a list of cards return uuid, packet (as sdat)
+;;
+(define (add-z-card sdat)
+  (let* ((sorted-sdat (sort-cards sdat))
+	 (dat         (string-intersperse sorted-sdat "\n"))
+	 (uuid        (calc-sha1 dat)))
+    (values
+     uuid
+     (conc
+      dat
+      "\nZ "
+      uuid))))
+
+(define (check-artifact artifact)
+  (handle-exceptions
+      exn
+      #f ;; anything goes wrong - call it a crappy artifact
+    (let* ((sdat (string-split artifact "\n"))
+	   (rdat (reverse sdat)) ;; reversed
+	   (zdat (car rdat))
+	   (Z    (cadr (string-split zdat)))
+	   (cdat (string-intersperse (reverse (cdr rdat)) "\n")))
+      (equal? Z (calc-sha1 cdat)))))
+
+;;======================================================================
+;; AARTIFACTs
+;;======================================================================
+
+;; convert a sdat (list of cards) to an alist
+;;
+(define (sdat->alist sdat)
+  (let loop ((hed (car sdat))
+	     (tal (cdr sdat))
+	     (res '()))
+    (let-values (( (ctype cval)(card->type/value hed) ))
+      ;; if this card is not one of the common ones tack it on to rem
+      (let* ((oldval (alist-ref ctype res))
+	     (newres (cons (cons ctype
+				 (if oldval ;; list or string
+				     (if (list? oldval)
+					 (cons cval oldval)
+					 (cons cval (list oldval)))
+				     cval))
+			   res)))
+	(if (null? tal)
+	    newres
+	    (loop (car tal)(cdr tal) newres))))))
+
+;;((aartifact (Z . "7de89c09ac024b3832c93e16cd78d11e2e28733b")     <= this is a the alist
+;;       (t . "v1.63/tip/dev")
+;;       (c . "QUICKPATT")
+;;       (T . "runstart")
+;;       (P . "354eeb67120a921e3e33155ecab1b522ac10b6bd")
+;;       (D . "1488995096.0"))
+;;  (id . 8)
+;;  (group-id . 0)
+;;  (uuid . "7de89c09ac024b3832c93e16cd78d11e2e28733b")
+;;  (parent . "")
+;;  (artifact-type . "runstart")
+;;  (artifact . "D 1488995096.0\nP 354eeb67120a921e3e33155ecab1b522ac10b6bd\nT runstart\nc QUICKPATT\nt v1.63/tip/dev\nZ 7de89c09ac024b3832c93e16cd78d11e2e28733b"))
+;;
+;; artifactspec is alist of alists mapping types and nicekeys to keys
+;;
+;; '((posting . ((title . t)
+;;               (url   . u)
+;;               (blurb . b)))
+;;   (comment . ((comment . c)
+;;               (score   . s))))
+
+;; DON'T USE? 
+;;
+(define (get-value field dartifact . spec-in)
+  (if (null? spec-in)
+      (alist-ref field dartifact)
+      (let* ((spec  (car spec-in))
+	     (aartifact  (alist-ref 'aartifact dartifact))) ;; get the artifact alist
+	(if (and aartifact spec)
+	    (let* ((ptype (alist-ref 'artifact-type dartifact))
+		   (pspec (alist-ref (string->symbol ptype) spec))) ;; do we have a spec for this type of artifact
+	      (and pspec
+		  (let* ((key (alist-ref field pspec)))
+		    (and key (alist-ref key aartifact)))))
+	    #f))))
+
+;; convert a dartifact to a pure alist given a artifactspec
+;; this flattens out the alist to include the data from
+;; the queue database record
+;;
+(define (dartifact->alist dartifact artifactspec)
+  (let* ((aartifact       (alist-ref 'aartifact dartifact))
+	 (artifact-type   (or (alist-ref 'artifact-type dartifact) ;; artifact-type is from the database field artifact_type
+			 (alist-ref 'T aartifact)))
+	 (artifact-fields (alist-ref (string->symbol artifact-type) artifactspec))
+	 (rev-fields (if artifact-fields
+			 (reverse-aspec artifact-fields)
+			 '())))
+    (append (map (lambda (entry)
+		   (let* ((artifact-key (car entry))
+			  (new-key (or (alist-ref artifact-key rev-fields) artifact-key)))
+		     `(,new-key . ,(cdr entry))))
+		 aartifact)
+	    dartifact)))
+
+;; convert a list of dartifacts into a list of alists using artifact-spec
+;;
+(define (dartifacts->alists dartifacts artifact-spec)
+   (map (lambda (x)
+	  (dartifact->alist x artifact-spec))
+	dartifacts))
+
+;; Generic flattener, make the tuple and artifact into a single flat alist
+;;
+;; qry-result-spec is a list of symbols corresponding to each field
+;;
+(define (flatten-all inlst artifactspec . qry-result-spec)
+  (map
+   (lambda (tuple)
+     (dartifact->alist
+      (apply dblst->dartifacts tuple qry-result-spec)
+      artifactspec))
+   inlst))
+
+;; call like this:
+;;  (construct-sdat 'a "a data" 'S "S data" ...)
+;; returns list of cards
+;;  ( "A a value" "D 12345678900" ...)
+;;
+(define (construct-sdat . alldat)
+  (let ((have-D-card #f)) ;; flag
+    (if (even? (length alldat))
+	(let loop ((type (car alldat))
+		   (data (cadr alldat))
+		   (tail (cddr alldat))
+		   (res  '()))
+	  (if (eq? type 'D)(set! have-D-card #t))
+	  (if (null? tail)
+	      (if have-D-card ;; return the constructed artifact, add a D card if none found
+		  (add-card res type data)
+		  (add-card 
+		   (add-card res 'D (current-seconds))
+		   type data))
+	      (loop (car tail)
+		    (cadr tail)
+		    (cddr tail)
+		    (add-card res type data))))
+	#f))) ;; #f means it failed to create the sdat
+
+(define (construct-artifact . alldat)
+  (add-z-card
+   (apply construct-sdat alldat)))
+
+;;======================================================================
+;; CONVERTERS
+;;======================================================================
+
+(define (artifact->sdat artifact)
+  (map unescape-data (string-split artifact "\n")))
+
+;; given a pure artifact return an alist
+;;
+(define (artifact->alist artifact #!key (artifactspec #f))
+  (let ((sdat (cond
+	       ((string? artifact)  (artifact->sdat artifact))
+	       ((list? artifact)    artifact)
+	       (else #f))))
+    (if artifact
+	(if artifactspec
+	    (dartifact->alist (list (cons 'aartifact (sdat->alist sdat))) artifactspec)
+	    (sdat->alist sdat))
+	#f)))
+
+;; convert an alist to an sdat
+;;  in: '((a . "blah")(b . "foo"))
+;; out: '("a blah" "b foo")
+;;
+(define (alist->sdat adat)
+  (map (lambda (dat)
+	 (conc (car dat) " " (cdr dat)))
+       adat))
+
+;; adat is the incoming alist, aspec is the mapping
+;; from incoming key to the artifact key (usually one
+;; letter to keep data tight) see the artifactspec at the
+;; top of this file
+;;
+;; NOTE: alists can contain multiple instances of the same key (supported fine by artifacts)
+;;       but you (obviously I suppose) cannot use alist-ref to access those entries.
+;;
+(define (alist->artifact adat aspec #!key (ptype #f)(no-d #f))
+  (let* ((artifact-type (or ptype
+			    (alist-ref 'T adat) ;; can provide in the incoming alist
+			    #f))
+	 (artifact-spec (if artifact-type            ;; alist of external-key -> key
+			    (or (alist-ref artifact-type aspec) '())
+			    (if (null? aspec)
+				'()
+				(cdar aspec)))) ;; default to first one if nothing specified
+	 (new-alist (map (lambda (dat)
+			   (let* ((key    (car dat))
+				  (val    (cdr dat))
+				  (newkey (or (alist-ref key artifact-spec)
+					      key)))
+			     (cons newkey (escape-data (conc val))))) ;; convert all incoming data (symbols, numbers etc.) to a string and then escape newlines.
+			 adat))
+	 (new-with-type (if (alist-ref 'T new-alist)
+			    new-alist
+			    (cons `(T . ,artifact-type) new-alist)))
+	 (with-d-card   (if (or no-d ;; no timestamp wanted
+				(alist-ref 'D new-with-type))
+			    new-with-type
+			    (cons `(D . ,(current-seconds))
+				  new-with-type))))
+    (add-z-card
+     (alist->sdat with-d-card))))
+
+;;======================================================================
+;;  D B   Q U E U E   I N T E R F A C E
+;;======================================================================
+
+;; artifacts (
+;;   id SERIAL PRIMARY KEY,
+;;   uuid TEXT NOT NULL,
+;;   parent_uuid TEXT default '',
+;;   artifact_type INTEGER DEFAULT 0,
+;;   group_id INTEGER NOT NULL,
+;;   artifact TEXT NOT NULL
+
+;; schema is list of SQL statements - can be used to extend db with more tables
+;;
+(define (open-queue-db dbpath dbfile #!key (schema '()))
+  (let* ((dbfname  (conc dbpath "/" dbfile))
+	 (dbexists (if (file-exists? dbfname) #t (begin (create-directory dbpath #t) #f)))
+	 (db       (open-database dbfname)))
+    ;; (set-busy-handler! (dbi:db-conn db) (busy-timeout 10000))
+    (if (not dbexists) ;; NOTE: In the archive we allow duplicates and other messiness. 
+	(for-each
+	 (lambda (stmt)
+	   (execute db stmt))
+	 (cons "CREATE TABLE IF NOT EXISTS artifacts
+                          (id           INTEGER PRIMARY KEY,
+                           group_id     INTEGER NOT NULL,
+                           uuid         TEXT NOT NULL,
+                           parent_uuid  TEXT TEXT DEFAULT '',
+                           artifact_type     TEXT NOT NULL,
+                           artifact          TEXT NOT NULL,
+                           processed    INTEGER DEFAULT 0)"
+		   schema))) ;; 0=not processed, 1=processed, 2... for expansion
+    db))
+
+(define (add-to-queue db artifact uuid artifact-type parent-uuid group-id)
+  (execute db "INSERT INTO artifacts (uuid,parent_uuid,artifact_type,artifact,group_id)
+                   VALUES(?,?,?,?,?);" ;; $1,$2,$3,$4,$5);"
+	    uuid
+	    (if parent-uuid parent-uuid "");; use null string as placeholder for no parent uuid.
+	    (if artifact-type (conc artifact-type) "") 
+	    artifact
+	    group-id))
+
+;; given all needed parameters create a artifact and store it in the queue
+;;  procs is an alist that maps artifact-type to a function that takes a list of artifact params
+;;  in data and returns the uuid and artifact
+;;
+(define (create-and-queue conn procs artifact-type parent-uuid group-id data)
+  (let ((proc (alist-ref artifact-type procs)))
+    (if proc
+	(let-values (( (uuid artifact) (proc data) ))
+	  (add-to-queue conn artifact uuid artifact-type parent-uuid group-id)
+	  uuid)
+	#f)))
+
+;; given uuid get artifact, if group-id is specified use it (reduces probablity of
+;;     being messed up by a uuid collision)
+;;
+(define (lookup-by-uuid db artifact-uuid group-id)
+  (if group-id
+      (get-one db "SELECT artifact FROM artifacts WHERE group_id=? AND uuid=?;" group-id artifact-uuid)
+      (get-one db "SELECT artifact FROM artifacts WHERE uuid=?;" artifact-uuid)))
+      
+;; find a packet by its id
+;;
+(define (lookup-by-id db id)
+  (get-one db "SELECT artifact FROM artifacts WHERE id=?;" id))
+
+
+;;======================================================================
+;;  P R O C E S S   P K T S
+;;======================================================================
+
+;; given a list of field values pulled from the queue db generate a list
+;; of dartifact's
+;;
+(define (dblst->dartifacts lst . altmap)
+  (let* ((maplst (if (null? altmap)
+		     '(id group-id uuid parent artifact-type artifact processed)
+		     altmap))
+	 (res (map cons maplst lst))) ;; produces list of pairs, i.e an alist
+    (cons `(aartifact . ,(artifact->alist (alist-ref 'artifact res)))
+	  res)))
+
+;; NB// ptypes is a list of symbols, '() or #f find all types
+;;
+(define (get-dartifacts db ptypes group-id parent-uuid #!key (uuid #f))
+  (let* ((ptype-qry (if (and ptypes
+			     (not (null? ptypes)))
+			(conc " IN ('" (string-intersperse (map conc ptypes) "','") "')")
+			(conc " LIKE '%' ")))
+	 (rows      (get-rows
+		     db
+		     (conc
+		      "SELECT id,group_id,uuid,parent_uuid,artifact_type,artifact,processed FROM artifacts
+                         WHERE artifact_type " ptype-qry " AND group_id=?
+                         AND processed=0 "
+			 (if parent-uuid (conc "AND parent_uuid='" parent-uuid "' ") "")
+			 (if uuid        (conc "AND        uuid='"        uuid "' ") "")
+			 "ORDER BY id DESC;")
+		     group-id)))
+    (map dblst->dartifacts (map vector->list rows))))
+
+;; get N artifacts not yet processed for group-id
+;;
+(define (get-not-processed-artifacts db group-id artifact-type limit offset)
+  (map dblst->dartifacts
+       (map vector->list
+	    (get-rows
+	     db
+	     "SELECT id,group_id,uuid,parent_uuid,artifact_type,artifact,processed FROM artifacts
+                WHERE artifact_type = ? AND group_id = ? AND processed=0
+                LIMIT ? OFFSET ?;"
+	     (conc artifact-type) ;; convert symbols to string
+	     group-id
+	     limit
+	     offset
+	     ))))
+
+;; given a uuid, get not processed child artifacts 
+;;
+(define (get-related db group-id uuid)
+  (map dblst->dartifacts
+       (get-rows
+	db
+	"SELECT id,group_id,uuid,parent_uuid,artifact_type,artifact,processed FROM artifacts
+           WHERE parent_uuid=? AND group_id=? AND processed=0;"
+	uuid group-id)))
+
+;; generic artifact processor
+;;
+;; find all packets in group-id of type in ptypes and apply proc to artifactdat
+;;
+(define (process-artifacts conn group-id ptypes parent-uuid proc)
+  (let* ((artifacts (get-dartifacts conn ptypes group-id parent-uuid)))
+    (map proc artifacts)))
+
+;; criteria is an alist ((k . valpatt) ...)
+;;   - valpatt is a regex
+;;   - ptypes is a list of types (symbols expected)
+;;   match-type: 'any or 'all
+;;
+(define (find-artifacts db ptypes criteria #!key (processed #f)(match-type 'any)(artifact-spec #f)) ;; processed=#f, don't use, else use
+  (let* ((artifacts (get-dartifacts db ptypes 0 #f))
+	 (match-rules (lambda (artifactdat) ;; returns a list of matching rules
+			(filter (lambda (c)
+				  ;; (print "c: " c)
+				  (let* ((ctype (car c)) ;; card type
+					 (rx    (cdr c)) ;; card pattern
+					 ;; (t     (alist-ref 'artifact-type artifactdat))
+					 (artifact   (alist-ref 'artifact artifactdat))
+					 (aartifact  (artifact->alist artifact))
+					 (cdat  (alist-ref ctype aartifact)))
+				    ;; (print "cdat: " cdat) ;; " aartifact: " aartifact)
+				    (if cdat
+					(string-match rx cdat)
+					#f)))
+				criteria)))
+	 (res         (filter (lambda (artifactdat)
+				(if (null? criteria) ;; looking for all artifacts
+				    #t
+				    (case match-type
+				      ((any)(not (null? (match-rules artifactdat))))
+				      ((all)(eq? (length (match-rules artifactdat))(length criteria)))
+				      (else
+				       (print "ERROR: bad match type " match-type ", expecting any or all.")))))
+			      artifacts)))
+    (if artifact-spec
+	(dartifacts->alists res artifact-spec)
+	res)))
+
+;; get descendents of parent-uuid
+;;
+;; NOTE: Should be doing something like the following:
+;;
+;; given a uuid, get not processed child artifacts 
+;; processed:
+;;    #f => get all
+;;     0 => get not processed
+;;     1 => get processed
+;;
+(define (get-ancestors db group-id uuid #!key (processed #f))
+  (map dblst->dartifacts
+       (map vector->list
+	    (get-rows
+	     db
+	     (conc
+	      "SELECT id,group_id,uuid,parent_uuid,artifact_type,artifact,processed 
+                FROM artifacts
+                 WHERE uuid IN 
+                     (WITH RECURSIVE
+                       tree(uuid,parent_uuid)
+                        AS
+                        (
+                           SELECT uuid, parent_uuid
+                           FROM artifacts
+                           WHERE uuid = ?
+                           UNION ALL
+                           SELECT t.uuid, t.parent_uuid
+                           FROM artifacts t
+                           JOIN tree ON t.uuid = tree.parent_uuid
+                        )
+	              SELECT uuid FROM tree)
+	    AND group_id=?" (if processed (conc " AND processed=" processed) "") ";")
+	     uuid group-id))))
+
+;; Untested
+;;
+(define (get-descendents db group-id uuid #!key (processed #f))
+  (map dblst->dartifacts
+       (map vector->list
+	    (get-rows
+	     db
+	     (conc
+	      "SELECT id,group_id,uuid,parent_uuid,artifact_type,artifact,processed 
+                FROM artifacts
+                 WHERE uuid IN 
+                     (WITH RECURSIVE
+                       tree(uuid,parent_uuid)
+                        AS
+                        (
+                           SELECT uuid, parent_uuid
+                           FROM artifacts
+                           WHERE uuid = ?
+                           UNION ALL
+                           SELECT t.uuid, t.parent_uuid
+                           FROM artifacts t
+                           JOIN tree ON t.parent_uuid = tree.uuid
+                        )
+	              SELECT uuid FROM tree)
+	    AND group_id=?" (if processed (conc " AND processed=" processed) "") ";")
+	     uuid group-id))))
+
+;; look up descendents based on given info unless passed in a list via inlst
+;;
+;; (define (get-last-descendent db group-id uuid #!key (processed #f)(inlst #f))
+;;   (let ((descendents (or inlst (get-descendents db group-id uuid processed: processed))))
+;;     (if (null? descendents)
+;; 	#f
+;; 	(last descendents))))
+
+;;======================================================================
+;;  A R C H I V E S - always to a sqlite3 db 
+;;======================================================================
+
+;; open an archive db
+;; path: archive-dir/<year>/month.db
+;;
+#;(define (archive-open-db archive-dir)
+  (let* ((curr-time (seconds->local-time (current-seconds)))
+	 (dbpath    (conc archive-dir "/" (time->string curr-time "%Y")))
+	 (dbfile    (conc dbpath "/" (time->string curr-time "%m") ".db"))
+	 (dbexists (if (file-exists? dbfile) #t (begin (create-directory dbpath #t) #f))))
+    (let ((db (open-database dbfile)))
+      ;; (set-busy-handler! db (busy-timeout 10000))
+      (if (not dbexists) ;; NOTE: In the archive we allow duplicates and other messiness. 
+	  (execute db "CREATE TABLE IF NOT EXISTS artifacts
+                          (id           INTEGER,
+                           group_id     INTEGER,
+                           uuid         TEXT,
+                           parent_uuid  TEXT,
+                           artifact_type     TEXT,
+                           artifact          TEXT,
+                           processed    INTEGER DEFAULT 0)"))
+      db)))
+
+;; turn on transactions! otherwise this will be painfully slow
+;;
+#;(define (write-archive-artifacts src-db db artifact-ids)
+  (let ((artifacts (get-rows
+	       src-db
+	       (conc "SELECT id,group_id,uuid,parent_uuid,artifact_type,artifact FROM artifacts WHERE id IN ("
+		     (string-intersperse (map conc artifact-ids) ",") ")"))))
+    ;; (dbi:with-transaction
+    ;;  db
+     (lambda ()
+       (for-each
+	(lambda (artifact)
+	  (apply execute  db "INSERT INTO artifacts (id,group_id,uuid,parent_uuid,artifact_type,artifact)
+                               VALUES (?,?,?,?,?,?)"
+		 artifact))
+	artifacts)))) ;; )
+
+;; given a list of uuids and lists of uuids move all to
+;; the sqlite3 db for the current archive period
+;;
+#;(define (archive-artifacts conn artifact-ids archive-dir)
+  (let ((db (archive-open-db archive-dir)))
+    (write-archive-artifacts conn db artifact-ids)
+    (finalize! db))
+  ;; (pg:with-transaction
+  ;;  conn
+  ;; (lambda ()
+     (for-each
+      (lambda (id)
+	(get-one
+	 conn
+	 "DELETE FROM artifacts WHERE id=?" id))
+      artifact-ids)) ;; ))
+
+;; given a list of ids mark all as processed
+;;
+(define (mark-processed conn artifact-ids)
+  ;; (pg:with-transaction
+   ;; conn
+   ;; (lambda ()
+     (for-each
+      (lambda (id)
+	(get-one
+	 conn
+	 "UPDATE artifacts SET processed=1 WHERE id=?;" id))
+      artifact-ids)) ;; x))
+
+;; a generic artifact getter, gets from the artifacts db
+;;
+(define (get-artifacts conn ptypes)
+  (let* ((ptypes-str    (if (null? ptypes)
+			    ""
+			    (conc " WHERE artifact_type IN ('" (string-intersperse ptypes ",") "') ")))
+	 (qry-str       (conc "SELECT id,group_id,uuid,parent_uuid,artifact_type,artifact,processed FROM artifacts" ptypes-str)))
+    (map vector->list (get-rows conn qry-str))))
+
+;; make a report of the artifacts in the db
+;; ptypes of '() gets all artifacts
+;; display-fields
+;;
+(define (make-report dest conn artifactspec display-fields . ptypes)
+  (let* (;; (conn          (dbi:db-conn (s:db)))
+	 (all-rows      (get-artifacts conn ptypes))
+	 (all-artifacts      (flatten-all
+			 all-rows
+			 artifactspec
+			 'id 'group-id 'uuid 'parent 'artifact-type 'artifact 'processed))
+	 (by-uuid       (let ((ht (make-hash-table)))
+			  (for-each
+			   (lambda (artifact)
+			     (let ((uuid (alist-ref 'uuid artifact)))
+			       (hash-table-set! ht uuid artifact)))
+			   all-artifacts)
+			  ht))
+	 (by-parent     (let ((ht (make-hash-table)))
+			  (for-each
+			   (lambda (artifact)
+			     (let ((parent (alist-ref 'parent artifact)))
+			       (hash-table-set! ht parent (cons artifact (hash-table-ref/default ht parent '())))))
+			   all-artifacts)
+			    ht))
+	 (oup           (if dest (open-output-file dest) (current-output-port))))
+    
+    (with-output-to-port
+	oup
+      (lambda ()
+	(print "digraph megatest_state_status {
+  // ranksep=0.05
+  rankdir=LR;
+  node [shape=\"box\"];
+")
+	;; first all the names
+	(for-each
+	 (lambda (artifact)
+	   (let* ((uuid        (alist-ref 'uuid artifact))
+		  (shortuuid   (substring uuid 0 4))
+		  (type        (alist-ref 'artifact-type artifact))
+		  (processed   (alist-ref 'processed artifact)))
+	     
+	     (print "\"" uuid "\" [label=\"" shortuuid ", ("
+		    type ", "
+		    (if processed "processed" "not processed") ")")
+	     (for-each
+	      (lambda (key-field)
+		(let ((val (alist-ref key-field artifact)))
+		  (if val
+		      (print key-field "=" val))))
+	      display-fields)
+	     (print "\" ];")))
+	 all-artifacts)
+	;; now for parent-child relationships
+	(for-each
+	 (lambda (artifact)
+	   (let ((uuid   (alist-ref 'uuid artifact))
+		 (parent (alist-ref 'parent artifact)))
+	     (if (not (equal? parent ""))
+		 (print "\"" parent "\" -> \"" uuid"\";"))))
+	 all-artifacts)
+
+	(print "}")
+	))
+    (if dest
+	(begin
+	  (close-output-port oup)
+	  (system "dot -Tpdf out.dot -o out.pdf")))
+    
+    ))
+
+;;======================================================================
+;; Read ref artifacts into a vector < laststr hash table > 
+;;======================================================================
+
+
+
+;;======================================================================
+;; Read/write packets to files (convience functions)
+;;======================================================================
+
+;; write alist to a artifact file
+;;
+(define (write-alist->artifact targdir dat #!key (artifactspec '())(ptype #f))
+  (let-values (((uuid artifact)(alist->artifact dat artifactspec ptype: ptype)))
+    (with-output-to-file (conc targdir "/" uuid ".artifact")
+      (lambda ()
+	(print artifact)))
+    uuid)) ;; return the uuid
+
+;; read artifact into alist
+;;
+(define (read-artifact->alist artifact-file #!key (artifactspec #f))
+  (artifact->alist (with-input-from-file
+		  artifact-file
+		read-string)
+	      artifactspec: artifactspec))
+
+;;======================================================================
+;; File utils, stuff useful for file management
+;;======================================================================
+
+(define (file-get-sha1 fname)
+  (let* ((sha1-res (run/strings (sha1sum ,fname))))
+    (car (string-split (car sha1-res)))))
+
+(define (link-or-copy srcf destf)
+  (or (handle-exceptions
+       exn
+       #f
+       (file-link srcf destf))
+      (if (file-exists? destf)
+	  (print "NOTE: destination already exists, skipping copy.")
+	  (copy-file srcf destf))))
+
+;; (define (files-diff file1 file2)
+;;   (let* ((diff-res (with-input-from-port
+;; 		       (run/port (diff "-q" ,file1 ,file2))
+;; 		     (lambda ()
+;; 		       (let* ((res (read-line)))
+;; 			 (read-lines)
+;; 			 res)))))
+;;     (car (string-split sha1-res))))
+;; 
+
+
+(define (check-same file1 file2)
+  (cond
+   ((not (and (file-exists? file1)(file-exists? file2))) #f)
+   ((not (equal? (file-size file1)(file-size file2))) #f)
+   (else
+    (let-values (((status run-ok process-id)
+		  (run (diff "-q" ,file1 ,file2))))
+      status))))
+
+(define *pcache* (make-hash-table))
+(define (get-device dir)
+  (let ((indat (or (hash-table-ref/default *pcache* dir #f)
+		   (let* ((inp (open-input-pipe (conc "df --output=source \""dir"\"")))
+			  (res (read-lines inp)))
+		     (close-input-port inp)
+		     (hash-table-set! *pcache* dir res)
+		     res))))
+    (cadr indat)))
+
+(define (same-partition? dir1 dir2)
+  (equal? (get-device dir1)(get-device dir2)))
+
+(define (link-if-same-partition file1 file2)
+  (let* ((dir1 (pathname-directory file1))
+	 (dir2 (pathname-directory file2))
+	 (f1   (pathname-file file1))
+	 (f2   (pathname-file file2)))
+    (if (same-partition? dir1 dir2)
+	(let* ((tmpname (conc "."f2"-"(current-seconds))))
+	  ;; this steps needs to be executed as actual user    
+	  (move-file file2 (conc dir1 "/" tmpname))
+	  (file-link file1 file2)
+	  (delete-file (conc dir1 "/" tmpname))))))
+
+(define (uuid-first-two-letters sha1sum)
+  (substring sha1sum 0 2))
+
+(define (uuid-remaining-letters sha1sum)
+  (let ((slen (string-length sha1sum)))
+    (substring sha1sum 2 slen)))
+
+(define (archive-dest destd sha1sum)
+  (let* ((subdir         (uuid-first-two-letters sha1sum)) ;; (substring sha1sum 0 2))
+	 ;; (slen           (string-length sha1sum))
+	 (rem            sha1sum #;(uuid-remaining-letters sha1sum)) ;; (substring sha1sum 3 slen))
+	 (full-dest-dir  (conc destd"/"subdir))
+	 (full-dest-file (conc full-dest-dir"/"rem)))
+    (if (not (directory-exists? full-dest-dir))
+	(create-directory full-dest-dir #t))
+    full-dest-file))
+
+(define (write-to-archive data destd #!optional (nextnum #f))
+  (let* ((sha1sum    (calc-sha1 data))
+	 (full-dest  (conc (archive-dest destd sha1sum)
+			   (if nextnum (conc "."nextnum) ""))))
+    (if (file-exists? full-dest)
+	(if (equal? (string-intersperse (with-input-from-file full-dest read-lines) "\n")
+		    data)
+	    (begin
+	      ;; (print "INFO: data already exists in "full-dest" and is identical")
+	      sha1sum)
+	    (let ((nextnum (if nextnum (+ nextnum 1) 0)))
+	      (print "WARN: data already exists in "full-dest" but is different! Trying again...")
+	      (write-to-archive data destd nextnum)))
+	(begin
+	  (with-output-to-file
+	      full-dest
+	    (lambda ()
+	      (print data)))
+	  sha1sum)))) ;; BUG? Does print munge data?
+
+;; copy srcf with sha1sum aabc... to aa/bc...
+;;
+(define (archive-copy srcf destd sha1sum)
+  (let* ((full-dest-file (archive-dest destd sha1sum)))
+    (let loop ((trynum 0))
+      (let ((dest-name (if (> trynum 0)
+			   (conc full-dest-file"-"trynum)
+			   full-dest-file)))
+	(cond
+	 ((not (file-exists? srcf)) #f) ;; this should be an error?
+	 ((and (file-exists? srcf)
+	       (file-exists? dest-name))
+	  (if (check-same srcf dest-name)
+	      (link-if-same-partition dest-name srcf)
+	      (loop (+ trynum 1)))) ;; collisions are rare, this protects against them
+	 ((not (file-exists? dest-name))
+	  (link-or-copy srcf dest-name))
+	 (else #f))))))
+
+;; multi-glob
+(define (multi-glob globstrs inpath)
+  ;; (print "multi-glob: "globstrs", "inpath)
+  (if (equal? inpath "")
+      globstrs
+      (let* ((parts     (string-split inpath "/" #t))
+	     (nextpart  (car parts))
+	     (remaining (string-intersperse (cdr parts) "/")))
+	(if (and (equal? nextpart "") ;; this must be a leading / meaning root directory
+	         (null? globstrs))
+	    (multi-glob '("/") remaining)
+	    (begin
+	      ;; (print "nextpart="nextpart", remaining="remaining)
+	      (apply append
+              (map (lambda (gstr)
+                      (let* ((pathstr  (conc gstr"/"nextpart))
+		   	     (pathstrs (glob pathstr)))
+		          ;; (print "pathstr="pathstr)
+		          (multi-glob pathstrs remaining)))
+	               globstrs)))))))
+	     
+
+;; perm[/user:group]:
+;;   DDD - octal perm (future expansion)
+;;   -   - use umask/defacto perms (i.e. don't actively do anything)
+;;   x   - mark as executable
+;;
+;; Cards:
+;;   file:      f perm fname
+;;   directory: d perm fname artifactid
+;;   link:      l perm lname destpath
+;;
+;;   NOTE: cards are kept as (C . "value")
+;;
+;; given a directory path, ignore list and artifact store (hash-table):
+;;  1. create sha1 tree at dest (e.g. aa/b3a7 ...)
+;;  2. create artifact for each dir
+;;    - cards for all files
+;;    - cards for files that are symlinks or executables
+;;  3. return (artifactid . artifact)
+;;
+;; NOTES:
+;;   Use destdir of #f to not create sha1 tree
+;;   Hard links will be used if srcdir and destdir appear to be same partion
+;;
+;; (alist->artifact adat aspec #!key (ptype #f))
+;;
+;;
+;; (load "../../artifacts/artifacts.scm")(import big-chicken srfi-69 artifacts)(define dirdat (make-hash-table))
+;; (capture-dir ".." ".." "/tmp/junk" '() dirdat)
+;;
+;; [procedure] (file-type FILE [LINK [ERROR]])
+;; Returns the file-type for FILE, which should be a filename, a file-descriptor or a port object. If LINK is given and true, symbolic-links are not followed:
+;; 
+;;  regular-file
+;;  directory
+;;  fifo
+;;  socket
+;;  symbolic-link
+;;  character-device
+;;  block-device
+;; Note that not all types are supported on every platform. If ERROR is given and false, then file-type returns #f if the file does not exist; otherwise, it signals an error.
+;; 
+;; 
+(define (capture-dir curr-dir src-dir dest-dir ignore-list artifacts all-seen)
+  (let* ((dir-dat (directory-fold
+		   (lambda (fname res) ;; res is a list of artifact cards
+		     (let* ((fullname   (conc curr-dir"/"fname)))
+		       ;; (print "INFO: processing "fullname)
+		       (if (hash-table-ref/default all-seen fullname #f) ;; something circular going on
+			   (begin
+			     (print "WARNING: possible circular link(s) "fullname)
+			     res)
+			   (let* ((ftype (file-type fullname #t #f)))
+			     (hash-table-set! all-seen fullname ftype)
+			     (cons
+			      (case ftype ;; get the card
+			       ((directory) ;; (directory? fullname)
+				(let* ((new-curr-dir (conc curr-dir"/"fname))
+				       (new-src-dir  (conc src-dir"/"fname)))
+				  (let* ((dir-dat (capture-dir new-curr-dir new-src-dir
+							       dest-dir ignore-list artifacts all-seen))
+					 (a-id (car dir-dat))
+					 (artf (cdr dir-dat)))
+				    (hash-table-set! artifacts a-id artf)
+				    (cons 'd (conc "- "a-id" "fname))))) ;; the card
+			       ((symbolic-link) ;; (symbolic-link? fullname)
+				(let ((ldest (read-symbolic-link fullname)))
+				  (cons 'l (conc "- "fname"/"ldest)))) ;; delimit link name from dest with /
+			       ((regular-file) ;; must be a file
+				(let* ((start      (current-seconds))
+				       (sha1sum    (file-get-sha1 fullname))
+				       (perms      (if (file-executable? fullname) "x" "-")))
+				  (let ((runtime (- (current-seconds) start)))
+				    (if (> runtime 1)
+					(print "INFO: file "fullname" took "runtime" seconds to calculate sha1.")))
+				  (if dest-dir
+				      (archive-copy fullname dest-dir sha1sum))
+				  (cons 'f (conc perms " "sha1sum" "fname))))
+			       (else
+				(print "WARNING: file "fullname" of type "ftype" is NOT supported and will converted to empty file.")
+				(let* ((sha1sum (write-to-archive "" dest-dir)))
+				  (cons 'f (conc "- "sha1sum" "fname)))))
+			      res)))))
+		       '() src-dir #:dotfiles? #t))) ;; => (values srcdir_artifact sub_artifacts_list)
+    ;; (print "dir-dat: " dir-dat)
+    (let-values (((a-id artf)
+		  (alist->artifact dir-dat '() ptype: 'd no-d: #t)))
+      (hash-table-set! artifacts a-id artf)
+      (cons a-id artf))))
+
+;; maybe move this into artifacts?
+;;
+;; currently moves *.artifact into a bundle and moves the artifacts into attic
+;; future: move artifacts under 1 meg in size into bundle up to 10 meg in size
+;;
+(define (artifact-rollup bundle-dir) ;; cfg storepath)
+  ;; (let* ((bundle-dir (calc-bundle-dir cfg storepath)))
+    (let* ((bundles   (glob (conc bundle-dir"/*.bundle")))
+	   (artifacts (glob (conc bundle-dir"/*.artifact"))))
+      (if (> (length artifacts) 30) ;; rollup only if > 30 artifacts
+	  ;; if we have unbundled artifacts, bundle them
+	  (let* ((ht     (read-artifacts-into-hash #f artifacts: artifacts))
+		 (bundle (hash-of-artifacts->bundle ht)))
+	    (write-bundle bundle bundle-dir)
+	    (create-directory (conc bundle-dir"/attic") #t)
+	    (for-each
+	     (lambda (full-fname)
+	       (let* ((fname   (pathname-strip-directory full-fname))
+		      (newname (conc bundle-dir"/attic/"fname)))
+		 (move-file full-fname newname #t)))
+	     artifacts)
+	    (conc "bundled "(length artifacts)))
+	  "not enough artifacts to bundle")))
+
+;; if destfile is a directory then calculate the sha1sum of the bundle and store it
+;; by <sha1sum>.bundle
+;;
+;; incoming dat is pure text (bundle already sorted and appended:
+;;
+(define (write-bundle bdl-data destdir)
+  (let* ((bdl-uuid  (calc-sha1 bdl-data)))
+    (with-output-to-file
+	(conc destdir"/"bdl-uuid".bundle")
+      (lambda ()
+	(print bdl-data)))))
+
+;; minimal (and hopefully fast) artifact reader
+;; TODO: Add check of shar sum.
+;;
+(define (minimal-artifact-read fname)
+  (let* ((indat (with-input-from-file fname read-lines)))
+    (if (null? indat)
+	(values #f (conc "did not find an artifact in "fname))
+	(let* ((zcard (last indat))
+	       (cardk (substring zcard 0 1))
+	       (cardv (substring zcard 2 (string-length zcard))))
+	  (if (equal? cardk "Z")
+	      (values cardv (string-intersperse indat "\n"))
+	      (values #f (conc fname" is not a valid artifact")))))))
+
+;; read artifacts from directory into hash
+;; NOTE: support for max-count not implemented yet
+;;
+(define (read-artifacts-into-hash dir #!key (artifacts #f) (max-count #f)(ht #f))
+  (let* ((artifacts (or artifacts
+			(glob (conc dir"/*.artifact"))))
+	 (ht        (or ht (make-hash-table))))
+    (for-each
+     (lambda (fname)
+       (let-values (((uuid afct)
+		     (minimal-artifact-read fname)))
+	 (hash-table-set! ht uuid afct)))
+     artifacts)
+    ht))
+
+;; ht is:
+;;   uuid => artifact text
+;; use write-bundle to put result into a bundle file
+;;
+(define (hash-of-artifacts->bundle ht)    
+  (fold (lambda (k res)
+	  (let* ((v (hash-table-ref ht k)))
+	    (if res
+		(conc res"\n"v)
+		v)))
+	#f
+	(sort (hash-table-keys ht) string<=?)))
+
+;; minimal artifact to alist
+;;
+(define (minimal-artifact->alist afact)
+  (let* ((lines   (string-split afact "\n")))
+    (map (lambda (a)
+	   (let* ((key (string->symbol (substring a 0 1)))
+		  (sl  (string-length a))
+		  (val (if (> sl 2)
+			   (substring a 2 sl)
+			   "")))
+	     (cons key val)))
+	 lines)))
+
+;; some accessors for common cards
+(define (afact-get-D afact)
+  (let ((dval (alist-ref 'D afact)))
+    (if dval
+	(string->number dval)
+	#f)))
+
+(define (afact-get-T afact) ;; get the artifact type as a symbol
+  (let ((val (alist-ref 'T afact)))
+    (if val
+	(string->symbol val)
+	val)))
+
+(define (afact-get-Z afact)
+  (alist-ref 'Z afact))
+
+(define (afact-get afact key default)
+  (or (alist-ref key afact)
+      default))
+
+(define (afact-get-number/default afact key default)
+  (let ((val (alist-ref key afact)))
+    (if val
+	(or (string->number val) default) ;; seems wrong
+	default)))
+
+;; bundles are never big and reading into memory for processing is fine
+;;
+(define (read-bundle srcfile #!optional (mode 'uuid-raw))
+  (let* ((indat (with-input-from-file srcfile read-lines)))
+    (let loop ((tail indat)
+	       (dat '())  ;; artifact being extracted
+	       (res '())) ;; list of artifacts
+      (if (null? tail)
+	  (reverse res) ;; last dat should be empty list
+	  (let* ((curr-line (car tail)))
+	    (let-values (((ctype cdata)
+			  (card->type/value curr-line)))
+	      (let* ((is-z-card (eq? 'Z ctype))
+		     (new-dat   (cons (case mode
+					((uuid-raw) curr-line)
+					(else       (cons ctype cdata)))
+				      dat)))
+		(if is-z-card
+		    (loop (cdr tail) ;; done with this artifact
+			  '()
+			  (cons (case mode
+				  ((uuid-raw) (cons cdata (string-intersperse (reverse new-dat) "\n")))
+				  (else       (reverse new-dat)))
+				res))
+		    (loop (cdr tail)
+			  new-dat
+			  res)))))))))
+
+
+;; find all .bundle and .artifacts files in bundle-dir
+;; and inport them into sqlite handle adb
+;; 
+(define (refresh-artifacts-db adb bundle-dir)
+  (let* ((bundles   (glob (conc bundle-dir"/*.bundle")))
+	 (artifacts (glob (conc bundle-dir"/*.artifact")))
+	 (uuids     (get-all-uuids adb 'hash)))
+    (with-transaction
+     adb
+     (lambda ()
+       (for-each
+	(lambda (bundle-file)
+	  ;; (print "Importing artifacts from "bundle-file)
+	  (let* ((bdat (read-bundle bundle-file 'uuid-raw))
+		 (count 0)
+		 (inc   (lambda ()(set! count (+ count 1)))))
+	    (for-each
+	     (lambda (adat)
+	       (match
+		adat
+		((zval . artifact)
+		 (if (not (hash-table-exists? uuids zval))
+		     (begin
+		       ;; (print "INFO: importing new artifact "zval" from bundle "bundle-file)
+		       (inc)
+		       (execute adb "INSERT INTO artifacts (uuid, artifact) VALUES (?,?);"
+				zval artifact)
+		       (hash-table-set! uuids zval #t))))
+		(else
+		 (print "ERROR: Bad artifact data "adat))))
+	     bdat)
+	    (print "INFO: imported "count" artifacts from "bundle-file)))
+	bundles)
+       (for-each
+	(lambda (artifact-file)
+	  ;; (print "Importing artifact from "artifact-file)
+	  (let-values (((uuid artifact) (minimal-artifact-read artifact-file)))
+	    (if uuid
+		(if (not (hash-table-exists? uuids uuid))
+		    (begin
+		      ;; (print "INFO: importing new artifact "uuid" from "artifact-file)
+		      (execute adb "INSERT INTO artifacts (uuid, artifact) VALUES (?,?);"
+			       uuid artifact)
+		      (hash-table-set! uuids uuid #t)))
+		(print "Bad artifact in "artifact-file))))
+	artifacts)))))
+
+;;======================================================================
+;;  Artifacts db cache
+;;======================================================================
+
+;; artifacts
+;;   id SERIAL PRIMARY KEY,
+;;   uuid TEXT NOT NULL,
+;;   artifact TEXT NOT NULL
+;;
+;; parents
+;;   id INTEGER REFERENCES artids.id,  -- 
+;;   parent_id  REFERENCES artids.id
+;;
+;; schema is list of SQL statements - can be used to extend db with more tables
+;;
+(define (open-artifacts-db dbpath dbfile #!key (schema '()))
+  (let* ((dbfname  (conc dbpath "/" dbfile))
+	 (dbexists (if (file-exists? dbfname) #t (begin (create-directory dbpath #t) #f)))
+	 (adb      (open-database dbfname)))
+    (set-busy-handler! adb (make-busy-timeout 10000))
+    (execute adb "PRAGMA synchronous = 0;")
+    (if (not dbexists)
+	(with-transaction
+	 adb
+	 (lambda ()
+	   (for-each
+	    (lambda (stmt)
+	      (execute adb stmt))
+	    (append `("CREATE TABLE IF NOT EXISTS artifacts
+                          (id           INTEGER PRIMARY KEY,
+                           uuid         TEXT NOT NULL,
+                           artifact     TEXT NOT NULL)"
+		      
+		      "CREATE TABLE IF NOT EXISTS parents
+                          (id INTEGER REFERENCES artifacts(id) NOT NULL,
+                           parent_id INTEGER REFERENCES artifacts(id) NOT NULL)")
+		    schema)))))
+    adb))
+
+(define (generate-year-month-name #!optional (seconds #f))
+  (let* ((curr-time (seconds->local-time (or seconds (current-seconds)))))
+    (time->string curr-time "%Y%m")))
+
+;; I don't like this function. TODO: remove the
+;; mode and option to return ht. Use instead the
+;; get-all-artifacts below
+;;
+(define (get-all-uuids adb #!optional (mode #f))
+  (let* ((res (fold-row
+	       (lambda (res uuid)
+		 (cons uuid res))
+	       '()
+	       adb
+	       "SELECT uuid FROM artifacts;")))
+    (case mode
+      ((hash)
+       (let* ((ht (make-hash-table)))
+	 (for-each
+	  (lambda (uuid)
+	    (hash-table-set! ht uuid #t))
+	  res)
+	 ht))
+      (else res))))
+
+;; returns raw artifacts (i.e. NOT alists but instead plain text)
+(define (get-all-artifacts adb)
+  (let* ((ht  (make-hash-table)))
+    (for-each-row
+     (lambda (id uuid artifact)
+       (hash-table-set! ht uuid `(,id ,uuid ,artifact)))
+     adb
+     "SELECT id,uuid,artifact FROM artifacts;")
+    ht))
+
+;; given a bundle-dir copy or create to /tmp and open
+;; the YYMM.db file and hand the handle to the given proc
+;; NOTE: we operate in /tmp/ to accomodate users on NFS
+;; where slamming Unix locks at an NFS filer can cause
+;; locking fails. Eventually this /tmp behavior will be
+;; configurable.
+;;
+(define (with-todays-adb bundle-dir proc)
+  (let* ((dbname   (conc (generate-year-month-name) ".db"))
+	 (destname (conc bundle-dir"/"dbname))
+	 (tmparea  (conc "/tmp/"(current-user-name)"-"(calc-sha1 bundle-dir)))
+	 (tmpname  (conc tmparea"/"dbname))
+	 (lockfile (conc destname".update-in-progress")))
+    ;; (print "with-todays-adb, bundle-dir: "bundle-dir", dbname: "dbname", destname: "destname",\n    tmparea: " tmparea", lockfile: "lockfile)
+    (if (not (file-exists? tmparea))(create-directory tmparea #t))
+    (let loop ((count 0))
+      (if (file-exists? lockfile)
+	  (if (< count 30) ;; aproximately 30 seconds
+	      (begin
+		(sleep 1)
+		(loop (+ 1 count)))
+	      (print "ERROR: "lockfile" exists, proceeding anyway"))
+	  (if (file-exists? destname)
+	      (begin
+		(copy-file destname tmpname #t)
+		(copy-file destname lockfile #t)))))
+    (let* ((adb  (open-artifacts-db tmparea dbname))
+	   (res  (proc adb)))
+      (finalize! adb)
+      (copy-file tmpname destname #t)
+      (delete-file* lockfile)
+      res)))
+
+) ;; module artifacts
+
+;; ATTIC
+

ADDED   artifacts/artifacts.setup
Index: artifacts/artifacts.setup
==================================================================
--- /dev/null
+++ artifacts/artifacts.setup
@@ -0,0 +1,11 @@
+;; Copyright 2007-2017, Matthew Welland.
+;;
+;;  This program is made available under the GNU GPL version 2.0 or
+;;  greater. See the accompanying file COPYING for details.
+;;
+;;  This program is distributed WITHOUT ANY WARRANTY; without even the
+;;  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+;;  PURPOSE.
+
+;;;; pkts.setup
+(standard-extension 'pkts "1.0")

ADDED   artifacts/artifactsrec.scm
Index: artifacts/artifactsrec.scm
==================================================================
--- /dev/null
+++ artifacts/artifactsrec.scm
@@ -0,0 +1,196 @@
+(define-syntax define-record-type
+  (syntax-rules ()
+    ((define-record-type type
+       (constructor constructor-tag ...)
+       predicate
+       (field-tag accessor . more) ...)
+     (begin
+       (define type
+         (make-record-type 'type '(field-tag ...)))
+       (define constructor
+         (record-constructor type '(constructor-tag ...)))
+       (define predicate
+         (record-predicate type))
+       (define-record-field type field-tag accessor . more)
+       ...))))
+
+; An auxilliary macro for define field accessors and modifiers.
+; This is needed only because modifiers are optional.
+
+(define-syntax define-record-field
+  (syntax-rules ()
+    ((define-record-field type field-tag accessor)
+     (define accessor (record-accessor type 'field-tag)))
+    ((define-record-field type field-tag accessor modifier)
+     (begin
+       (define accessor (record-accessor type 'field-tag))
+       (define modifier (record-modifier type 'field-tag))))))
+
+; Record types
+
+; We define the following procedures:
+; 
+; (make-record-type <type-name <field-names>)    -> <record-type>
+; (record-constructor <record-type<field-names>) -> <constructor>
+; (record-predicate <record-type>)               -> <predicate>
+; (record-accessor <record-type <field-name>)    -> <accessor>
+; (record-modifier <record-type <field-name>)    -> <modifier>
+;   where
+; (<constructor> <initial-value> ...)         -> <record>
+; (<predicate> <value>)                       -> <boolean>
+; (<accessor> <record>)                       -> <value>
+; (<modifier> <record> <value>)         -> <unspecific>
+
+; Record types are implemented using vector-like records.  The first
+; slot of each record contains the record's type, which is itself a
+; record.
+
+(define (record-type record)
+  (record-ref record 0))
+
+;----------------
+; Record types are themselves records, so we first define the type for
+; them.  Except for problems with circularities, this could be defined as:
+;  (define-record-type :record-type
+;    (make-record-type name field-tags)
+;    record-type?
+;    (name record-type-name)
+;    (field-tags record-type-field-tags))
+; As it is, we need to define everything by hand.
+
+(define :record-type (make-record 3))
+(record-set! :record-type 0 :record-type)	; Its type is itself.
+(record-set! :record-type 1 ':record-type)
+(record-set! :record-type 2 '(name field-tags))
+
+; Now that :record-type exists we can define a procedure for making more
+; record types.
+
+(define (make-record-type name field-tags)
+  (let ((new (make-record 3)))
+    (record-set! new 0 :record-type)
+    (record-set! new 1 name)
+    (record-set! new 2 field-tags)
+    new))
+
+; Accessors for record types.
+
+(define (record-type-name record-type)
+  (record-ref record-type 1))
+
+(define (record-type-field-tags record-type)
+  (record-ref record-type 2))
+
+;----------------
+; A utility for getting the offset of a field within a record.
+
+(define (field-index type tag)
+  (let loop ((i 1) (tags (record-type-field-tags type)))
+    (cond ((null? tags)
+           (error "record type has no such field" type tag))
+          ((eq? tag (car tags))
+           i)
+          (else
+           (loop (+ i 1) (cdr tags))))))
+
+;----------------
+; Now we are ready to define RECORD-CONSTRUCTOR and the rest of the
+; procedures used by the macro expansion of DEFINE-RECORD-TYPE.
+
+(define (record-constructor type tags)
+  (let ((size (length (record-type-field-tags type)))
+        (arg-count (length tags))
+        (indexes (map (lambda (tag)
+                        (field-index type tag))
+                      tags)))
+    (lambda args
+      (if (= (length args)
+             arg-count)
+          (let ((new (make-record (+ size 1))))
+            (record-set! new 0 type)
+            (for-each (lambda (arg i)
+			(record-set! new i arg))
+                      args
+                      indexes)
+            new)
+          (error "wrong number of arguments to constructor" type args)))))
+
+(define (record-predicate type)
+  (lambda (thing)
+    (and (record? thing)
+         (eq? (record-type thing)
+              type))))
+
+(define (record-accessor type tag)
+  (let ((index (field-index type tag)))
+    (lambda (thing)
+      (if (and (record? thing)
+               (eq? (record-type thing)
+                    type))
+          (record-ref thing index)
+          (error "accessor applied to bad value" type tag thing)))))
+
+(define (record-modifier type tag)
+  (let ((index (field-index type tag)))
+    (lambda (thing value)
+      (if (and (record? thing)
+               (eq? (record-type thing)
+                    type))
+          (record-set! thing index value)
+          (error "modifier applied to bad value" type tag thing)))))
+
+Records
+
+; This implements a record abstraction that is identical to vectors,
+; except that they are not vectors (VECTOR? returns false when given a
+; record and RECORD? returns false when given a vector).  The following
+; procedures are provided:
+;   (record? <value>)                -> <boolean>
+;   (make-record <size>)             -> <record>
+;   (record-ref <record> <index>)    -> <value>
+;   (record-set! <record> <index> <value>) -> <unspecific>
+;
+; These can implemented in R5RS Scheme as vectors with a distinguishing
+; value at index zero, providing VECTOR? is redefined to be a procedure
+; that returns false if its argument contains the distinguishing record
+; value.  EVAL is also redefined to use the new value of VECTOR?.
+
+; Define the marker and redefine VECTOR? and EVAL.
+
+(define record-marker (list 'record-marker))
+
+(define real-vector? vector?)
+
+(define (vector? x)
+  (and (real-vector? x)
+       (or (= 0 (vector-length x))
+	   (not (eq? (vector-ref x 0)
+		record-marker)))))
+
+; This won't work if ENV is the interaction environment and someone has
+; redefined LAMBDA there.
+
+(define eval
+  (let ((real-eval eval))
+    (lambda (exp env)
+      ((real-eval `(lambda (vector?) ,exp))
+       vector?))))
+
+; Definitions of the record procedures.
+
+(define (record? x)
+  (and (real-vector? x)
+       (< 0 (vector-length x))
+       (eq? (vector-ref x 0)
+            record-marker)))
+
+(define (make-record size)
+  (let ((new (make-vector (+ size 1))))
+    (vector-set! new 0 record-marker)
+    new))
+
+(define (record-ref record index)
+  (vector-ref record (+ index 1)))
+
+(define (record-set! record index value)
+  (vector-set! record (+ index 1) value))

ADDED   artifacts/tests/run.scm
Index: artifacts/tests/run.scm
==================================================================
--- /dev/null
+++ artifacts/tests/run.scm
@@ -0,0 +1,139 @@
+(use test)
+
+;; (use (prefix pkts pkts:))
+(use pkts (prefix dbi dbi:))
+;; (use trace)(trace sdat->alist pkt->alist)
+
+(if (file-exists? "queue.db")(delete-file "queue.db"))
+
+(test-begin "pkts and pkt archives")
+
+;;======================================================================
+;; Basic pkt creation, parsing and conversion routines
+;;======================================================================
+
+(test-begin "basic packets")
+(test #f '(A "This is a packet") (let-values (((t v)
+					       (card->type/value "A This is a packet")))
+				   (list t v)))
+(test #f "A A\nZ 664449e7299e0065a3e25c138ccef2df13ba291e"
+      (let-values (((uuid res)
+		    (add-z-card '("A A"))))
+	res))
+(test #f '("CC C++" "D 1486332719.0" "a A" "b C")(sort (construct-sdat 'b "C" 'a "A" 'CC "C++" 'D 1486332719.0)
+						       string<=?))
+(define pkt-example #f)
+(test #f "CC C++\nD 1486332719.0\na A\nb C\nZ 263eb3b6193de7fe65b1ded5bcda513e8b4d6b84"
+      (let-values (((uuid res)
+		    (construct-pkt 'b "C" 'a "A" 'CC "C++" 'D 1486332719.0)))
+	(set! pkt-example (cons uuid res))
+	res))
+(test-end "basic packets")
+
+;;======================================================================
+;; Sqlite and postgresql based queue of pkts
+;;======================================================================
+
+(test-begin "pkt queue")
+(define db #f)
+(test #f 'sqlite3 (let ((dbh (open-queue-db "." "queue.db")))
+		    (set! db dbh)
+		    (dbi:db-dbtype dbh)))
+(test #f (cdr pkt-example)
+      (begin
+	(add-to-queue db (cdr pkt-example)(car pkt-example) 'basic #f 0)
+	(lookup-by-uuid db (car pkt-example) 0)))
+(test #f (cdr pkt-example)
+      (lookup-by-id db 1))
+(test #f 1 (length (find-pkts db '(basic) '())))
+
+(test-end "pkt queue")
+
+
+;;======================================================================
+;; Process groups of pkts
+;;======================================================================
+
+(test-begin "lists of packets")
+(test #f '((apkt . #f) (id . 1) (group-id . 2) (uuid . 3) (parent . 4) (pkt-type . 5)) ;; ((id . 1) (group-id . 2) (uuid . 3) (parent . 4) (pkt-type . 5))
+      (dblst->dpkts '(1 2 3 4 5)))
+(test #f '(((apkt (Z . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (b . "C") (a . "A") (D . "1486332719.0") (C . " C++")) (id . 1) (group-id . 0) (uuid . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (parent . "") (pkt-type . "basic") (pkt . "CC C++\nD 1486332719.0\na A\nb C\nZ 263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (processed . 0)))
+      ;; '(((apkt (Z . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (b . "C") (a . "A") (D . "1486332719.0") (C . " C++")) (id . 1) (group-id . 0) (uuid . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (parent . "") (pkt-type . "basic") (pkt . "CC C++\nD 1486332719.0\na A\nb C\nZ 263eb3b6193de7fe65b1ded5bcda513e8b4d6b84")))
+      ;; '(((id . 1) (group-id . 0) (uuid . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (parent . "") (pkt-type . "basic") (pkt . "CC C++\nD 1486332719.0\na A\nb C\nZ 263eb3b6193de7fe65b1ded5bcda513e8b4d6b84")))
+      (get-dpkts db '(basic) 0 #f))
+(test #f '(((apkt (Z . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (b . "C") (a . "A") (D . "1486332719.0") (C . " C++")) (id . 1) (group-id . 0) (uuid . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (parent . "") (pkt-type . "basic") (pkt . "CC C++\nD 1486332719.0\na A\nb C\nZ 263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (processed . 0)))
+      ;; '(((apkt (Z . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (b . "C") (a . "A") (D . "1486332719.0") (C . " C++")) (id . 1) (group-id . 0) (uuid . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (parent . "") (pkt-type . "basic") (pkt . "CC C++\nD 1486332719.0\na A\nb C\nZ 263eb3b6193de7fe65b1ded5bcda513e8b4d6b84")))
+      ;; '(((id . 1) (group-id . 0) (uuid . "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84") (parent . "") (pkt-type . "basic") (pkt . "CC C++\nD 1486332719.0\na A\nb C\nZ 263eb3b6193de7fe65b1ded5bcda513e8b4d6b84")))
+      (get-not-processed-pkts db 0 'basic 1000 0))
+(test-end "lists of packets")
+
+(test-begin "pkts as alists")
+(define pktspec '((posting . ((title . t)   ;; NOTE: same as ((posting (title . t)(url . u)(blub . b)) ... 
+			      (url   . u)
+			      (blurb . b)))
+		  (comment . ((comment . c)
+			      (score   . s)))
+		  (basic   . ((b-field . b)
+			      (a-field . a)))))
+(define pktlst (find-pkts db '(basic) '()))
+(define dpkt (car pktlst))
+(test #f "A" (get-value 'a-field dpkt pktspec))
+
+(test #f "C" (alist-ref 'b-field (dpkt->alist dpkt pktspec)))
+
+(define basic-spec '((nada (foo . b)(bar . f))(basic (foo . f)(bar . b))))
+(define test-pkt   '((foo . "fooval")(bar . "barval")))
+(let*-values (((u p)  (alist->pkt test-pkt basic-spec ptype: 'basic))
+		((apkt) (pkt->alist p))
+		((bpkt) (pkt->alist p pktspec: basic-spec)))
+    (test #f "fooval" (alist-ref 'f apkt))
+    (test #f "fooval" (alist-ref 'foo bpkt))
+    (test #f #f       (alist-ref 'f   bpkt)))
+
+(test-end "pkts as alists")
+
+(test-begin "descendents and ancestors")
+
+(define (get-uuid pkt)(alist-ref 'uuid pkt))
+
+;; add a child to 263e
+(let-values (((uuid pkt)
+	      (construct-pkt 'x "X" 'y "Y" 'P "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84"
+			     'D "1486332719.0")))
+  (add-to-queue db pkt uuid 'basic "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84" 0))
+
+(test #f '("263eb3b6193de7fe65b1ded5bcda513e8b4d6b84" "818fe30988c9673441b8f203972a8bda6af682f8")
+      (map (lambda (x)(alist-ref 'uuid x))
+	   (get-descendents
+	    db 0
+	    "263eb3b6193de7fe65b1ded5bcda513e8b4d6b84")))
+
+(test #f '("263eb3b6193de7fe65b1ded5bcda513e8b4d6b84" "818fe30988c9673441b8f203972a8bda6af682f8")
+      (map (lambda (x)(alist-ref 'uuid x))
+	   (get-ancestors
+	    db 0
+	    "818fe30988c9673441b8f203972a8bda6af682f8")))
+
+(test-end "descendents and ancestors")
+
+(test-end "pkts and pkt archives")
+
+(test-begin "pktsdb")
+
+(define spec '((tests (testname n TEXT)
+		      (testpath p TEXT)
+		      (duration d INTEGER))))
+;; (define pktsdb (make-pktdb))
+;; (pktdb-pktsdb-spec-set! pktsdb spec)
+
+(define pktsdb #f)
+
+(test #f #t (dbi:database? (let ((pdb (pktsdb-open "test.db" spec)))
+			     (set! pktsdb pdb)
+			     (pktdb-conn pdb))))
+;; (pp (pktdb-pktspec pktsdb))
+(test #f #t (pktsdb-add-record pktsdb 'tests '((testname . "test1"))))
+
+(pktsdb-close pktsdb)
+
+(test-end "pktsdb")

ADDED   attic/client.scm
Index: attic/client.scm
==================================================================
--- /dev/null
+++ attic/client.scm
@@ -0,0 +1,46 @@
+
+;; Copyright 2006-2012, Matthew Welland.
+;; 
+;; This file is part of Megatest.
+;; 
+;;     Megatest is free software: you can redistribute it and/or modify
+;;     it under the terms of the GNU General Public License as published by
+;;     the Free Software Foundation, either version 3 of the License, or
+;;     (at your option) any later version.
+;; 
+;;     Megatest is distributed in the hope that it will be useful,
+;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;     GNU General Public License for more details.
+;; 
+;;     You should have received a copy of the GNU General Public License
+;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+
+;;======================================================================
+;; C L I E N T S
+;;======================================================================
+
+(declare (unit client))
+
+(declare (uses common))
+(declare (uses db))
+(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running.
+(declare (uses commonmod))
+
+(use srfi-18 extras tcp s11n srfi-1 posix regex srfi-69 hostinfo md5
+     message-digest matchable spiffy uri-common intarweb http-client
+     spiffy-request-vars uri-common intarweb directory-utils)
+
+(import commonmod
+	debugprint)
+
+(module client
+*
+
+)
+
+(import client)
+
+(include "common_records.scm")
+(include "db_records.scm")
+

ADDED   attic/http-transport.scm
Index: attic/http-transport.scm
==================================================================
--- /dev/null
+++ attic/http-transport.scm
@@ -0,0 +1,708 @@
+
+;; Copyright 2006-2012, Matthew Welland.
+;; 
+;; This file is part of Megatest.
+;; 
+;;     Megatest is free software: you can redistribute it and/or modify
+;;     it under the terms of the GNU General Public License as published by
+;;     the Free Software Foundation, either version 3 of the License, or
+;;     (at your option) any later version.
+;; 
+;;     Megatest is distributed in the hope that it will be useful,
+;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;     GNU General Public License for more details.
+;; 
+;;     You should have received a copy of the GNU General Public License
+;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+
+(declare (unit http-transport))
+
+(declare (uses common))
+(declare (uses debugprint))
+(declare (uses db))
+(declare (uses tests))
+(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running.
+(declare (uses server))
+;; (declare (uses daemon))
+(declare (uses portlogger))
+(declare (uses rmt))
+(declare (uses dbfile))
+(declare (uses commonmod))
+(declare (uses mtargs))
+
+(module http-transport
+*
+
+
+(import srfi-1 posix regex regex-case srfi-69 hostinfo md5
+	message-digest posix-extras spiffy uri-common intarweb http-client
+	spiffy-request-vars intarweb spiffy-directory-listing
+	(srfi 18) extras tcp s11n)
+
+(import scheme
+	chicken
+	
+	(prefix mtargs args:)
+	debugprint)
+
+;; Configurations for server
+(tcp-buffer-size 2048)
+(max-connections 2048) 
+
+(include "common_records.scm")
+(include "db_records.scm")
+(include "js-path.scm")
+
+(import dbfile commonmod)
+
+(require-library stml)
+(define (http-transport:make-server-url hostport)
+  (if (not hostport)
+      #f
+      (conc "http://" (car hostport) ":" (cadr hostport))))
+
+(define *server-loop-heart-beat* (current-seconds))
+
+;;======================================================================
+;; S E R V E R
+;; ======================================================================
+
+;; Call this to start the actual server
+;;
+
+(define *db:process-queue-mutex* (make-mutex))
+
+(define (http-transport:run hostn)
+  ;; Configurations for server
+  (tcp-buffer-size 2048)
+  (max-connections 2048) 
+  (debug:print 2 *default-log-port* "Attempting to start the server ...")
+  (let* ((db              #f) ;;        (open-db)) ;; we don't want the server to be opening and closing the db unnecesarily
+	 (hostname        (get-host-name))
+	 (ipaddrstr       (let ((ipstr (if (string=? "-" hostn)
+					   ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
+					   (server:get-best-guess-address hostname)
+					   #f)))
+			    (if ipstr ipstr hostn))) ;; hostname))) 
+	 (start-port      (portlogger:open-run-close portlogger:find-port))
+	 (link-tree-path  (common:get-linktree))
+	 (tmp-area        (common:get-db-tmp-area))
+	 (start-file      (conc tmp-area "/.server-start")))
+    (debug:print-info 0 *default-log-port* "portlogger recommended port: " start-port)
+    ;; set some parameters for the server
+    (root-path     (if link-tree-path 
+		       link-tree-path
+		       (current-directory))) ;; WARNING: SECURITY HOLE. FIX ASAP!
+    (handle-directory spiffy-directory-listing)
+    (handle-exception (lambda (exn chain)
+			(signal (make-composite-condition
+				 (make-property-condition 
+				  'server
+				  'message "server error")))))
+
+    ;; http-transport:handle-directory) ;; simple-directory-handler)
+    ;; Setup the web server and a /ctrl interface
+    ;;
+    (vhost-map `(((* any) . ,(lambda (continue)
+			       ;; open the db on the first call 
+				 ;; This is were we set up the database connections
+			       (let* (($   (request-vars source: 'both))
+				      (dat ($ 'dat))
+				      (res #f))
+				 (cond
+				  ((equal? (uri-path (request-uri (current-request)))
+					   '(/ "api"))
+				   (send-response body:    (api:process-request *dbstruct-dbs* $) ;; the $ is the request vars proc
+						  headers: '((content-type text/plain)))
+				   (mutex-lock! *heartbeat-mutex*)
+				   (set! *db-last-access* (current-seconds))
+				   (mutex-unlock! *heartbeat-mutex*))
+				  ((equal? (uri-path (request-uri (current-request))) 
+					   '(/ ""))
+				   (send-response body: (http-transport:main-page)))
+				  ((equal? (uri-path (request-uri (current-request))) 
+					   '(/ "json_api"))
+				   (send-response body: (http-transport:main-page)))
+				  ((equal? (uri-path (request-uri (current-request))) 
+					   '(/ "runs"))
+				   (send-response body: (http-transport:main-page)))
+				  ((equal? (uri-path (request-uri (current-request))) 
+					   '(/ any))
+				   (send-response body: "hey there!\n"
+						  headers: '((content-type text/plain))))
+				  ((equal? (uri-path (request-uri (current-request))) 
+					   '(/ "hey"))
+				   (send-response body: "hey there!\n" 
+						  headers: '((content-type text/plain))))
+                                  ((equal? (uri-path (request-uri (current-request))) 
+					   '(/ "jquery3.1.0.js"))
+				   (send-response body: (http-transport:show-jquery) 
+						  headers: '((content-type application/javascript))))
+                                  ((equal? (uri-path (request-uri (current-request))) 
+					   '(/ "test_log"))
+				   (send-response body: (http-transport:html-test-log $) 
+						  headers: '((content-type text/HTML))))    
+                                  ((equal? (uri-path (request-uri (current-request))) 
+					   '(/ "dashboard"))
+				   (send-response body: (http-transport:html-dboard $) 
+						  headers: '((content-type text/HTML)))) 
+				  (else (continue))))))))
+    (handle-exceptions
+	exn
+      (debug:print 0 *default-log-port* "Failed to create file " start-file ", exn=" exn)
+      (with-output-to-file start-file (lambda ()(print (current-process-id)))))
+    (http-transport:try-start-server ipaddrstr start-port)))
+
+;; This is recursively run by http-transport:run until sucessful
+;;
+(define (http-transport:try-start-server ipaddrstr portnum)
+  (let ((config-hostname (configf:lookup *configdat* "server" "hostname"))
+	(config-use-proxy (equal? (configf:lookup *configdat* "client" "use-http_proxy") "yes")))
+    (if (not config-use-proxy)
+	(determine-proxy (constantly #f)))
+    (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " ipaddrsstr=" ipaddrstr " portnum=" portnum " config-hostname=" config-hostname)
+    (handle-exceptions
+	exn
+	(begin
+	  ;; (print-error-message exn)
+	  (if (< portnum 64000)
+	      (begin 
+		(debug:print 0 *default-log-port* "WARNING: attempt to start server failed. Trying again ...")
+		(debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
+		(debug:print 5 *default-log-port* "exn=" (condition->list exn))
+		(portlogger:open-run-close portlogger:set-failed portnum)
+		(debug:print 0 *default-log-port* "WARNING: failed to start on portnum: " portnum ", trying next port")
+		(thread-sleep! 0.1)
+		
+		;; get_next_port goes here
+		(http-transport:try-start-server ipaddrstr
+						 (portlogger:open-run-close portlogger:find-port)))
+	      (begin
+		(debug:print 0 *default-log-port* "ERROR: Tried and tried but could not start the server"))))
+      ;; any error in following steps will result in a retry
+      (set! *server-info* (list ipaddrstr portnum))
+      (debug:print 0 *default-log-port* "INFO: Trying to start server on " ipaddrstr ":" portnum)
+      ;; This starts the spiffy server
+      ;; NEED WAY TO SET IP TO #f TO BIND ALL
+      ;; (start-server bind-address: ipaddrstr port: portnum)
+      (if config-hostname ;; this is a hint to bind directly
+	  (start-server port: portnum) ;; bind-address: (if (equal? config-hostname "-")
+					;;		ipaddrstr
+					;;		config-hostname))
+	  (start-server port: portnum))
+      (portlogger:open-run-close portlogger:set-port portnum "released")
+      (debug:print 1 *default-log-port* "INFO: server has been stopped"))))
+
+;;======================================================================
+;; S E R V E R   U T I L I T I E S 
+;;======================================================================
+
+;;======================================================================
+;; C L I E N T S
+;;======================================================================
+
+(define *http-mutex* (make-mutex))
+
+;; NOTE: Large block of code from 32436b426188080f72fceb6894af541fbad9921e removed here
+;;       I'm pretty sure it is defunct.
+
+;; This next block all imported en-mass from the api branch
+(define *http-requests-in-progress* 0)
+(define *http-connections-next-cleanup* (current-seconds))
+
+(define (http-transport:get-time-to-cleanup)
+  (let ((res #f))
+    (mutex-lock! *http-mutex*)
+    (set! res (> (current-seconds) *http-connections-next-cleanup*))
+    (mutex-unlock! *http-mutex*)
+    res))
+
+(define (http-transport:inc-requests-count)
+  (mutex-lock! *http-mutex*)
+  (set! *http-requests-in-progress* (+ 1 *http-requests-in-progress*))
+  ;; Use this opportunity to slow things down iff there are too many requests in flight
+  (if (> *http-requests-in-progress* 5)
+      (begin
+	(debug:print-info 0 *default-log-port* "Whoa there buddy, ease up...")
+	(thread-sleep! 1)))
+  (mutex-unlock! *http-mutex*))
+
+(define (http-transport:dec-requests-count proc) 
+  (mutex-lock! *http-mutex*)
+  (proc)
+  (set! *http-requests-in-progress* (- *http-requests-in-progress* 1))
+  (mutex-unlock! *http-mutex*))
+
+(define (http-transport:dec-requests-count-and-close-all-connections)
+  (set! *http-requests-in-progress* (- *http-requests-in-progress* 1))
+  (let loop ((etime (+ (current-seconds) 5))) ;; give up in five seconds
+    (if (> *http-requests-in-progress* 0)
+	(if (> etime (current-seconds))
+	    (begin
+	      (thread-sleep! 0.05)
+	      (loop etime))
+	    (debug:print-error 0 *default-log-port* "requests still in progress after 5 seconds of waiting. I'm going to pass on cleaning up http connections"))
+	(close-all-connections!)))
+  (set! *http-connections-next-cleanup* (+ (current-seconds) 10))
+  (mutex-unlock! *http-mutex*))
+
+(define (http-transport:inc-requests-and-prep-to-close-all-connections)
+  (mutex-lock! *http-mutex*)
+  (set! *http-requests-in-progress* (+ 1 *http-requests-in-progress*)))
+
+;; Send "cmd" with json payload "params" to serverdat and receive result
+;;
+(define (http-transport:client-api-send-receive run-id runremote cmd params #!key (numretries 3))
+  (assert (remote? runremote) "FATAL: http-transport:client-api-send-receive called with serverdat="serverdat)
+  (let* ((fullurl    (remote-api-req runremote))
+	 (res        (vector #f "uninitialized"))
+	 (success    #t)
+	 (sparams    (db:obj->string params transport: 'http))
+         (server-id  (remote-server-id runremote)))
+       (debug:print-info 11 *default-log-port* "cmd=" cmd " fullurl=" fullurl " server-id=" server-id " current time:" (current-seconds)) 
+
+       ;; set up the http-client here
+       (max-retry-attempts 1)
+       ;; consider all requests indempotent
+       (retry-request? (lambda (request)
+			 #f))
+       ;; send the data and get the response
+       ;; extract the needed info from the http data and 
+       ;; process and return it.
+       (let* ((send-recieve (lambda ()
+			      (mutex-lock! *http-mutex*)
+			      ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines)
+			      ;;					       ((exn http client-error) e (print e)))
+			      (set! res (vector                ;;; DON'T FORGET - THIS IS THE CLIENT SIDE! NOTE: consider moving this to client.scm since we are only supporting http transport at this time.
+					 success
+					 (db:string->obj 
+					  (handle-exceptions
+					      exn
+					      (let ((call-chain (get-call-chain))
+						    (msg        ((condition-property-accessor 'exn 'message) exn)))
+						(set! success #f)
+                                                (if (debug:debug-mode 1)
+                                                    (debug:print-info 0 *default-log-port* "couldn't talk to server, trying again ...")
+                                                    (begin
+                                                      (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".")
+                                                      (debug:print 0 *default-log-port* " message: " msg ", exn=" exn)
+                                                      (debug:print 0 *default-log-port* " cmd: " cmd " params: " params " key:" (or server-id "thekey"))
+                                                      (debug:print 0 *default-log-port* " call-chain: " call-chain)))
+						;; what if another thread is communicating ok? Can't happen due to mutex
+						(http-transport:close-connections runremote)
+						(mutex-unlock! *http-mutex*)
+						;; (close-connection! fullurl)
+						(db:obj->string #f))
+					      (with-input-from-request ;; was dat
+					       fullurl 
+					       (list (cons 'key (or server-id   "thekey"))
+						     (cons 'cmd cmd)
+						     (cons 'params sparams))
+					       read-string))
+					  transport: 'http)
+					 0)) ;; added this speculatively
+			      ;; Shouldn't this be a call to the managed call-all-connections stuff above?
+			      ;; (close-all-connections!) ;; BUG? WHY IS THIS HERE? Are we failing to reuse connections?
+			      (mutex-unlock! *http-mutex*)
+			      ))
+	      (time-out     (lambda ()
+			      (thread-sleep! 45)
+			      (debug:print 0 *default-log-port* "WARNING: send-receive took more than 45 seconds!!")
+			      #f))
+	      (th1 (make-thread send-recieve "with-input-from-request"))
+	      (th2 (make-thread time-out     "time out")))
+	 (thread-start! th1)
+	 (thread-start! th2)
+	 (thread-join! th1)
+          (vector-set! res 0 success)
+	 (thread-terminate! th2)
+	 (if (vector? res)
+	     (if (vector-ref res 0) ;; this is the first flag or the second flag? 
+                 (let* ((res-dat (vector-ref res 1)))
+                    (if (and (string? res-dat) (string-contains res-dat "server-id mismatch"))
+                     (signal (make-composite-condition
+		          (make-property-condition 
+		       'servermismatch
+		       'message  (vector-ref res 1))))       
+		      res)) ;; this is the *inner* vector? seriously? why?
+                 (if (debug:debug-mode 11)
+                     (let ((call-chain (get-call-chain))) ;; note: this code also called in nmsg-transport - consider consolidating it
+                       (print-call-chain (current-error-port))
+                       (debug:print-error 11 *default-log-port* "error above occured at server, res=" res) ;; " message: " ((condition-property-accessor 'exn 'message) exn))
+                       (debug:print 11 *default-log-port* " server call chain:")
+                       (pp (vector-ref res 1) (current-error-port))
+                       (signal (vector-ref res 0)))
+                     res))
+	     (signal (make-composite-condition
+		      (make-property-condition 
+		       'timeout
+		       'message "nmsg-transport:client-api-send-receive-raw timed out talking to server")))))))
+
+;; careful closing of connections stored in *runremote*
+;;
+(define (http-transport:close-connections runremote)
+  (if (remote? runremote)
+      (let ((api-dat (remote-api-uri runremote)))
+	(handle-exceptions
+	    exn
+	  (begin
+	    (print-call-chain *default-log-port*)
+	    (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))
+	  (if (args:any-defined? "-server" "-execute" "-run")
+	      (debug:print-info 0 *default-log-port* "Closing connections to "api-dat))
+	  (if api-dat (close-connection! api-dat))
+
+	  ;; Would it be better to set *runremote* to #f? I don't think so. But we may
+	  ;; need to clear more of the runremote fields
+	  (remote-api-url-set! runremote #f) ;; used as a flag for connection up and running
+	  
+	  #t))
+      #f))
+
+;; run http-transport:keep-running in a parallel thread to monitor that the db is being 
+;; used and to shutdown after sometime if it is not.
+;;
+(define (http-transport:keep-running) 
+  ;; if none running or if > 20 seconds since 
+  ;; server last used then start shutdown
+  ;; This thread waits for the server to come alive
+  (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server")
+  (let* ((servinfofile      #f)
+	 (sdat              #f)
+	 (no-sync-db        (db:open-no-sync-db))
+	 (tmp-area          (common:get-db-tmp-area))
+	 (started-file      (conc tmp-area "/.server-started"))
+	 (server-start-time (current-seconds))
+	 (server-info (let loop ((start-time (current-seconds))
+				 (changed    #t)
+				 (last-sdat  "not this"))
+                        (begin ;; let ((sdat #f))
+			  (thread-sleep! 0.01)
+			  (debug:print-info 0 *default-log-port* "Waiting for server alive signature")
+                          (mutex-lock! *heartbeat-mutex*)
+                          (set! sdat *server-info*)
+                          (mutex-unlock! *heartbeat-mutex*)
+                          (if (and sdat
+				   (not changed)
+				   (> (- (current-seconds) start-time) 2))
+			      (let* ((servinfodir (server:get-servinfo-dir *toppath*)) ;; (conc *toppath*"/.servinfo"))
+				     (ipaddr      (car sdat))
+				     (port        (cadr sdat))
+				     (servinf     (conc servinfodir"/"ipaddr":"port)))
+				(set! servinfofile servinf)
+				(if (not (file-exists? servinfodir))
+				    (create-directory servinfodir #t))
+				(with-output-to-file servinf
+				  (lambda ()
+				    (let* ((serv-id (server:mk-signature)))
+				      (set! *server-id* serv-id)
+				      (print "SERVER STARTED: "ipaddr":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id))
+				      (print "started: "(seconds->year-week/day-time (current-seconds))))))
+				(set! *on-exit-procs* (cons
+						       (lambda ()
+							 (delete-file* servinf))
+						       *on-exit-procs*))
+				;; put data about this server into a simple flat file host.port
+				(debug:print-info 0 *default-log-port* "Received server alive signature")
+				sdat)
+                              (begin
+				(debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat)
+                                (sleep 4)
+				(if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
+                                    (if sdat 
+				      (let* ((ipaddr  (car sdat))
+					   (port    (cadr sdat))
+					   (servinf (conc (server:get-servinfo-dir *toppath*)"/"ipaddr":"port)))
+				        (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server")
+				      (exit))
+                                      (exit)
+                                    )
+				    (loop start-time
+					  (equal? sdat last-sdat)
+					  sdat)))))))
+	 (iface       (car server-info))
+         (port        (cadr server-info))
+         (last-access 0)
+	 (server-timeout (server:expiration-timeout))
+	 (server-going  #f)
+	 (server-log-file (args:get-arg "-log"))) ;; always set when we are a server
+
+    (handle-exceptions
+	exn
+      (debug:print 0 *default-log-port* "Failed to create " started-file ", exn=" exn)
+      (with-output-to-file started-file (lambda ()(print (current-process-id)))))
+
+    (let loop ((count         0)
+	       (server-state 'available)
+	       (bad-sync-count 0)
+	       (start-time     (current-milliseconds)))
+
+      ;; Use this opportunity to sync the tmp db to megatest.db
+      (if (not server-going) ;; *dbstruct-dbs* 
+	  (begin
+	    (debug:print 0 *default-log-port* "SERVER: dbprep")
+	    (set! *dbstruct-dbs*  (db:setup #t)) ;;  run-id)) FIXME!!!
+	    (set! server-going #t)
+	    (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version))) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine.
+	  (if (and no-sync-db
+		   (common:low-noise-print 10 "sync-all")) ;; cheesy way to reduce frequency of running sync :)
+              (begin
+		(if (common:low-noise-print 120 "sync-all-print")
+                    (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S")))
+		(db:all-db-sync *dbstruct-dbs*)
+		)))
+      
+      ;; when things go wrong we don't want to be doing the various queries too often
+      ;; so we strive to run this stuff only every four seconds or so.
+      (let* ((sync-time (- (current-milliseconds) start-time))
+	    (rem-time  (quotient (- 4000 sync-time) 1000)))
+	(if (and (<= rem-time 4)
+		 (>  rem-time 0))
+	    (thread-sleep! rem-time)))
+      
+      (if (< count 1) ;; 3x3 = 9 secs aprox
+	  (loop (+ count 1) 'running bad-sync-count (current-milliseconds)))
+      
+      ;; Check that iface and port have not changed (can happen if server port collides)
+      (mutex-lock! *heartbeat-mutex*)
+      (set! sdat *server-info*)
+      (mutex-unlock! *heartbeat-mutex*)
+      
+      (if (not (equal? sdat (list iface port)))
+	  (let ((new-iface (car sdat))
+		(new-port  (cadr sdat)))
+	    (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info")
+	    (set! iface new-iface)
+	    (set! port  new-port)
+             (if (not *server-id*)
+		 (set! *server-id* (server:mk-signature)))
+	    (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*)
+	    (flush-output *default-log-port*)))
+      
+      ;; Transfer *db-last-access* to last-access to use in checking that we are still alive
+      (mutex-lock! *heartbeat-mutex*)
+      (set! last-access *db-last-access*)
+      (mutex-unlock! *heartbeat-mutex*)
+      
+      (if (common:low-noise-print 120 (conc "server running on " iface ":" port))
+	  (begin
+             (if (not *server-id*)
+		 (set! *server-id* (server:mk-signature)))
+             (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv))   
+	     (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*)
+	     (flush-output *default-log-port*)))
+      (if (common:low-noise-print 60 "dbstats")
+	  (begin
+	    (debug:print 0 *default-log-port* "Server stats:")
+	    (db:print-current-query-stats)))
+      (let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600)))
+	(cond
+         ((and *server-run*
+	       (> (+ last-access server-timeout)
+		  (current-seconds)))
+          (if (common:low-noise-print 120 "server continuing")
+              (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
+	      (let ((curr-time (current-seconds)))
+		(handle-exceptions
+		    exn
+		    (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on info file " servinfofile ". Are you out of space on that disk? exn=" exn)
+		    (if (and ;; (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter
+			     (not *server-overloaded*)
+			     (file-exists? servinfofile))
+			(change-file-times servinfofile curr-time curr-time)))
+		(if (and (common:low-noise-print 120 "start new server")
+			(> *api-process-request-count* 50)) ;; if this server is kind of busy start up another
+		    (begin
+		      (debug:print-info 0 *default-log-port* "Server is busy, api-count "*api-process-request-count*", start another if possible...")
+		      (server:kind-run *toppath*)
+		      (if (> *api-process-request-count* 100)
+			  (begin
+			    (debug:print-info 0 *default-log-port* "Server is overloaded at api-count=" *api-process-request-count*", removing "servinfofile) 
+			    (delete-file* servinfofile)))))))
+          (loop 0 server-state bad-sync-count (current-milliseconds)))
+         (else
+          (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access))
+          (http-transport:server-shutdown port)))))))
+
+(define (http-transport:server-shutdown port)
+  (begin
+    ;;(BB> "http-transport:server-shutdown called")
+    (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id))
+    ;;
+    ;; start_shutdown
+    ;;
+    (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up
+    (portlogger:open-run-close portlogger:set-port port "released")
+    (thread-sleep! 1)
+
+    ;; (debug:print-info 0 *default-log-port* "Max cached queries was    " *max-cache-size*)
+    ;; (debug:print-info 0 *default-log-port* "Number of cached writes   " *number-of-writes*)
+    ;; (debug:print-info 0 *default-log-port* "Average cached write time "
+    ;; 		      (if (eq? *number-of-writes* 0)
+    ;; 			  "n/a (no writes)"
+    ;; 			  (/ *writes-total-delay*
+    ;; 			     *number-of-writes*))
+    ;; 		      " ms")
+    ;; (debug:print-info 0 *default-log-port* "Number non-cached queries "  *number-non-write-queries*)
+    ;; (debug:print-info 0 *default-log-port* "Average non-cached time   "
+    ;; 		      (if (eq? *number-non-write-queries* 0)
+    ;; 			  "n/a (no queries)"
+    ;; 			  (/ *total-non-write-delay* 
+    ;; 			     *number-non-write-queries*))
+    ;; 		      " ms")
+    
+    (db:print-current-query-stats)
+    #;(common:save-pkt `((action . exit)
+                       (T      . server)
+                       (pid    . ,(current-process-id)))
+    *configdat* #t)
+
+    ;; remove .servinfo file(s) here
+    
+    (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting")
+    (exit)))
+
+;; all routes though here end in exit ...
+;;
+;; start_server? 
+;;
+(define (http-transport:launch)
+  ;; check the .servinfo directory, are there other servers running on this
+  ;; or another host?
+  (let* ((server-start-is-ok  (server:minimal-check *toppath*)))
+    (if (not server-start-is-ok)
+	(begin
+	  (debug:print 0 *default-log-port* "ERROR: server start not ok, exiting now.")
+	  (exit 1))))
+    
+  ;; check that a server start is in progress, pause or exit if so
+  (let* ((th2 (make-thread (lambda ()
+                             (debug:print-info 0 *default-log-port* "Server run thread started")
+                             (http-transport:run 
+                              (if (args:get-arg "-server")
+                                  (args:get-arg "-server")
+                                  "-")
+                              )) "Server run"))
+         (th3 (make-thread (lambda ()
+                             (debug:print-info 0 *default-log-port* "Server monitor thread started")
+                             (http-transport:keep-running)
+                             "Keep running"))))
+    (thread-start! th2)
+    (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor.
+    (thread-start! th3)
+    (set! *didsomething* #t)
+    (thread-join! th2)
+    (exit)))
+
+;; (define (http-transport:server-signal-handler signum)
+;;   (signal-mask! signum)
+;;   (handle-exceptions
+;;    exn
+;;    (debug:print 0 *default-log-port* " ... exiting ...")
+;;    (let ((th1 (make-thread (lambda ()
+;; 			     (thread-sleep! 1))
+;; 			   "eat response"))
+;; 	 (th2 (make-thread (lambda ()
+;; 			     (debug:print-error 0 *default-log-port* "Received ^C, attempting clean exit. Please be patient and wait a few seconds before hitting ^C again.")
+;; 			     (thread-sleep! 3) ;; give the flush three seconds to do it's stuff
+;; 			     (debug:print 0 *default-log-port* "       Done.")
+;; 			     (exit 4))
+;; 			   "exit on ^C timer")))
+;;      (thread-start! th2)
+;;      (thread-start! th1)
+;;      (thread-join! th2))))
+
+;;===============================================
+;; Java script
+;;===============================================
+(define (http-transport:show-jquery)
+  (let* ((data  (tests:readlines *java-script-lib*)))
+(string-join data "\n")))
+
+
+
+;;======================================================================
+;; web pages
+;;======================================================================
+
+(define (http-transport:html-test-log $)
+   (let* ((run-id ($ 'runid))
+         (test-item ($ 'testname))
+         (parts (string-split test-item ":"))
+         (test-name (car parts))
+             
+         (item-name (if (equal? (length parts) 1)
+             ""
+             (cadr parts))))
+  ;(print $) 
+(tests:get-test-log run-id test-name item-name)))
+
+
+(define (http-transport:html-dboard $)
+  (let* ((page ($ 'page))
+         (oup       (open-output-string)) 
+         (bdy "--------------------------")
+
+         (ret  (tests:dynamic-dboard page)))
+    (s:output-new  oup  ret)
+   (close-output-port oup)
+
+  (set! bdy   (get-output-string oup))
+     (conc "<h1>Dashboard</h1>" bdy "<br/> <br/> "  )))
+
+(define (http-transport:main-page)
+  (let ((linkpath (root-path)))
+    (conc "<head><h1>" (pathname-strip-directory *toppath*) "</h1></head>"
+	  "<body>"
+	  "Run area: " *toppath*
+	  "<h2>Server Stats</h2>"
+	  (http-transport:stats-table) 
+	  "<hr>"
+	  (http-transport:runs linkpath)
+	  "<hr>"
+	  ;; (http-transport:run-stats)
+	  "</body>"
+	  )))
+
+(define (http-transport:stats-table)
+  (mutex-lock! *heartbeat-mutex*)
+  (let ((res 
+	 (conc "<table>"
+	       ;; "<tr><td>Max cached queries</td>        <td>" *max-cache-size* "</td></tr>"
+	       "<tr><td>Number of cached writes</td>   <td>" *number-of-writes* "</td></tr>"
+	       "<tr><td>Average cached write time</td> <td>" (if (eq? *number-of-writes* 0)
+								 "n/a (no writes)"
+								 (/ *writes-total-delay*
+								    *number-of-writes*))
+	       " ms</td></tr>"
+	       "<tr><td>Number non-cached queries</td> <td>"  *number-non-write-queries* "</td></tr>"
+	       ;; "<tr><td>Average non-cached time</td>   <td>" (if (eq? *number-non-write-queries* 0)
+	       ;; 							 "n/a (no queries)"
+	       ;; 							 (/ *total-non-write-delay* 
+	       ;; 							    *number-non-write-queries*))
+	       " ms</td></tr>"
+	       "<tr><td>Last access</td><td>"              (seconds->time-string *db-last-access*) "</td></tr>"
+	       "</table>")))
+    (mutex-unlock! *heartbeat-mutex*)
+    res))
+
+(define (http-transport:runs linkpath)
+  (conc "<h3>Runs</h3>"
+	(string-intersperse
+	 (let ((files (map pathname-strip-directory (glob (conc linkpath "/*")))))
+	   (map (lambda (p)
+		  (conc "<a href=\"" p "\">" p "</a><br>"))
+		files))
+	 " ")))
+
+#;(define (http-transport:run-stats)
+  (let ((stats (open-run-close db:get-running-stats #f)))
+    (conc "<table>"
+	  (string-intersperse
+	   (map (lambda (stat)
+		  (conc "<tr><td>" (car stat) "</td><td>" (cadr stat) "</td></tr>"))
+		stats)
+	   " ")
+	  "</table>")))
+)

ADDED   attic/index-tree.scm
Index: attic/index-tree.scm
==================================================================
--- /dev/null
+++ attic/index-tree.scm
@@ -0,0 +1,61 @@
+;;======================================================================
+;; Copyright 2006-2013, Matthew Welland.
+;; 
+;; This file is part of Megatest.
+;; 
+;;     Megatest is free software: you can redistribute it and/or modify
+;;     it under the terms of the GNU General Public License as published by
+;;     the Free Software Foundation, either version 3 of the License, or
+;;     (at your option) any later version.
+;; 
+;;     Megatest is distributed in the hope that it will be useful,
+;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;     GNU General Public License for more details.
+;; 
+;;     You should have received a copy of the GNU General Public License
+;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+;;
+;;======================================================================
+
+;;======================================================================
+;; Tests
+;;======================================================================
+
+(use sqlite3 srfi-1 posix regex regex-case srfi-69 dot-locking tcp directory-utils)
+(import (prefix sqlite3 sqlite3:))
+
+(declare (unit tests))
+(declare (uses lock-queue))
+(declare (uses db))
+(declare (uses common))
+(declare (uses items))
+(declare (uses runconfig))
+(declare (uses commonmod))
+(import commonmod)
+
+(include "common_records.scm")
+(include "key_records.scm")
+(include "db_records.scm")
+(include "run_records.scm")
+(include "test_records.scm")
+
+;; Populate the links tree with index.html files
+;;
+;;   - start from most recent tests and work towards oldest -OR-
+;;     start from deepest hierarchy and work way up
+;;   - look up tests in megatest.db
+;;   - cross-reference the tests to stats.db
+;;   - if newer than event_time in stats.db or not registered in stats.db regenerate
+;;   - run du and store in stats.db
+;;   - when all tests at that level done generate next level up index.html
+;; 
+;;     include in rollup html index.html:
+;;          sum of du
+;;          counts of PASS, FAIL, RUNNING, REMOTEHOSTSTART, LAUNCHED, CHECK etc.
+;;          overall status
+;;
+;;     include in test specific index.html:
+;;          host, uname, cpu graph, disk avail graph, steps, data
+;;          meta data, state, status, du
+;;          

ADDED   attic/lock-queue.scm
Index: attic/lock-queue.scm
==================================================================
--- /dev/null
+++ attic/lock-queue.scm
@@ -0,0 +1,258 @@
+;; Copyright 2006-2013, Matthew Welland.
+;; 
+;; This file is part of Megatest.
+;; 
+;;     Megatest is free software: you can redistribute it and/or modify
+;;     it under the terms of the GNU General Public License as published by
+;;     the Free Software Foundation, either version 3 of the License, or
+;;     (at your option) any later version.
+;; 
+;;     Megatest is distributed in the hope that it will be useful,
+;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;     GNU General Public License for more details.
+;; 
+;;     You should have received a copy of the GNU General Public License
+;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+;;
+
+(use (prefix sqlite3 sqlite3:) srfi-18)
+
+(declare (unit lock-queue))
+(declare (uses common))
+(declare (uses debugprint))
+(declare (uses tasks))
+(declare (uses commonmod))
+
+(import commonmod
+	debugprint)
+
+;;======================================================================
+;; attempt to prevent overlapping updates of rollup files by queueing
+;; update requests in an sqlite db
+;;======================================================================
+
+;;======================================================================
+;; db record, <vector db path-to-db>
+;;======================================================================
+
+(define (make-lock-queue:db-dat)(make-vector 3))
+(define-inline (lock-queue:db-dat-get-db        vec)    (vector-ref  vec 0))
+(define-inline (lock-queue:db-dat-get-path      vec)    (vector-ref  vec 1))
+(define-inline (lock-queue:db-dat-set-db!       vec val)(vector-set! vec 0 val))
+(define-inline (lock-queue:db-dat-set-path!     vec val)(vector-set! vec 1 val))
+
+(define (lock-queue:delete-lock-db dbdat)
+  (let ((fname (lock-queue:db-dat-get-path dbdat)))
+    (system (conc "rm -f " fname "*"))))
+
+(define (lock-queue:open-db fname #!key (count 10))
+  (let* ((actualfname (conc fname ".lockdb"))
+	 (dbexists (common:file-exists? actualfname))
+	 (db       (sqlite3:open-database actualfname))
+	 (handler  (make-busy-timeout 136000)))
+    (if dbexists
+	(vector db actualfname)
+	(begin
+	  (handle-exceptions
+	   exn
+	   (begin
+	     (thread-sleep! 10)
+	     (if (> count 0)
+		 (lock-queue:open-db fname count: (- count 1))
+		 (vector db actualfname)))
+	   (sqlite3:with-transaction
+	    db
+	    (lambda ()
+	      (sqlite3:execute 
+	       db
+	       "CREATE TABLE IF NOT EXISTS queue (
+     	         id         INTEGER PRIMARY KEY,
+                 test_id    INTEGER,
+                 start_time INTEGER,
+                 state      TEXT,
+                 CONSTRAINT queue_constraint UNIQUE (test_id));")
+	      (sqlite3:execute
+	       db
+	       "CREATE TABLE IF NOT EXISTS runlocks (
+                 id         INTEGER PRIMARY KEY,
+                 test_id    INTEGER,
+                 run_lock   TEXT,
+                 CONSTRAINT runlock_constraint UNIQUE (run_lock));"))))))
+    (sqlite3:set-busy-handler! db handler)
+    (vector db actualfname)))
+
+(define (lock-queue:set-state dbdat test-id newstate #!key (remtries 10))
+  (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200)
+  (handle-exceptions
+   exn
+   (if (> remtries 0)
+       (begin
+	 (debug:print 0 *default-log-port* "WARNING: exception on lock-queue:set-state. Trying again in 30 seconds.")
+	 (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
+	 (thread-sleep! 30)
+	 (lock-queue:set-state dbdat test-id newstate remtries: (- remtries 1)))
+       (begin
+	 (debug:print-error 0 *default-log-port* " Failed to set lock state for test with id " test-id ", error: " ((condition-property-accessor 'exn 'message) exn) ", giving up.")
+	 #f))
+   (sqlite3:execute (lock-queue:db-dat-get-db dbdat) "UPDATE queue SET state=? WHERE test_id=?;"
+		    newstate
+		    test-id)))
+
+(define (lock-queue:any-younger? dbdat mystart test-id #!key (remtries 10))
+  ;; no need to wait on journal on read only queries
+  ;; (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200)
+  (handle-exceptions
+   exn
+   (if (> remtries 0)
+       (begin
+	 (debug:print 0 *default-log-port* "WARNING: exception on lock-queue:any-younger. Removing lockdb and trying again in 5 seconds.")
+	 (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
+	 (thread-sleep! 5)
+         (lock-queue:delete-lock-db dbdat)
+	 (lock-queue:any-younger? dbdat mystart test-id remtries: (- remtries 1)))
+       (begin
+	 (debug:print-error 0 *default-log-port* " Failed to find younger locks for test with id " test-id ", error: " ((condition-property-accessor 'exn 'message) exn) ", giving up.")
+	 #f))
+   (let ((res #f))
+     (sqlite3:for-each-row
+      (lambda (tid)
+	;; Actually this should not be needed as mystart cannot be simultaneously less than and test-id same as 
+	(if (not (equal? tid test-id)) 
+	    (set! res tid)))
+      (lock-queue:db-dat-get-db dbdat)
+      "SELECT test_id FROM queue WHERE start_time > ?;" mystart)
+     res)))
+
+(define (lock-queue:get-lock dbdat test-id #!key (count 10)(waiting-msg #f))
+  (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 remove: #t waiting-msg: "lock-queue:get-lock, waiting on journal")
+  (let* ((res       #f)
+	 (db        (lock-queue:db-dat-get-db dbdat))
+	 (lckqry    (sqlite3:prepare db "SELECT test_id,run_lock FROM runlocks WHERE run_lock='locked';"))
+	 (mklckqry  (sqlite3:prepare db "INSERT INTO runlocks (test_id,run_lock) VALUES (?,'locked');")))
+    (let ((result 
+	   (handle-exceptions
+	    exn
+	    (begin
+	      (debug:print 0 *default-log-port* "WARNING: failed to get queue lock. Removing lock db and returning fail") ;; Will try again in a few seconds")
+	      (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
+	      (thread-sleep! 10)
+	      ;; (if (> count 0)	
+	      ;;  #f ;; (lock-queue:get-lock dbdat test-id count: (- count 1)) - give up on retries 
+	      ;; (begin ;; never recovered, remote the lock file and return #f, no lock obtained
+	      (lock-queue:delete-lock-db dbdat)
+	      #f)
+	    (sqlite3:with-transaction
+	     db
+	     (lambda ()
+	       (sqlite3:for-each-row (lambda (tid lockstate)
+				       (set! res (list tid lockstate)))
+				     lckqry)
+	       (if res
+		   (if (equal? (car res) test-id)
+		       #t ;; already have the lock
+		       #f)
+		   (begin
+		     (sqlite3:execute mklckqry test-id)
+		     ;; if no error handled then return #t for got the lock
+		     #t)))))))
+      (sqlite3:finalize! lckqry)
+      (sqlite3:finalize! mklckqry)
+      result)))
+
+(define (lock-queue:release-lock fname test-id #!key (count 10))
+  (let* ((dbdat (lock-queue:open-db fname)))
+    (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 "lock-queue:release-lock; waiting on journal")
+    (handle-exceptions
+     exn
+     (begin
+       (debug:print 0 *default-log-port* "WARNING: Failed to release queue lock. Will try again in few seconds")
+       (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
+       (thread-sleep! (/ count 10))
+       (if (> count 0)
+	   (begin
+	     (sqlite3:finalize! (lock-queue:db-dat-get-db dbdat))
+	     (lock-queue:release-lock fname test-id count: (- count 1)))
+	   (let ((journal (conc fname "-journal")))
+	     ;; If we've tried ten times and failed there is a serious problem
+	     ;; try to remove the lock db and allow it to be recreated
+	     (handle-exceptions
+	      exn
+	      #f
+	      (if (common:file-exists? journal)(delete-file journal))
+	      (if (common:file-exists? fname)  (delete-file fname))
+	      #f))))
+     (sqlite3:execute (lock-queue:db-dat-get-db dbdat) "DELETE FROM runlocks WHERE test_id=?;" test-id)
+     (sqlite3:finalize! (lock-queue:db-dat-get-db dbdat)))))
+
+(define (lock-queue:steal-lock dbdat test-id #!key (count 10))
+  (debug:print-info 0 *default-log-port* "Attempting to steal lock at " (lock-queue:db-dat-get-path dbdat))
+  (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 "lock-queue:steal-lock; waiting on journal")
+  (handle-exceptions
+   exn
+   (begin
+     (debug:print 0 *default-log-port* "WARNING: Failed to steal queue lock. Will try again in few seconds")
+     (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
+     (thread-sleep! 10)
+     (if (> count 0)
+	 (lock-queue:steal-lock dbdat test-id count: (- count 1))
+	 #f))
+   (sqlite3:execute (lock-queue:db-dat-get-db dbdat) "DELETE FROM runlocks WHERE run_lock='locked';"))
+  (lock-queue:get-lock dbdat test-it))
+
+;; returns #f if ok to skip the task
+;; returns #t if ok to proceed with task
+;; otherwise waits
+;;
+(define (lock-queue:wait-turn fname test-id #!key (count 10)(waiting-msg #f))
+  (let* ((dbdat   (lock-queue:open-db fname))
+	 (mystart (current-seconds))
+	 (db      (lock-queue:db-dat-get-db dbdat)))
+    ;; (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 waiting-msg: "lock-queue:wait-turn; waiting on journal file")
+    (handle-exceptions
+     exn
+     (begin
+       (debug:print 0 *default-log-port* "WARNING: Failed to find out if it is ok to skip the wait queue. Will try again in few seconds")
+       (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
+       (print-call-chain (current-error-port))
+       (thread-sleep! 10)
+       (if (> count 0)
+	   (begin
+	     (sqlite3:finalize! db)
+	     (lock-queue:wait-turn fname test-id count: (- count 1)))
+	   (begin
+	     (debug:print 0 *default-log-port* "Giving up calls to lock-queue:wait-turn for test-id " test-id " at path " fname ", printing call chain")
+	     (print-call-chain (current-error-port))
+	     #f)))
+     ;; wait 10 seconds and then check to see if someone is already updating the html
+     (thread-sleep! 10)
+     (if (not (lock-queue:any-younger? dbdat mystart test-id)) ;; no processing in flight, must try to start processing
+	 (begin
+	   (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 waiting-msg: "lock-queue:wait-turn; waiting on journal file")
+	   (sqlite3:execute
+	    db
+	    "INSERT OR REPLACE INTO queue (test_id,start_time,state) VALUES (?,?,'waiting');"
+	    test-id mystart)
+	   ;; (thread-sleep! 1) ;; give other tests a chance to register
+	   (let ((result 
+		  (let loop ((younger-waiting (lock-queue:any-younger? dbdat mystart test-id)))
+		    (if younger-waiting
+			(begin
+			  ;; no need for us to wait. mark in the lock queue db as skipping
+			  ;; no point in marking anything in the queue - simply never register this
+			  ;; test as it is *covered* by a previously started update to the html file
+			  ;; (lock-queue:set-state dbdat test-id "skipping")
+			  #f) ;; let the calling process know that nothing needs to be done
+			(if (lock-queue:get-lock dbdat test-id)
+			    #t
+			    (if (> (- (current-seconds) mystart) 36000) ;; waited too long, steal the lock
+				(lock-queue:steal-lock dbdat test-id)
+				(begin
+				  (thread-sleep! 1)
+				  (loop (lock-queue:any-younger? dbdat mystart test-id)))))))))
+	     (sqlite3:finalize! db)
+	     result))))))
+	  
+            
+;; (use trace)
+;; (trace lock-queue:get-lock lock-queue:release-lock lock-queue:wait-turn lock-queue:any-younger? lock-queue:set-state)

ADDED   attic/mlaunch.scm
Index: attic/mlaunch.scm
==================================================================
--- /dev/null
+++ attic/mlaunch.scm
@@ -0,0 +1,35 @@
+;; Copyright 2006-2014, Matthew Welland.
+;; 
+;; This file is part of Megatest.
+;; 
+;;     Megatest is free software: you can redistribute it and/or modify
+;;     it under the terms of the GNU General Public License as published by
+;;     the Free Software Foundation, either version 3 of the License, or
+;;     (at your option) any later version.
+;; 
+;;     Megatest is distributed in the hope that it will be useful,
+;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;     GNU General Public License for more details.
+;; 
+;;     You should have received a copy of the GNU General Public License
+;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+
+;;  strftime('%m/%d/%Y %H:%M:%S','now','localtime')
+
+;;======================================================================
+;; MLAUNCH
+;;
+;;   take jobs from the given queue and keep launching them keeping
+;;   the cpu load at the targeted level
+;;
+;;======================================================================
+
+(use sqlite3 srfi-1 posix regex regex-case srfi-69 format)
+
+(declare (unit mlaunch))
+(declare (uses db))
+(declare (uses common))
+(declare (uses commonmod))
+(import commonmod)
+

ADDED   attic/portlogger-example.scm
Index: attic/portlogger-example.scm
==================================================================
--- /dev/null
+++ attic/portlogger-example.scm
@@ -0,0 +1,21 @@
+;;  Copyright 2006-2017, Matthew Welland.
+;; 
+;; This file is part of Megatest.
+;; 
+;;     Megatest is free software: you can redistribute it and/or modify
+;;     it under the terms of the GNU General Public License as published by
+;;     the Free Software Foundation, either version 3 of the License, or
+;;     (at your option) any later version.
+;; 
+;;     Megatest is distributed in the hope that it will be useful,
+;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;     GNU General Public License for more details.
+;; 
+;;     You should have received a copy of the GNU General Public License
+;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+
+
+(declare (uses portlogger))
+
+(print (apply portlogger:main (cdr (argv))))

ADDED   attic/synchash.scm
Index: attic/synchash.scm
==================================================================
--- /dev/null
+++ attic/synchash.scm
@@ -0,0 +1,137 @@
+;;======================================================================
+;; Copyright 2006-2012, Matthew Welland.
+;; 
+;; This file is part of Megatest.
+;; 
+;;     Megatest is free software: you can redistribute it and/or modify
+;;     it under the terms of the GNU General Public License as published by
+;;     the Free Software Foundation, either version 3 of the License, or
+;;     (at your option) any later version.
+;; 
+;;     Megatest is distributed in the hope that it will be useful,
+;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;     GNU General Public License for more details.
+;; 
+;;     You should have received a copy of the GNU General Public License
+;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+;;
+;;======================================================================
+
+;;======================================================================
+;; A hash of hashes that can be kept in sync by sending minial deltas
+;;======================================================================
+
+(use format)
+(use srfi-1 srfi-69 sqlite3)
+(import (prefix sqlite3 sqlite3:))
+
+(declare (unit synchash))
+(declare (uses db))
+(declare (uses server))
+(declare (uses rmtmod))
+
+(include "db_records.scm")
+
+(import rmtmod)
+
+(define (synchash:make)
+   (make-hash-table))
+
+;; given an alist of objects '((id obj) ...) 
+;;   1. remove unchanged objects from the list
+;;   2. create a list of removed objects by id
+;;   3. remove removed objects from synchash
+;;   4. replace or add new or changed objects to synchash
+;;
+(define (synchash:get-delta indat synchash)
+  (let ((deleted '())
+	(changed '())
+	(found   '())
+	(orig-keys (hash-table-keys synchash)))
+    (for-each
+     (lambda (item)
+       (let* ((id  (car  item))
+	      (dat (cadr item))
+	      (ref (hash-table-ref/default synchash id #f)))
+	 (if (not (equal? dat ref)) ;; item changed or new
+	     (begin
+	       (set! changed (cons item changed))
+	       (hash-table-set! synchash id dat)))
+	 (set! found (cons id found))))
+     indat)
+    (for-each 
+     (lambda (id)
+       (if (not (member id found))
+	   (begin
+	     (set! deleted (cons id deleted))
+	     (hash-table-delete! synchash id))))
+     orig-keys)
+    (list changed deleted)
+    ;; (list indat '()) ;; just for debugging
+    ))
+    
+;; keynum => the field to use as the unique key (usually 0 but can be other field)
+;;
+(define (synchash:client-get proc synckey keynum synchash run-id . params)
+  (let* ((data   (rmt:synchash-get run-id proc synckey keynum params))
+	 (newdat (car data))
+	 (removs (cadr data))
+	 (myhash (hash-table-ref/default synchash synckey #f)))
+    (if (not myhash)
+	(begin
+	  (set! myhash (make-hash-table))
+	  (hash-table-set! synchash synckey myhash)))
+    (for-each 
+     (lambda (item)
+       (let ((id  (car item))
+	     (dat (cadr item)))
+	 ;; (debug:print-info 2 *default-log-port* "Processing item: " item)
+	 (hash-table-set! myhash id dat)))
+     newdat)
+    (for-each
+     (lambda (id)
+       (hash-table-delete! myhash id))
+     removs)
+    ;; WHICH ONE!?
+    ;; data)) ;; return the changed and deleted list
+    (list newdat removs))) ;; synchash))
+
+(define *synchashes* (make-hash-table))
+
+(define (synchash:server-get dbstruct run-id proc synckey keynum params)
+  ;; (debug:print-info 2 *default-log-port* "synckey: " synckey ", keynum: " keynum ", params: " params)
+  (let* ((dbdat     (db:get-db dbstruct run-id))
+	 (db        (db:dbdat-get-db dbdat))
+	 (synchash  (hash-table-ref/default *synchashes* synckey #f))
+	 (newdat    (apply (case proc
+			     ((db:get-runs)                   db:get-runs)
+			     ((db:get-tests-for-run-mindata)  db:get-tests-for-run-mindata)
+			     ((db:get-test-info-by-ids)       db:get-test-info-by-ids)
+			     (else
+			      (print "ERROR: sync for hash " proc " not setup! Edits needed in synchash.scm")
+			      print))
+			   db params))
+	 (postdat  #f)
+	 (make-indexed (lambda (x)
+			 (list (vector-ref x keynum) x))))
+    ;; Now process newdat based on the query type
+    (set! postdat (case proc
+		    ((db:get-runs)
+		     ;; (debug:print-info 2 *default-log-port* "Get runs call")
+		     (let ((header (vector-ref newdat 0))
+			   (data   (vector-ref newdat 1)))
+		       ;; (debug:print-info 2 *default-log-port* "header: " header ", data: " data)
+		       (cons (list "header" header)         ;; add the header keyed by the word "header"
+			     (map make-indexed data))))        ;; add each element keyed by the keynum'th val
+		    (else 
+		     ;; (debug:print-info 2 *default-log-port* "Non-get runs call")
+		     (map make-indexed newdat))))
+    ;; (debug:print-info 2 *default-log-port* "postdat: " postdat)
+    ;; (if (not indb)(sqlite3:finalize! db))
+    (if (not synchash)
+	(begin
+	  (set! synchash (make-hash-table))
+	  (hash-table-set! *synchashes* synckey synchash)))
+    (synchash:get-delta postdat synchash)))
+

DELETED client.scm
Index: client.scm
==================================================================
--- client.scm
+++ /dev/null
@@ -1,128 +0,0 @@
-
-;; Copyright 2006-2012, Matthew Welland.
-;; 
-;; This file is part of Megatest.
-;; 
-;;     Megatest is free software: you can redistribute it and/or modify
-;;     it under the terms of the GNU General Public License as published by
-;;     the Free Software Foundation, either version 3 of the License, or
-;;     (at your option) any later version.
-;; 
-;;     Megatest is distributed in the hope that it will be useful,
-;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
-;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;;     GNU General Public License for more details.
-;; 
-;;     You should have received a copy of the GNU General Public License
-;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
-
-;;======================================================================
-;; C L I E N T S
-;;======================================================================
-
-(use srfi-18 extras tcp s11n srfi-1 posix regex srfi-69 hostinfo md5
-     message-digest matchable spiffy uri-common intarweb http-client
-     spiffy-request-vars uri-common intarweb directory-utils)
-
-(declare (unit client))
-
-(declare (uses common))
-(declare (uses db))
-(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running.
-
-(include "common_records.scm")
-(include "db_records.scm")
-
-;; client:get-signature
-(define (client:get-signature)
-  (if *my-client-signature* *my-client-signature*
-      (let ((sig (conc (get-host-name) " " (current-process-id))))
-	(set! *my-client-signature* sig)
-	*my-client-signature*)))
-
-;; Not currently used! But, I think it *should* be used!!!
-#;(define (client:logout serverdat)
-  (let ((ok (and (socket? serverdat)
-		 (cdb:logout serverdat *toppath* (client:get-signature)))))
-    ok))
-
-(define (client:setup areapath #!key (remaining-tries 100) (failed-connects 0))
-  (client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects))
-
-;; Do all the connection work, look up the transport type and set up the
-;; connection if required.
-;;
-;; There are two scenarios. 
-;;   1. We are a test manager and we received *transport-type* and *runremote* via cmdline
-;;   2. We are a run tests, list runs or other interactive process and we must figure out
-;;      *transport-type* and *runremote* from the monitor.db
-;;
-;; client:setup
-;;
-;; lookup_server, need to remove *runremote* stuff
-;;
- 
-(define (client:setup-http areapath #!key (remaining-tries 100) (failed-connects 0)(area-dat #f))
-  (mutex-lock! *rmt-mutex*)
-  (let ((res (client:setup-http-baby areapath remaining-tries: remaining-tries failed-connects: failed-connects area-dat: area-dat)))
-    (mutex-unlock! *rmt-mutex*)
-    res))
-
-(define (client:setup-http-baby areapath #!key (remaining-tries 100) (failed-connects 0)(area-dat #f))
-  (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries)
-  (server:start-and-wait areapath)
-  (if (<= remaining-tries 0)
-      (begin
-	(debug:print-error 0 *default-log-port* "failed to start or connect to server")
-	(exit 1))
-      ;;
-      ;; Alternatively here, we can get the list of candidate servers and work our way
-      ;; through them searching for a good one.
-      ;;
-      (let* ((server-dat (server:choose-server areapath 'best))
-	     (runremote  (or area-dat *runremote*)))
-	(if (not server-dat) ;; no server found
-	    (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))
-	    (match server-dat
-	      ((host port start-time server-id pid)
-	       (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
-	       (if (and (not area-dat)
-			(not *runremote*))
-                   (begin       
-		     (set! *runremote* (make-remote))
-                     (let* ((server-info (remote-server-info *runremote*))) 
-                       (if server-info
-                           (begin
-                             (remote-server-url-set! *runremote* (server:record->url server-info))
-                             (remote-server-id-set! *runremote* (server:record->id server-info)))))))
-	       (if (and host port server-id)
-		   (let* ((start-res (http-transport:client-connect host port server-id))
-			  (ping-res  (rmt:login-no-auto-client-setup start-res)))
-		     (if (and start-res
-			      ping-res)
-			 (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago
-			   (if runremote
-			       (begin
-				 (remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res)
-				 (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res))
-				 start-res)
-			       (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))))
-			 (begin    ;; login failed but have a server record, clean out the record and try again
-			   (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid.  Fixes part of Randy;s ticket 1405717332
-			   (case *transport-type* 
-			     ((http)(http-transport:close-connections)))
-                           (if *runremote* 
-			       (remote-conndat-set! runremote #f)  ;; (hash-table-delete! runremote run-id)
-                               )
-			   (thread-sleep! 1)
-			   (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))
-			   )))
-		   (begin    ;; no server registered
-		     ;; (server:kind-run areapath)
-		     (server:start-and-wait areapath)
-		     (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries)
-		     (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
-		     (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)))))
-	      (else
-	       (debug:print 0 *default-log-port* "ERROR: malformed server-dat="server-dat)))))))
-

Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -15,23 +15,32 @@
 ;; 
 ;;     You should have received a copy of the GNU General Public License
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;======================================================================
+
+(declare (unit common))
+(declare (uses commonmod))
+(declare (uses rmtmod))
+(declare (uses debugprint))
+(declare (uses mtargs))
 
 (use srfi-1 data-structures posix regex-case (prefix base64 base64:)
      format dot-locking csv-xml z3 udp ;; sql-de-lite
      hostinfo md5 message-digest typed-records directory-utils stack
      matchable regex posix (srfi 18) extras ;; tcp 
      (prefix nanomsg nmsg:)
      (prefix sqlite3 sqlite3:)
      pkts (prefix dbi dbi:)
      )
+(use posix-extras pathname-expand files)
 
-(declare (unit common))
-(declare (uses commonmod))
-(import commonmod)
+
+(import commonmod
+	debugprint
+	rmtmod
+	(prefix mtargs args:))
 
 (include "common_records.scm")
 
 
 ;; (require-library margs)
@@ -47,12 +56,14 @@
 (define (stop-the-train)
   (thread-start! (make-thread (lambda ()
 				(let loop ()
 				  (if (and *toppath*
 					   (file-exists? (conc *toppath*"/stop-the-train")))
-				      (begin
-					(debug:print 0 *default-log-port* "ERROR: found file "*toppath*"/stop-the-train, exiting immediately")
+				      (let* ((msg (conc "ERROR: found file "*toppath*"/stop-the-train, exiting immediately")))
+					;; yes, print to current-output-port AND *default-log-port*, annoying but necessary I think
+					(print msg)
+					(debug:print 0 *default-log-port* msg)
 					(exit 1)))
 				  (thread-sleep! 5)
 				  (loop))))))
 
 ;; execute thunk, return value.  If exception thrown, trap exception, return #f, and emit nonfatal condition note to *default-log-port* .
@@ -161,19 +172,16 @@
 ;; (define *db-multi-sync-mutex* (make-mutex))      ;; protect access to *db-sync-in-progress*, *db-last-sync*
 ;; task db
 (define *task-db*             #f) ;; (vector db path-to-db)
 (define *db-access-allowed*   #t) ;; flag to allow access
 ;; (define *db-access-mutex*     (make-mutex)) ;; moved to dbfile
-(define *db-transaction-mutex* (make-mutex))
+;; (define *db-transaction-mutex* (make-mutex))
 (define *db-cache-path*       #f)
 ;; (define *db-with-db-mutex*    (make-mutex))
 (define *db-api-call-time*    (make-hash-table)) ;; hash of command => (list of times)
-;; no sync db
-;; (define *no-sync-db*          #f)  ;; moved to dbfile
 
 ;; SERVER
-(define *my-client-signature* #f)
 (define *transport-type*    'http)             ;; override with [server] transport http|rpc|nmsg
 (define *runremote*         #f)                ;; if set up for server communication this will hold <host port>
 ;; (define *max-cache-size*    0)
 (define *logged-in-clients* (make-hash-table))
 (define *server-id*         #f)
@@ -210,12 +218,10 @@
 
 ;; Miscellaneous
 (define *triggers-mutex*     (make-mutex))     ;; block overlapping processing of triggers
 (define *numcpus-cache* (make-hash-table))
 
-(use posix-extras pathname-expand files)
-
 ;; this plugs a hole in posix-extras in recent chicken versions > 4.9)
 (let-values (( (chicken-release-number chicken-major-version)
                (apply values
                       (map string->number
                            (take
@@ -246,36 +252,13 @@
 (define *common:this-exe-dir*      (pathname-directory *common:this-exe-fullpath*))
 (define *common:this-exe-name*     (pathname-strip-directory *common:this-exe-fullpath*))
 
 (define (common:get-sync-lock-filepath)
   (let* ((tmp-area     (common:get-db-tmp-area))
-         (lockfile     (conc tmp-area "/megatest.db.sync-lock")))
+         (lockfile     (conc tmp-area "/megatest.db.lock")))
     lockfile))
 
-;;======================================================================
-;; when called from a wrapper I need sometimes to find the calling
-;; wrapper, this is for dashboard to find the correct megatest.
-;;
-(define (common:find-local-megatest #!optional (progname "megatest"))
-  (let ((res (filter file-exists?
-		     (map (lambda (updir)
-			    (let* ((lm  (car (argv)))
-				   (dir (pathname-directory lm))
-				   (exe (pathname-strip-directory lm)))
-			      (conc (if dir (conc dir "/") "")
-				    (case (string->symbol exe)
-				      ((dboard)    (conc updir progname))
-				      ((mtest)     (conc updir progname))
-				      ((dashboard) progname)
-				      (else exe)))))
-			  '("../../" "../")))))
-    (if (null? res)
-	(begin
-	  (debug:print 0 *default-log-port* "Failed to find this executable! Using what can be found on the path")
-	  progname)
-	(car res))))
-
 (define *common:logpro-exit-code->status-sym-alist*
   '( ( 0 . pass )
      ( 1 . fail )
      ( 2 . warn )
      ( 3 . check )
@@ -315,26 +298,39 @@
     (else "FAIL")))
 
 (define (common:logpro-exit-code->test-status exit-code)
   (status-sym->string (common:logpro-exit-code->status-sym exit-code)))
 
+;; 
 (defstruct remote
+
+  ;; transport to be used
+  ;; http              - use http-transport
+  ;; http-read-cached  - use http-transport for writes but in-mem cached for reads
+  (rmode            'http)
   (hh-dat            (let ((res (or (server:choose-server *toppath* 'homehost)
 				    (cons #f #f))))
 		       (assert (pair? res)(conc "FATAL: hh-dat should be a pair, got "res))
 		       res))
   (server-url        #f) ;; (server:check-if-running *toppath*) #f))
   (server-id         #f)
-  (server-info       (if *toppath* (server:check-if-running *toppath*) #f))
+  (server-info       #f) ;; (if *toppath* (server:check-if-running *toppath*) #f))
   (last-server-check 0)  ;; last time we checked to see if the server was alive
-  (connect-time      (current-seconds))
-  (conndat           #f)
-  (transport         *transport-type*)
+  (connect-time      (current-seconds)) ;; when we first connected
+  (last-access       (current-seconds)) ;; last time we talked to server
+  ;; (conndat           #f) ;; iface port api-uri api-url api-req seconds server-id
   (server-timeout    (server:expiration-timeout))
   (force-server      #f)
   (ro-mode           #f)  
-  (ro-mode-checked   #f)) ;; flag that indicates we have checked for ro-mode
+  (ro-mode-checked   #f) ;; flag that indicates we have checked for ro-mode
+
+  ;; conndat stuff
+  (iface             #f) ;; TODO: Consolidate this data with server-url and server-info above
+  (port              #f)
+  (api-url           #f)
+  (api-uri           #f)
+  (api-req           #f))
 
 ;; launching and hosts
 (defstruct host
   (reachable    #f)
   (last-update  0)
@@ -408,27 +404,48 @@
 
 (define (common:version-changed?)
   (not (equal? (common:get-last-run-version)
                (common:version-signature))))
 
+
+;; From 1.70 to 1.80, db's are compatible.
+
 (define (common:api-changed?)
-  (not (equal? (substring (->string megatest-version) 0 4)
-               (substring (conc (common:get-last-run-version)) 0 4))))
+  (let* (
+    (megatest-major-version (substring (->string megatest-version) 0 4))
+    (run-major-version (substring (conc (common:get-last-run-version)) 0 4))
+   )
+   (and (not (equal? megatest-major-version "1.80"))
+     (not (equal? megatest-major-version megatest-run-version)))
+  )
+)
 
 ;;======================================================================
 ;; Move me elsewhere ...
 ;; RADT => Why do we meed the version check here, this is called only if version misma
 ;;
 (define (common:cleanup-db dbstruct #!key (full #f))
-  (apply db:multi-db-sync 
-   dbstruct
-   'schema
-   'killservers
-   'adj-target
-   'new2old
-   '(dejunk)
-  )
+  (case (rmt:transport-mode)
+    ((http)
+     (apply db:multi-db-sync 
+	    dbstruct
+	    'schema
+	    'killservers
+	    'adj-target
+	    'new2old
+	    '(dejunk)
+	    ))
+    ((tcp nfs)
+     (debug:print 0 *default-log-port* "WARNING: cleanup-db NOT implemented yet for tcp and nfs.")
+     #;(apply db:multi-db-sync 
+	    dbstruct
+	    'schema
+	    'killservers
+	    'adj-target
+	    'new2old
+	    '(dejunk)
+	    )))
   (if (common:api-changed?)
       (common:set-last-run-version)))
 
 (define (common:snapshot-file filepath #!key (subdir  ".") )
   (if (file-exists? filepath)
@@ -520,11 +537,11 @@
 (define (common:rotate-logs)
   (let* ((all-files (make-hash-table))
 	 (stats     (make-hash-table))
 	 (inc-stat  (lambda (key)
 		      (hash-table-set! stats key (+ (hash-table-ref/default stats key 0) 1))))
-	(max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age
+	(max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "600")))) ;; name -> age
     (if (not (directory-exists? "logs"))(create-directory "logs"))
     (directory-fold 
      (lambda (file rem)
        (handle-exceptions
 	exn
@@ -599,14 +616,15 @@
 ;;======================================================================
 ;; Force a megatest cleanup-db if version is changed and skip-version-check not specified
 ;; Do NOT check if not on homehost!
 ;;
 (define (common:exit-on-version-changed)
-  (if (common:on-homehost?)
+  (if (and *toppath*              ;; do nothing if *toppath* not yet provided
+	   (common:on-homehost?))
       (if (common:api-changed?)
 	  (let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config"))
-                 (dbfile  (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.db"))
+                 (dbfile  (conc (get-environment-variable "MT_RUN_AREA_HOME") ".mtdb/main.db"))
                  (read-only (not (file-write-access? dbfile)))
                  (dbstruct (db:setup #t))) ;; (db:setup-db *dbstruct-dbs* *toppath* #f))) ;;  #t)))
 	    (debug:print 0 *default-log-port*
 			 "WARNING: Version mismatch!\n"
 			 "   expected: " (common:version-signature) "\n"
@@ -626,14 +644,14 @@
                (common:cleanup-db dbstruct)))
              ((not (common:file-exists? mtconf))
               (debug:print 0 *default-log-port* "   megatest.config does not exist in this area.  Cannot proceed with megatest version migration.")
               (exit 1))
              ((not (common:file-exists? dbfile))
-              (debug:print 0 *default-log-port* "   megatest.db does not exist in this area.  Cannot proceed with megatest version migration.")
+              (debug:print 0 *default-log-port* "   .mtdb/main.db does not exist in this area.  Cannot proceed with megatest version migration.")
               (exit 1))
              ((not (eq? (current-user-id)(file-owner mtconf)))
-              (debug:print 0 *default-log-port* "   You do not own megatest.db in this area.  Cannot proceed with megatest version migration.")
+              (debug:print 0 *default-log-port* "   You do not own .mtdb/main.db in this area.  Cannot proceed with megatest version migration.")
               (exit 1))
              (read-only
               (debug:print 0 *default-log-port* "   You have read-only access to this area.  Cannot proceed with megatest version migration.")
               (exit 1))
              (else
@@ -710,22 +728,21 @@
 	""))))
 
 (define (common:alist-ref/default key alist default)
   (or (alist-ref key alist) default))
 
-(define (common:low-noise-print waitval . keys)
-  (let* ((key      (string-intersperse (map conc keys) "-" ))
-	 (lasttime (hash-table-ref/default *common:denoise* key 0))
-	 (currtime (current-seconds)))
-    (if (> (- currtime lasttime) waitval)
-	(begin
-	  (hash-table-set! *common:denoise* key currtime)
-	  #t)
-	#f)))
-
-(define (common:get-megatest-exe)
-  (or (getenv "MT_MEGATEST") "megatest"))
+;; moved into commonmod
+;;
+;; (define (common:low-noise-print waitval . keys)
+;;   (let* ((key      (string-intersperse (map conc keys) "-" ))
+;; 	 (lasttime (hash-table-ref/default *common:denoise* key 0))
+;; 	 (currtime (current-seconds)))
+;;     (if (> (- currtime lasttime) waitval)
+;; 	(begin
+;; 	  (hash-table-set! *common:denoise* key currtime)
+;; 	  #t)
+;; 	#f)))
 
 (define (common:read-encoded-string instr)
   (handle-exceptions
    exn
    (handle-exceptions
@@ -946,49 +963,45 @@
 	  (handle-exceptions
 	      exn
 	      (begin
 		(debug:print-error 0 *default-log-port* "Couldn't create path to " *db-cache-path* ", exn=" exn)
 		(exit 1))
-	      (let* ((tsname (common:get-testsuite-name))
+	      (let* ((toppath (common:real-path *toppath*))
+		     (tsname (common:get-testsuite-name))
 		     (dbpath (common:get-create-writeable-dir
 			      (list (conc "/tmp/" (current-user-name)
 					  "/megatest_localdb/"
 					  tsname "/"
-					  (string-translate *toppath* "/" "."))
+					  (string-translate toppath "/" "."))
 				    (conc "/tmp/" (current-process-id) ;; just in case we have an issue with the dir by own user name
 					  "/megatest_localdb/"
 					  tsname
-					  (string-translate *toppath* "/" "."))
+					  (string-translate toppath "/" "."))
 				    ))))
 		(set! *db-cache-path* dbpath)
-		;; ensure megatest area has .megatest
-		(let ((dbarea (conc *toppath* "/.megatest")))
+		;; ensure megatest area has .mtdb
+		(let ((dbarea (conc *toppath* "/.mtdb")))
 		  (if (not (file-exists? dbarea))
 		      (create-directory dbarea)))
-		;; ensure tmp area has .megatest
-		(let ((dbarea (conc dbpath "/.megatest")))
+		;; ensure tmp area has .mtdb
+		(let ((dbarea (conc dbpath "/.mtdb")))
 		  (if (not (file-exists? dbarea))
 		      (create-directory dbarea)))
 		dbpath))
 	  #f)))
 
 (define (common:get-area-path-signature)
   (message-digest-string (md5-primitive) *toppath*))
 
-(define (common:get-signature str)
-  (message-digest-string (md5-primitive) str))
-
 ;;======================================================================
 ;; E X I T   H A N D L I N G
 ;;======================================================================
 
 (define (common:run-sync?)
-    (and (common:on-homehost?)
-	 (args:get-arg "-server")))
-
-(define (common:human-time)
-  (time->string (seconds->local-time (current-seconds)) "%Y-%m-%d %H:%M:%S"))
+  (and *toppath*               ;; gate if called before *toppath* is set
+       (common:on-homehost?)
+       (args:get-arg "-server")))
 
 
 (define (std-signal-handler signum)
   ;; (signal-mask! signum)
   (set! *time-to-exit* #t) 
@@ -1048,26 +1061,10 @@
 (define (common:get-disks #!key (configf #f))
   (hash-table-ref/default 
    (or configf (read-config "megatest.config" #f #t))
    "disks" '("none" "")))
 
-;;======================================================================
-;; return first command that exists, else #f
-;;
-(define (common:which cmds)
-  (if (null? cmds)
-      #f
-      (let loop ((hed (car cmds))
-		 (tal (cdr cmds)))
-	(let ((res (with-input-from-pipe (conc "which " hed) read-line)))
-	  (if (and (string? res)
-		   (common:file-exists? res))
-	      res
-	      (if (null? tal)
-		  #f
-		  (loop (car tal)(cdr tal))))))))
-  
 (define (common:get-install-area)
   (let ((exe-path (car (argv))))
     (if (common:file-exists? exe-path)
 	(handle-exceptions
 	 exn
@@ -1345,11 +1342,11 @@
 			  (else
 			   (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.")
 			   #t)))) ;; default to requiring server
     (if force-result
 	(begin
-	  (debug:print-info 0 *default-log-port* "forcing use of server, force setting is \"" force-setting "\".")
+	  (debug:print-info 0 *default-log-port* "ATTENTION! Forcing use of server, force setting is \"" force-setting "\".")
 	  #t)
 	#f)))
 
 ;;======================================================================
 ;; M I S C   L I S T S
@@ -1598,10 +1595,30 @@
 	path) ;; just give up
     (with-input-from-pipe
 	(conc "/bin/readlink -f " path)
       (lambda ()
 	(read-line)))))
+
+;; for reasons I don't understand multiple calls to real-path in parallel threads
+;; must be protected by mutexes
+;;
+(define (common:real-path inpath)
+  ;; (process:cmd-run-with-stderr->list "readlink" "-f" inpath)) ;; cmd . params)
+  ;; (let-values 
+  ;;  (((inp oup pid) (process "readlink" (list "-f" inpath))))
+  ;;  (with-input-from-port inp
+  ;;    (let loop ((inl (read-line))
+  ;;       	(res #f))
+  ;;      (print "inl=" inl)
+  ;;      (if (eof-object? inl)
+  ;;          (begin
+  ;;            (close-input-port inp)
+  ;;            (close-output-port oup)
+  ;;            ;; (process-wait pid)
+  ;;            res)
+  ;;          (loop (read-line) inl))))))
+  (with-input-from-pipe (conc "readlink -f " inpath) read-line))
 
 ;;======================================================================
 ;; returns *effective load* (not normalized)
 ;;
 (define (common:get-intercept onemin fivemin)
@@ -1797,25 +1814,22 @@
 	   (> (length res) 2))
       res)
      ((eq? res #f)   default) ;; add messages?
      ((eq? res #f) default)   ;; this would be the #eof
      (else default))))
+
+(define (common:ssh-get-loadavg remote-host)
+  (let ((inp (open-input-pipe (conc "ssh " remote-host " \"cat /proc/loadavg;cat /proc/cpuinfo;echo end\""))))
+      (let* ((res (read-lines inp)))
+	(close-input-pipe inp)
+	res)))
 
 (define (common:get-normalized-cpu-load-raw remote-host)
   (let* ((actual-host (or remote-host (get-host-name)))) ;; #f is localhost
     (or (common:get-cached-info actual-host "normalized-load")
 	(let ((data (if remote-host
-			(let ((inp #f))
-			  (handle-exceptions
-			      exn
-			    (begin
-			      (close-input-port inp)
-			      '())
-			    (set! inp (open-input-port (conc "ssh " remote-host " \"cat /proc/loadavg;cat /proc/cpuinfo;echo end\"")))
-			    (let* ((res (read-lines inp)))
-			      (close-input-port inp)
-			      res)))
+			(common:ssh-get-loadavg remote-host)
 			(append 
 			 (with-input-from-file "/proc/loadavg" 
 			   read-lines)
 			 (with-input-from-file "/proc/cpuinfo"
 			   read-lines)
@@ -1989,16 +2003,27 @@
 		    (host-last-used-set! rec curr-time)
 		    new-best)
 		  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))
 
 (define (common:wait-for-homehost-load maxnormload msg)
-  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
-                     #f
-                     (server:choose-server *toppath* 'homehost)))
-         (hh     (if hh-dat (car hh-dat) #f)))
-    (common:wait-for-normalized-load maxnormload msg hh)))
-
+  (let loop ((start-time (current-seconds))) ;; we saw some instances of this being called before *toppath* was set. This might be an early setup race. This delay should help but it is impossible to test...
+    (if (not *toppath*)
+	(begin
+	  (debug:print 0 *default-log-port* "ERROR: common:wait-for-homehost-load called before *toppath* set.")
+	  (thread-sleep! 30)
+	  (if (< (- (current-seconds) start-time) 300)
+	      (loop start-time)))))
+  (case (rmt:transport-mode)
+    ((http)
+     (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
+			#f
+			(server:choose-server *toppath* 'homehost)))
+            (hh     (if hh-dat (car hh-dat) #f)))
+       (common:wait-for-normalized-load maxnormload msg hh)))
+    (else
+     (common:wait-for-normalized-load maxnormload msg (get-host-name)))))
+    
 (define (common:get-num-cpus remote-host)
   (let* ((actual-host (or remote-host (get-host-name))))
     ;; hosts had better not be changing the number of cpus too often!
     (or (hash-table-ref/default *numcpus-cache* actual-host #f)
 	(let* ((numcpus (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600)))
@@ -2023,27 +2048,10 @@
 				  (common:write-cached-info actual-host "num-cpus" result))
 			      result))))
 	  (hash-table-set! *numcpus-cache* actual-host numcpus)
 	  numcpus))))
 
-(define (common:generic-ssh ssh-command proc default #!optional (msg-proc #f))
-  (let ((inp #f))
-    (handle-exceptions
-	exn
-      (begin
-	(close-input-port inp)
-	(if msg-proc
-	    (msg-proc)
-	    (debug:print 0 *default-log-port* "Command: \""ssh-command"\" failed. exn="exn))
-	default)
-      (set! inp (open-input-pipe ssh-command))
-      (with-input-from-port inp
-	(lambda ()
-	  (let ((res (proc)))
-	    (close-input-port inp)
-	    res))))))
-
 ;;======================================================================
 ;; wait for normalized cpu load to drop below maxload
 ;;
 (define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5))
   (let ((num-cpus (common:get-num-cpus remote-host)))
@@ -2218,30 +2226,10 @@
 	 (uname #f))
     (if (null? (car uname-res))
 	"unknown"
 	(caar uname-res))))
 
-;; for reasons I don't understand multiple calls to real-path in parallel threads
-;; must be protected by mutexes
-;;
-(define (common:real-path inpath)
-  ;; (process:cmd-run-with-stderr->list "readlink" "-f" inpath)) ;; cmd . params)
-  ;; (let-values 
-  ;;  (((inp oup pid) (process "readlink" (list "-f" inpath))))
-  ;;  (with-input-from-port inp
-  ;;    (let loop ((inl (read-line))
-  ;;       	(res #f))
-  ;;      (print "inl=" inl)
-  ;;      (if (eof-object? inl)
-  ;;          (begin
-  ;;            (close-input-port inp)
-  ;;            (close-output-port oup)
-  ;;            ;; (process-wait pid)
-  ;;            res)
-  ;;          (loop (read-line) inl))))))
-  (with-input-from-pipe (conc "readlink -f " inpath) read-line))
-
 ;;======================================================================
 ;; D I S K   S P A C E 
 ;;======================================================================
 
 (define (common:get-disk-space-used fpath)
@@ -2619,291 +2607,10 @@
     (cond
      (with-vars     (common:without-vars  fullcmd))
      (with-orig-env (common:with-orig-env fullcmd))
      (else          (common:without-vars  fullcmd "MT_.*")))))
 		  
-;;======================================================================
-;; T I M E   A N D   D A T E
-;;======================================================================
-
-;;======================================================================
-;; Convert strings like "5s 2h 3m" => 60x60x2 + 3x60 + 5
-(define (common:hms-string->seconds tstr)
-  (let ((parts     (string-split-fields "\\w+" tstr))
-	(time-secs 0)
-	;; s=seconds, m=minutes, h=hours, d=days, M=months, y=years, w=weeks
-	(trx       (regexp "(\\d+)([smhdMyw])")))
-    (for-each (lambda (part)
-		(let ((match  (string-match trx part)))
-		  (if match
-		      (let ((val (string->number (cadr match)))
-			    (unt (caddr match)))
-			(if val 
-			    (set! time-secs (+ time-secs (* val
-							    (case (string->symbol unt)
-							      ((s) 1)
-							      ((m) 60) ;; minutes
-							      ((h) 3600)
-							      ((d) 86400)
-							      ((w) 604800)
-							      ((M) 2628000) ;; aproximately one month
-							      ((y) 31536000)
-							      (else #f))))))))))
-	      parts)
-    time-secs))
-		       
-(define (seconds->hr-min-sec secs)
-  (let* ((hrs (quotient secs 3600))
-	 (min (quotient (- secs (* hrs 3600)) 60))
-	 (sec (- secs (* hrs 3600)(* min 60))))
-    (conc (if (> hrs 0)(conc hrs "hr ") "")
-	  (if (> min 0)(conc min "m ")  "")
-	  sec "s")))
-
-(define (seconds->time-string sec)
-  (time->string 
-   (seconds->local-time sec) "%H:%M:%S"))
-
-(define (seconds->work-week/day-time sec)
-  (time->string
-   (seconds->local-time sec) "ww%V.%u %H:%M"))
-
-(define (seconds->work-week/day sec)
-  (time->string
-   (seconds->local-time sec) "ww%V.%u"))
-
-(define (seconds->year-work-week/day sec)
-  (time->string
-   (seconds->local-time sec) "%yww%V.%w"))
-
-(define (seconds->year-work-week/day-time sec)
-  (time->string
-   (seconds->local-time sec) "%Yww%V.%w %H:%M"))
-
-(define (seconds->year-week/day-time sec)
-  (time->string
-   (seconds->local-time sec) "%Yw%V.%w %H:%M"))
-
-(define (seconds->quarter sec)
-  (case (string->number
-	 (time->string 
-	  (seconds->local-time sec)
-	  "%m"))
-    ((1 2 3) 1)
-    ((4 5 6) 2)
-    ((7 8 9) 3)
-    ((10 11 12) 4)
-    (else #f)))
-
-;;======================================================================
-;; basic ISO8601 format (e.g. "2017-02-28 06:02:54") date time => Unix epoch
-;;
-(define (common:date-time->seconds datetime)
-  (local-time->seconds (string->time datetime "%Y-%m-%d %H:%M:%S")))
-
-;;======================================================================
-;; given span of seconds tstart to tend
-;; find start time to mark and mark delta
-;;
-(define (common:find-start-mark-and-mark-delta tstart tend)
-  (let* ((deltat   (- (max tend (+ tend 10)) tstart)) ;; can't handle runs of less than 4 seconds. Pad it to 10 seconds ...
-	 (result   #f)
-	 (min      60)
-	 (hr       (* 60 60))
-	 (day      (* 24 hr))
-	 (yr       (* 365 day)) ;; year
-	 (mo       (/ yr 12))
-	 (wk       (* day 7)))
-    (for-each
-     (lambda (max-blks)
-       (for-each
-	(lambda (span) ;; 5 2 1
-	  (if (not result)
-	      (for-each 
-	       (lambda (timeunit timesym) ;; year month day hr min sec
-		 (if (not result)
-		     (let* ((time-blk (* span timeunit))
-			    (num-blks (quotient deltat time-blk)))
-		       (if (and (> num-blks 4)(< num-blks max-blks))
-			   (let ((first (* (quotient tstart time-blk) time-blk)))
-			     (set! result (list span timeunit time-blk first timesym))
-			     )))))
-	       (list yr mo wk day hr min 1)
-	       '(     y  mo w  d   h  m   s))))
-	(list 8 6 5 2 1)))
-     '(5 10 15 20 30 40 50 500))
-    (if values
-	(apply values result)
-	(values 0 day 1 0 'd))))
-
-;;======================================================================
-;; given x y lim return the cron expansion
-;;
-(define (common:expand-cron-slash x y lim)
-  (let loop ((curr x)
-	     (res  `()))
-    (if (< curr lim)
-	(loop (+ curr y) (cons curr res))
-	(reverse res))))
-
-;;======================================================================
-;; expand a complex cron string to a list of cron strings
-;;
-;;  x/y   => x, x+y, x+2y, x+3y while x+Ny<max_for_field
-;;  a,b,c => a, b ,c
-;;
-;;   NOTE: with flatten a lot of the crud below can be factored down.
-;;
-(define (common:cron-expand cron-str)
-  (if (list? cron-str)
-      (flatten
-       (fold (lambda (x res)
-	       (if (list? x)
-		   (let ((newres (map common:cron-expand x)))
-		     (append x newres))
-		   (cons x res)))
-	     '()
-	     cron-str)) ;; (map common:cron-expand cron-str))
-      (let ((cron-items (string-split cron-str))
-	    (slash-rx   (regexp "(\\d+)/(\\d+)"))
-	    (comma-rx   (regexp ".*,.*"))
-	    (max-vals   '((min        . 60)
-			  (hour       . 24)
-			  (dayofmonth . 28) ;;; BUG!!!! This will be a bug for some combinations
-			  (month      . 12)
-			  (dayofweek  . 7))))
-	(if (< (length cron-items) 5) ;; bad spec
-	    cron-str ;; `(,cron-str)              ;; just return the string, something downstream will fix it
-	    (let loop ((hed  (car cron-items))
-		       (tal  (cdr cron-items))
-		       (type 'min)
-		       (type-tal '(hour dayofmonth month dayofweek))
-		       (res  '()))
-	      (regex-case
-		  hed
-		(slash-rx ( _ base incr ) (let* ((basen          (string->number base))
-						 (incrn          (string->number incr))
-						 (expanded-vals  (common:expand-cron-slash basen incrn (alist-ref type max-vals)))
-						 (new-list-crons (fold (lambda (x myres)
-									 (cons (conc (if (null? res)
-											 ""
-											 (conc (string-intersperse res " ") " "))
-										     x " " (string-intersperse tal " "))
-									       myres))
-								       '() expanded-vals)))
-					    ;; (print "new-list-crons: " new-list-crons)
-					    ;; (fold (lambda (x res)
-					    ;; 	    (if (list? x)
-					    ;; 		(let ((newres (map common:cron-expand x)))
-					    ;; 		  (append x newres))
-					    ;; 		(cons x res)))
-					    ;; 	  '()
-					    (flatten (map common:cron-expand new-list-crons))))
-		;;					    (map common:cron-expand (map common:cron-expand new-list-crons))))
-		(else (if (null? tal)
-			  cron-str
-			  (loop (car tal)(cdr tal)(car type-tal)(cdr type-tal)(append res (list hed)))))))))))
-
-;;======================================================================
-;; given a cron string and the last time event was processed return #t to run or #f to not run
-;;
-;;  min    hour   dayofmonth month  dayofweek
-;; 0-59    0-23   1-31       1-12   0-6          ### NOTE: dayofweek does not include 7
-;;
-;;  #t => yes, run the job
-;;  #f => no, do not run the job
-;;
-(define (common:cron-event cron-str now-seconds-in last-done) ;; ref-seconds = #f is NOW.
-  (let* ((cron-items     (map string->number (string-split cron-str)))
-	 (now-seconds    (or now-seconds-in (current-seconds)))
-	 (now-time       (seconds->local-time now-seconds))
-	 (last-done-time (seconds->local-time last-done))
-	 (all-times      (make-hash-table)))
-    ;; (print "cron-items: " cron-items "(length cron-items): " (length cron-items))
-    (if (not (eq? (length cron-items) 5)) ;; don't even try to figure out junk strings
-	#f
-	(match-let (((     cmin chour cdayofmonth cmonth    cdayofweek)
-		     cron-items)
-		    ;; 0     1    2        3         4    5      6
-		    ((nsec nmin nhour ndayofmonth nmonth nyr ndayofweek n7 n8 n9)
-		     (vector->list now-time))
-		    ((lsec lmin lhour ldayofmonth lmonth lyr ldayofweek l7 l8 l9)
-		     (vector->list last-done-time)))
-	  ;; create all possible time slots
-	  ;; remove invalid slots due to (for example) day of week
-	  ;; get the start and end entries for the ref-seconds (current) time
-	  ;; if last-done > ref-seconds => this is an ERROR!
-	  ;; does the last-done time fall in the legit region?
-	  ;;    yes => #f  do not run again this command
-	  ;;    no  => #t  ok to run the command
-	  (for-each ;; month
-	   (lambda (month)
-	     (for-each ;; dayofmonth
-	      (lambda (dom)
-		(for-each
-		 (lambda (hr) ;; hour
-		   (for-each
-		    (lambda (minute) ;; minute
-		      (let ((copy-now (apply vector (vector->list now-time))))
-			(vector-set! copy-now 0 0) ;; force seconds to zero
-			(vector-set! copy-now 1 minute)
-			(vector-set! copy-now 2 hr)
-			(vector-set! copy-now 3 dom)  ;; dom is already corrected for zero referenced
-			(vector-set! copy-now 4 month)
-			(let* ((copy-now-secs (local-time->seconds copy-now))
-			       (new-copy      (seconds->local-time copy-now-secs))) ;; remake the time vector
-			  (if (or (not cdayofweek)
-				  (equal? (vector-ref new-copy 6)
-					  cdayofweek)) ;; if the day is specified and a match OR if the day is NOT specified
-			      (if (or (not cdayofmonth)
-				      (equal? (vector-ref new-copy 3)
-					      (+ 1 cdayofmonth))) ;; if the month is specified and a match OR if the month is NOT specified
-				  (hash-table-set! all-times copy-now-secs new-copy))))))
-		    (if cmin
-			`(,cmin)  ;; if given cmin, have to use it
-			(list (- nmin 1) nmin (+ nmin 1))))) ;; minute
-		 (if chour
-		     `(,chour)
-		     (list (- nhour 1) nhour (+ nhour 1))))) ;; hour
-	      (if cdayofmonth
-		  `(,cdayofmonth)
-		  (list (- ndayofmonth 1) ndayofmonth (+ ndayofmonth 1)))))
-	   (if cmonth
-	       `(,cmonth)
-	       (list (- nmonth 1) nmonth (+ nmonth 1))))
-	  (let ((before #f)
-		(is-in  #f))
-	    (for-each
-	     (lambda (moment)
-	       (if (and before
-			(<= before now-seconds)
-			(>= moment now-seconds))
-		   (begin
-		     ;; (print)
-		     ;; (print "Before: " (time->string (seconds->local-time before)))
-		     ;; (print "Now:    " (time->string (seconds->local-time now-seconds)))
-		     ;; (print "After:  " (time->string (seconds->local-time moment)))
-		     ;; (print "Last:   " (time->string (seconds->local-time last-done)))
-		     (if (<  last-done before)
-			 (set! is-in before))
-		     ))
-	       (set! before moment))
-	     (sort (hash-table-keys all-times) <))
-	    is-in)))))
-
-(define (common:extended-cron  cron-str now-seconds-in last-done)
-  (let ((expanded-cron (common:cron-expand cron-str)))
-    (if (string? expanded-cron)
-	(common:cron-event expanded-cron now-seconds-in last-done)
-	(let loop ((hed (car expanded-cron))
-		   (tal (cdr expanded-cron)))
-	  (if (common:cron-event hed now-seconds-in last-done)
-	      #t
-	      (if (null? tal)
-		  #f
-		  (loop (car tal)(cdr tal))))))))
-
 ;;======================================================================
 ;; C O L O R S
 ;;======================================================================
       
 (define (common:name->iup-color name)
@@ -3139,49 +2846,10 @@
 		      (if (null? tal)
 			  fallback-launcher
 			  (loop (car tal)(cdr tal))))))))
 	fallback-launcher)))
 
-;;======================================================================
-;; NMSG AND NEW API
-;;======================================================================
-;; 
-;; ;;======================================================================
-;; ;; nm based server experiment, keep around for now.
-;; ;;
-;; (define (nm:start-server dbconn #!key (given-host-name #f))
-;;   (let* ((srvdat    (start-raw-server given-host-name: given-host-name))
-;; 	 (host-name (srvdat-host srvdat))
-;; 	 (soc       (srvdat-soc srvdat)))
-;;     
-;;     ;; start the queue processor (save for second round of development)
-;;     ;;
-;;     (thread-start! (make-thread! (lambda ()(queue-processor dbconn) "Queue processor")))
-;;     ;; msg is an alist
-;;     ;;  'r host:port  <== where to return the data
-;;     ;;  'p params     <== data to apply the command to
-;;     ;;  'e j|s|l      <== encoding of the params. default is s (sexp), if not specified is assumed to be default
-;;     ;;  'c command    <== look up the function to call using this key
-;;     ;;
-;;     (let loop ((msg-in (nn-recv soc)))
-;;       (if (not (equal? msg-in "quit"))
-;; 	  (let* ((dat        (decode msg-in))
-;; 		 (host-port  (alist-ref 'r dat)) ;; this is for the reverse req rep where the server is a client of the original client
-;; 		 (params     (alist-ref 'p dat))
-;; 		 (command    (let ((c (alist-ref 'c dat)))(if c (string->symbol c) #f)))
-;; 		 (all-good   (and host-port params command (hash-table-exists? *commands* command))))
-;; 	    (if all-good
-;; 		(let ((cmddat (make-qitem
-;; 			       command:   command
-;; 			       host-port: host-port
-;; 			       params:    params)))
-;; 		  (queue-push cmddat) 		;; put request into the queue
-;; 		  (nn-send soc "queued"))         ;; reply with "queued"
-;; 		(print "ERROR: ["(common:human-time)"] BAD request " dat))
-;; 	    (loop (nn-recv soc)))))
-;;     (nn-close soc)))
-
 ;;======================================================================
 ;; D A S H B O A R D   U S E R   V I E W S
 ;;======================================================================
 
 ;;======================================================================
@@ -3347,11 +3015,17 @@
 	  (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not a directory."))
 	 ((not (file-read-access? pktsdir))
 	  (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not readable."))
 	 (else
 	  (debug:print-info 0 *default-log-port* "Loading packets found in " pktsdir)
-	  (let ((pkts (glob (conc pktsdir "/*.pkt"))))
+	  (let ((pkts (glob (conc pktsdir "/*.pkt")))
+                (sqdb (dbi:db-conn pdb))
+                )
+           ;; Put this in a transaction to avoid issues overloading the db
+           (sqlite3:with-transaction
+            sqdb
+            (lambda ()
 	    (for-each
 	     (lambda (pkt)
 	       (let* ((uuid    (cadr (string-match ".*/([0-9a-f]+).pkt" pkt)))
 		      (exists  (lookup-by-uuid pdb uuid #f)))
 		 (if (not exists)
@@ -3362,11 +3036,11 @@
 			    (ptype  (alist-ref 'T apkt)))
 		       (add-to-queue pdb pktdat uuid (or ptype 'cmd) #f 0)
 		       (debug:print 4 *default-log-port* "Added " uuid " of type " ptype " to queue"))
 		     (debug:print 4 *default-log-port* "pkt: " uuid " exists, skipping...")
 		     )))
-	     pkts)))))
+	     pkts)))))))
       pktsdirs))
    use-lt: use-lt))
 
 (define (common:get-pkt-alists pkts)
   (map (lambda (x)

Index: common_records.scm
==================================================================
--- common_records.scm
+++ common_records.scm
@@ -81,133 +81,133 @@
 
 ;; this was cached based on results from profiling but it turned out the profiling
 ;; somehow went wrong - perhaps too many processes writing to it. Leaving the caching
 ;; in for now but can probably take it out later.
 ;;
-(define (debug:calc-verbosity vstr)
-  (or (hash-table-ref/default *verbosity-cache* vstr #f)
-      (let ((res (cond
-                  ((number? vstr) vstr)
-                  ((not (string?  vstr))   1)
-                  ;; ((string-match  "^\\s*$" vstr) 1)
-                  (vstr           (let ((debugvals  (filter number? (map string->number (string-split vstr ",")))))
-                                    (cond
-                                     ((> (length debugvals) 1) debugvals)
-                                     ((> (length debugvals) 0)(car debugvals))
-                                     (else 1))))
-                  ((args:get-arg "-v")   2)
-                  ((args:get-arg "-q")    0)
-                  (else                   1))))
-        (hash-table-set! *verbosity-cache* vstr res)
-        res)))
-
-;; check verbosity, #t is ok
-(define (debug:check-verbosity verbosity vstr)
-  (if (not (or (number? verbosity)
-	       (list?   verbosity)))
-      (begin
-	(print "ERROR: Invalid debug value \"" vstr "\"")
-	#f)
-      #t))
-
-(define (debug:debug-mode n)
-  (cond
-   ((and (number? *verbosity*)   ;; number number
-	 (number? n))
-    (<= n *verbosity*))
-   ((and (list? *verbosity*)     ;; list   number
-	 (number? n))
-    (member n *verbosity*))
-   ((and (list? *verbosity*)     ;; list   list
-	 (list? n))
-    (not (null? (lset-intersection! eq? *verbosity* n))))
-   ((and (number? *verbosity*)
-	 (list? n))
-    (member *verbosity* n))))
-
-(define (debug:setup)
-  (let ((debugstr (or (args:get-arg "-debug")
-		      (args:get-arg "-debug-noprop")
-		      (getenv "MT_DEBUG_MODE"))))
-    (set! *verbosity* (debug:calc-verbosity debugstr))
-    (debug:check-verbosity *verbosity* debugstr)
-    ;; if we were handed a bad verbosity rule then we will override it with 1 and continue
-    (if (not *verbosity*)(set! *verbosity* 1))
-    (if (and (not (args:get-arg "-debug-noprop"))
-	     (or (args:get-arg "-debug")
-		 (not (getenv "MT_DEBUG_MODE"))))
-	(setenv "MT_DEBUG_MODE" (if (list? *verbosity*)
-				    (string-intersperse (map conc *verbosity*) ",")
-				    (conc *verbosity*))))))
-  
-(define (debug:print n e . params)
-  (if (debug:debug-mode n)
-      (with-output-to-port (or e (current-error-port))
-	(lambda ()
-	  (if *logging*
-	      (db:log-event (apply conc params))
-	      (apply print params)
-	      )))))
+;; (define (debug:calc-verbosity vstr)
+;;   (or (hash-table-ref/default *verbosity-cache* vstr #f)
+;;       (let ((res (cond
+;;                   ((number? vstr) vstr)
+;;                   ((not (string?  vstr))   1)
+;;                   ;; ((string-match  "^\\s*$" vstr) 1)
+;;                   (vstr           (let ((debugvals  (filter number? (map string->number (string-split vstr ",")))))
+;;                                     (cond
+;;                                      ((> (length debugvals) 1) debugvals)
+;;                                      ((> (length debugvals) 0)(car debugvals))
+;;                                      (else 1))))
+;;                   ((args:get-arg "-v")   2)
+;;                   ((args:get-arg "-q")    0)
+;;                   (else                   1))))
+;;         (hash-table-set! *verbosity-cache* vstr res)
+;;         res)))
+
+;; ;; check verbosity, #t is ok
+;; (define (debug:check-verbosity verbosity vstr)
+;;   (if (not (or (number? verbosity)
+;; 	       (list?   verbosity)))
+;;       (begin
+;; 	(print "ERROR: Invalid debug value \"" vstr "\"")
+;; 	#f)
+;;       #t))
+;; 
+;; (define (debug:debug-mode n)
+;;   (cond
+;;    ((and (number? *verbosity*)   ;; number number
+;; 	 (number? n))
+;;     (<= n *verbosity*))
+;;    ((and (list? *verbosity*)     ;; list   number
+;; 	 (number? n))
+;;     (member n *verbosity*))
+;;    ((and (list? *verbosity*)     ;; list   list
+;; 	 (list? n))
+;;     (not (null? (lset-intersection! eq? *verbosity* n))))
+;;    ((and (number? *verbosity*)
+;; 	 (list? n))
+;;     (member *verbosity* n))))
+;; 
+;; (define (debug:setup)
+;;   (let ((debugstr (or (args:get-arg "-debug")
+;; 		      (args:get-arg "-debug-noprop")
+;; 		      (getenv "MT_DEBUG_MODE"))))
+;;     (set! *verbosity* (debug:calc-verbosity debugstr))
+;;     (debug:check-verbosity *verbosity* debugstr)
+;;     ;; if we were handed a bad verbosity rule then we will override it with 1 and continue
+;;     (if (not *verbosity*)(set! *verbosity* 1))
+;;     (if (and (not (args:get-arg "-debug-noprop"))
+;; 	     (or (args:get-arg "-debug")
+;; 		 (not (getenv "MT_DEBUG_MODE"))))
+;; 	(setenv "MT_DEBUG_MODE" (if (list? *verbosity*)
+;; 				    (string-intersperse (map conc *verbosity*) ",")
+;; 				    (conc *verbosity*))))))
+;;   
+;; (define (debug:print n e . params)
+;;   (if (debug:debug-mode n)
+;;       (with-output-to-port (or e (current-error-port))
+;; 	(lambda ()
+;; 	  (if *logging*
+;; 	      (db:log-event (apply conc params))
+;; 	      (apply print params)
+;; 	      )))))
 
 ;; Brandon's debug printer shortcut (indulge me :)
-(define *BB-process-starttime* (current-milliseconds))
-(define (BB> . in-args)
-  (let* ((stack (get-call-chain))
-         (location "??"))
-    (for-each
-     (lambda (frame)
-       (let* ((this-loc (vector-ref frame 0))
-              (temp     (string-split (->string this-loc) " "))
-              (this-func (if (and (list? temp) (> (length temp) 1)) (cadr temp) "???")))
-         (if (equal? this-func "BB>")
-             (set! location this-loc))))
-     stack)
-    (let* ((color-on "\x1b[1m")
-           (color-off "\x1b[0m")
-           (dp-args
-            (append
-             (list 0 *default-log-port*
-                   (conc color-on location "@"(/ (- (current-milliseconds) *BB-process-starttime*) 1000) color-off "   ")  )
-             in-args)))
-      (apply debug:print dp-args))))
-
-(define *BBpp_custom_expanders_list* (make-hash-table))
-
-
-
-;; register hash tables with BBpp.
-(hash-table-set! *BBpp_custom_expanders_list* HASH_TABLE:
-                 (cons hash-table? hash-table->alist))
-
-;; test name converter
-(define (BBpp_custom_converter arg)
-  (let ((res #f))
-    (for-each
-     (lambda (custom-type-name)
-       (let* ((custom-type-info      (hash-table-ref *BBpp_custom_expanders_list* custom-type-name))
-              (custom-type-test      (car custom-type-info))
-              (custom-type-converter (cdr custom-type-info)))
-         (when (and (not res) (custom-type-test arg))
-           (set! res (custom-type-converter arg)))))
-     (hash-table-keys *BBpp_custom_expanders_list*))
-    (if res (BBpp_ res) arg)))
-
-(define (BBpp_ arg)
-  (cond
-   ;;((SOMESTRUCT? arg) (cons SOMESTRUCT: (SOMESTRUCT->alist arg)))
-   ;;((dboard:tabdat? arg) (cons dboard:tabdat: (dboard:tabdat->alist arg)))
-   ((hash-table? arg)
-    (let ((al (hash-table->alist arg)))
-      (BBpp_ (cons HASH_TABLE: al))))
-   ((null? arg) '())
-   ;;((list? arg) (cons (BBpp_ (car arg)) (BBpp_ (cdr arg))))
-   ((pair? arg) (cons (BBpp_ (car arg)) (BBpp_ (cdr arg))))
-   (else (BBpp_custom_converter arg))))
-
-;; Brandon's pretty printer.  It expands hashes and custom types in addition to regular pp
-(define (BBpp arg)
-  (pp (BBpp_ arg)))
+;; (define *BB-process-starttime* (current-milliseconds))
+;; (define (BB> . in-args)
+;;   (let* ((stack (get-call-chain))
+;;          (location "??"))
+;;     (for-each
+;;      (lambda (frame)
+;;        (let* ((this-loc (vector-ref frame 0))
+;;               (temp     (string-split (->string this-loc) " "))
+;;               (this-func (if (and (list? temp) (> (length temp) 1)) (cadr temp) "???")))
+;;          (if (equal? this-func "BB>")
+;;              (set! location this-loc))))
+;;      stack)
+;;     (let* ((color-on "\x1b[1m")
+;;            (color-off "\x1b[0m")
+;;            (dp-args
+;;             (append
+;;              (list 0 *default-log-port*
+;;                    (conc color-on location "@"(/ (- (current-milliseconds) *BB-process-starttime*) 1000) color-off "   ")  )
+;;              in-args)))
+;;       (apply debug:print dp-args))))
+;; 
+;; (define *BBpp_custom_expanders_list* (make-hash-table))
+;; 
+;; 
+;; 
+;; ;; register hash tables with BBpp.
+;; (hash-table-set! *BBpp_custom_expanders_list* HASH_TABLE:
+;;                  (cons hash-table? hash-table->alist))
+;; 
+;; ;; test name converter
+;; (define (BBpp_custom_converter arg)
+;;   (let ((res #f))
+;;     (for-each
+;;      (lambda (custom-type-name)
+;;        (let* ((custom-type-info      (hash-table-ref *BBpp_custom_expanders_list* custom-type-name))
+;;               (custom-type-test      (car custom-type-info))
+;;               (custom-type-converter (cdr custom-type-info)))
+;;          (when (and (not res) (custom-type-test arg))
+;;            (set! res (custom-type-converter arg)))))
+;;      (hash-table-keys *BBpp_custom_expanders_list*))
+;;     (if res (BBpp_ res) arg)))
+;; 
+;; (define (BBpp_ arg)
+;;   (cond
+;;    ;;((SOMESTRUCT? arg) (cons SOMESTRUCT: (SOMESTRUCT->alist arg)))
+;;    ;;((dboard:tabdat? arg) (cons dboard:tabdat: (dboard:tabdat->alist arg)))
+;;    ((hash-table? arg)
+;;     (let ((al (hash-table->alist arg)))
+;;       (BBpp_ (cons HASH_TABLE: al))))
+;;    ((null? arg) '())
+;;    ;;((list? arg) (cons (BBpp_ (car arg)) (BBpp_ (cdr arg))))
+;;    ((pair? arg) (cons (BBpp_ (car arg)) (BBpp_ (cdr arg))))
+;;    (else (BBpp_custom_converter arg))))
+;; 
+;; ;; Brandon's pretty printer.  It expands hashes and custom types in addition to regular pp
+;; (define (BBpp arg)
+;;   (pp (BBpp_ arg)))
 
 ;(use define-macro)
 (define-syntax inspect
   (syntax-rules ()
     [(_ x)
@@ -215,40 +215,40 @@
        (printf "~a is: ~a\n" 'x (with-output-to-string (lambda () (BBpp x))))
      ;;  )
      ]
     [(_ x y ...) (begin (inspect x) (inspect y ...))]))
 
-(define (debug:print-error n e . params)
-  ;; normal print
-  (if (debug:debug-mode n)
-      (with-output-to-port (if (port? e) e (current-error-port))
-	(lambda ()
-	  (if *logging*
-	      (db:log-event (apply conc params))
-	      ;; (apply print "pid:" (current-process-id) " " params)
-	      (apply print "ERROR: " params)
-	      ))))
-  ;; pass important messages to stderr
-  (if (and (eq? n 0)(not (eq? e (current-error-port)))) 
-      (with-output-to-port (current-error-port)
-	(lambda ()
-	  (apply print "ERROR: " params)
-	  ))))
-
-(define (debug:print-info n e . params)
-  (if (debug:debug-mode n)
-      (with-output-to-port (if (port? e) e (current-error-port))
-	(lambda ()
-	  (if *logging*
-	      (let ((res (format#format #f "INFO: (~a) ~a" n (apply conc params))))
-		(db:log-event res))
-	      ;; (apply print "pid:" (current-process-id) " " "INFO: (" n ") " params) ;; res)
-	      (apply print "INFO: (" n ") " params) ;; res)
-	      )))))
-
+;; (define (debug:print-error n e . params)
+;;   ;; normal print
+;;   (if (debug:debug-mode n)
+;;       (with-output-to-port (if (port? e) e (current-error-port))
+;; 	(lambda ()
+;; 	  (if *logging*
+;; 	      (db:log-event (apply conc params))
+;; 	      ;; (apply print "pid:" (current-process-id) " " params)
+;; 	      (apply print "ERROR: " params)
+;; 	      ))))
+;;   ;; pass important messages to stderr
+;;   (if (and (eq? n 0)(not (eq? e (current-error-port)))) 
+;;       (with-output-to-port (current-error-port)
+;; 	(lambda ()
+;; 	  (apply print "ERROR: " params)
+;; 	  ))))
+;; 
+;; (define (debug:print-info n e . params)
+;;   (if (debug:debug-mode n)
+;;       (with-output-to-port (if (port? e) e (current-error-port))
+;; 	(lambda ()
+;; 	  (if *logging*
+;; 	      (let ((res (format#format #f "INFO: (~a) ~a" n (apply conc params))))
+;; 		(db:log-event res))
+;; 	      ;; (apply print "pid:" (current-process-id) " " "INFO: (" n ") " params) ;; res)
+;; 	      (apply print "INFO: (" n ") " params) ;; res)
+;; 	      )))))
+;; 
 
 
 ;; if a value is printable (i.e. string or number) return the value
 ;; else return an empty string
 (define-inline (printable val)
   (if (or (number? val)(string? val)) val ""))
 

Index: commonmod.scm
==================================================================
--- commonmod.scm
+++ commonmod.scm
@@ -17,21 +17,77 @@
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;======================================================================
 
 (declare (unit commonmod))
+(declare (uses debugprint))
 
 (use srfi-69)
 
 (module commonmod
 	*
 
-(import scheme chicken data-structures extras files)
-(import (prefix sqlite3 sqlite3:)
-	posix typed-records srfi-18 srfi-69
-	md5 message-digest
-	regex srfi-1)
+(import scheme)
+(cond-expand
+ (chicken-4
+  
+  (import chicken
+	  ports
+	  
+	  (prefix sqlite3 sqlite3:)
+	  data-structures
+	  extras
+	  files
+	  matchable
+	  md5
+	  message-digest
+	  pathname-expand
+	  posix
+	  posix-extras
+	  regex
+	  regex-case
+	  srfi-1
+	  srfi-18
+	  srfi-69
+	  typed-records
+
+	  debugprint
+	  )
+  (use srfi-69))
+ (chicken-5
+  (import (prefix sqlite3 sqlite3:)
+	  ;; data-structures
+	  ;; extras
+	  ;; files
+	  ;; posix
+	  ;; posix-extras
+	  chicken.base
+	  chicken.condition
+	  chicken.file
+	  chicken.file.posix
+	  chicken.io
+	  chicken.pathname
+	  chicken.process
+	  chicken.process-context
+	  chicken.process-context.posix
+	  chicken.sort
+	  chicken.string
+	  chicken.time
+	  chicken.time.posix
+	  
+	  matchable
+	  md5
+	  message-digest
+	  pathname-expand
+	  regex
+	  regex-case
+	  srfi-1
+	  srfi-18
+	  srfi-69
+	  typed-records
+	  system-information
+  )))
 
 ;;======================================================================
 ;; CONTENTS
 ;;
 ;;  config file utils
@@ -41,10 +97,16 @@
 ;;======================================================================
 
 (include "megatest-version.scm")
 (include "megatest-fossil-hash.scm")
 
+;; http - use the old http + in /tmp db
+;; tcp  - use tcp transport with cachedb db
+;; nfs  - use direct to disk access (read-only)
+;;
+(define rmt:transport-mode (make-parameter 'tcp))
+
 (define (get-full-version)
   (conc megatest-version "-" megatest-fossil-hash))
 
 (define (version-signature)
   (conc megatest-version "-" (substring megatest-fossil-hash 0 4)))
@@ -59,10 +121,22 @@
 	(begin
 	  (hash-table-set! *common:denoise* key currtime)
 	  #t)
 	#f)))
 
+;; KEEP THIS ONE
+;;
+;; client:get-signature
+
+(define *my-client-signature* #f)
+
+(define (client:get-signature)
+  (if *my-client-signature* *my-client-signature*
+      (let ((sig (conc (get-host-name) " " (current-process-id))))
+	(set! *my-client-signature* sig)
+	*my-client-signature*)))
+
 ;;======================================================================
 ;; config file utils
 ;;======================================================================
 
 (define (lookup cfgdat section var)
@@ -136,10 +210,58 @@
 
 ;;======================================================================
 ;; misc conversion, data manipulation functions
 ;;======================================================================
 
+;;======================================================================
+;; return first command that exists, else #f
+;;
+(define (common:which cmds)
+  (if (null? cmds)
+      #f
+      (let loop ((hed (car cmds))
+		 (tal (cdr cmds)))
+	(let ((res (with-input-from-pipe (conc "which " hed) read-line)))
+	  (if (and (string? res)
+		   (file-exists? res))
+	      res
+	      (if (null? tal)
+		  #f
+		  (loop (car tal)(cdr tal))))))))
+  
+(define (common:get-megatest-exe)
+  (let* ((mtexe (or (get-environment-variable "MT_MEGATEST")
+		    (common:which '("megatest"))
+		    "megatest")))
+    (if (file-exists? mtexe)
+	(realpath mtexe)
+	mtexe)))
+
+(define (common:get-megatest-exe-dir)
+  (let* ((mtexe (common:get-megatest-exe)))
+    (pathname-directory mtexe)))
+
+;; more generic and comprehensive version of get-megatest-exe
+;;
+(define (common:get-mtexe)
+  (let* ((mtpathdir  (common:get-megatest-exe-dir)))
+    (or (common:get-megatest-exe)
+	(if mtpathdir
+	    (conc mtpathdir"/megatest")
+	    #f)
+	"megatest")))
+
+(define (common:get-megatest-exe-path)
+  (let* ((mtpathdir (common:get-megatest-exe-dir)))
+    (conc mtpathdir":"(get-environment-variable "PATH") ":.")))
+
+(cond-expand
+ (chicken-4
+  (define (realpath x) (resolve-pathname  (pathname-expand (or x "/dev/null")) )))
+ (chicken-5
+  (define (realpath x) (normalize-pathname (pathname-expand (or x "/dev/null"))))))
+
 ;; if it looks like a number -> convert it to a number, else return it
 ;;
 (define (lazy-convert inval)
   (let* ((as-num (if (string? inval)(string->number inval) #f)))
     (or as-num inval)))
@@ -161,10 +283,40 @@
 	     (filter (lambda (x)
 		       (not (string-match "^\\s*" x)))
 		     val-list))
 	'())))
 
+(define (get-cpu-load)
+  (let* ((load-info (with-input-from-file "/proc/loadavg" read-lines)))
+    (map string->number (string-split load-info))))
+
+(define *current-host-cores* #f)
+
+(define (get-current-host-cores)
+  (or *current-host-cores*
+      (let ((cpu-info (with-input-from-file "/proc/cpuinfo" read-lines)))
+	(let loop ((lines cpu-info))
+	  (if (null? lines)
+	      1 ;; gotta be at least one!
+	      (let* ((inl (car lines))
+		     (tail (cdr lines))
+		     (parts (string-split inl)))
+		(match parts
+		  (("cpu" "cores" ":" num) (string->number num))
+		  (else (loop tail)))))))))
+
+(define (number-of-processes-running processname)
+  (with-input-from-pipe
+   (conc "ps -def | egrep \""processname"\" |wc -l")
+   (lambda ()
+     (string->number (read-line)))))
+
+;; get the normalized (i.e. load / numcpus) for *this* host
+;;
+(define (get-normalized-cpu-load)
+  (/ (get-cpu-load)(get-current-host-cores)))
+
 ;;======================================================================
 ;; testsuite and area utilites
 ;;======================================================================
 
 (define (get-testsuite-name toppath configdat)
@@ -208,18 +360,387 @@
   (let ((adat (get-section cfgdat "areas")))
     (map (lambda (entry)
 	   `(,(car entry) . 
 	     ,(val->alist (cadr entry))))
 	 adat)))
+
+;;======================================================================
+;; time utils
+;;======================================================================
+
+(define (common:human-time)
+  (time->string (seconds->local-time (current-seconds)) "%Y-%m-%d %H:%M:%S"))
+
+;;======================================================================
+;; T I M E   A N D   D A T E
+;;======================================================================
+
+;;======================================================================
+;; Convert strings like "5s 2h 3m" => 60x60x2 + 3x60 + 5
+(define (common:hms-string->seconds tstr)
+  (let ((parts     (string-split-fields "\\w+" tstr))
+	(time-secs 0)
+	;; s=seconds, m=minutes, h=hours, d=days, M=months, y=years, w=weeks
+	(trx       (regexp "^(\\d+)([smhdMyw])$")))
+    (for-each (lambda (part)
+		(let ((match  (string-match trx part)))
+		  (if match
+		      (let ((val (string->number (cadr match)))
+			    (unt (caddr match)))
+			(if val 
+			    (set! time-secs (+ time-secs (* val
+							    (case (string->symbol unt)
+							      ((s) 1)
+							      ((m) 60) ;; minutes
+							      ((h) 3600)
+							      ((d) 86400)
+							      ((w) 604800)
+							      ((M) 2628000) ;; aproximately one month
+							      ((y) 31536000)
+							      (else
+							       0)))))))
+		      ;; (print "ERROR: can't parse timestring "tstr", component "part)
+		      ;; can't (yet) use debugprint. rely on -show-config for user to find errors
+		      )))
+	      parts)
+    time-secs))
+		       
+(define (seconds->hr-min-sec secs)
+  (let* ((hrs (quotient secs 3600))
+	 (min (quotient (- secs (* hrs 3600)) 60))
+	 (sec (- secs (* hrs 3600)(* min 60))))
+    (conc (if (> hrs 0)(conc hrs "hr ") "")
+	  (if (> min 0)(conc min "m ")  "")
+	  sec "s")))
+
+(define (seconds->time-string sec)
+  (time->string 
+   (seconds->local-time sec) "%H:%M:%S"))
+
+(define (seconds->work-week/day-time sec)
+  (time->string
+   (seconds->local-time sec) "ww%V.%u %H:%M"))
+
+(define (seconds->work-week/day sec)
+  (time->string
+   (seconds->local-time sec) "ww%V.%u"))
+
+(define (seconds->year-work-week/day sec)
+  (time->string
+   (seconds->local-time sec) "%yww%V.%w"))
+
+(define (seconds->year-work-week/day-time sec)
+  (time->string
+   (seconds->local-time sec) "%Yww%V.%w %H:%M"))
+
+(define (seconds->year-week/day-time sec)
+  (time->string
+   (seconds->local-time sec) "%Yw%V.%w %H:%M"))
+
+(define (seconds->quarter sec)
+  (case (string->number
+	 (time->string 
+	  (seconds->local-time sec)
+	  "%m"))
+    ((1 2 3) 1)
+    ((4 5 6) 2)
+    ((7 8 9) 3)
+    ((10 11 12) 4)
+    (else #f)))
+
+;;======================================================================
+;; basic ISO8601 format (e.g. "2017-02-28 06:02:54") date time => Unix epoch
+;;
+(define (common:date-time->seconds datetime)
+  (local-time->seconds (string->time datetime "%Y-%m-%d %H:%M:%S")))
+
+;;======================================================================
+;; given span of seconds tstart to tend
+;; find start time to mark and mark delta
+;;
+(define (common:find-start-mark-and-mark-delta tstart tend)
+  (let* ((deltat   (- (max tend (+ tend 10)) tstart)) ;; can't handle runs of less than 4 seconds. Pad it to 10 seconds ...
+	 (result   #f)
+	 (min      60)
+	 (hr       (* 60 60))
+	 (day      (* 24 hr))
+	 (yr       (* 365 day)) ;; year
+	 (mo       (/ yr 12))
+	 (wk       (* day 7)))
+    (for-each
+     (lambda (max-blks)
+       (for-each
+	(lambda (span) ;; 5 2 1
+	  (if (not result)
+	      (for-each 
+	       (lambda (timeunit timesym) ;; year month day hr min sec
+		 (if (not result)
+		     (let* ((time-blk (* span timeunit))
+			    (num-blks (quotient deltat time-blk)))
+		       (if (and (> num-blks 4)(< num-blks max-blks))
+			   (let ((first (* (quotient tstart time-blk) time-blk)))
+			     (set! result (list span timeunit time-blk first timesym))
+			     )))))
+	       (list yr mo wk day hr min 1)
+	       '(     y  mo w  d   h  m   s))))
+	(list 8 6 5 2 1)))
+     '(5 10 15 20 30 40 50 500))
+    (if values
+	(apply values result)
+	(values 0 day 1 0 'd))))
+
+;;======================================================================
+;; given x y lim return the cron expansion
+;;
+(define (common:expand-cron-slash x y lim)
+  (let loop ((curr x)
+	     (res  `()))
+    (if (< curr lim)
+	(loop (+ curr y) (cons curr res))
+	(reverse res))))
+
+;;======================================================================
+;; expand a complex cron string to a list of cron strings
+;;
+;;  x/y   => x, x+y, x+2y, x+3y while x+Ny<max_for_field
+;;  a,b,c => a, b ,c
+;;
+;;   NOTE: with flatten a lot of the crud below can be factored down.
+;;
+(define (common:cron-expand cron-str)
+  (if (list? cron-str)
+      (flatten
+       (fold (lambda (x res)
+	       (if (list? x)
+		   (let ((newres (map common:cron-expand x)))
+		     (append x newres))
+		   (cons x res)))
+	     '()
+	     cron-str)) ;; (map common:cron-expand cron-str))
+      (let ((cron-items (string-split cron-str))
+	    (slash-rx   (regexp "(\\d+)/(\\d+)"))
+	    (comma-rx   (regexp ".*,.*"))
+	    (max-vals   '((min        . 60)
+			  (hour       . 24)
+			  (dayofmonth . 28) ;;; BUG!!!! This will be a bug for some combinations
+			  (month      . 12)
+			  (dayofweek  . 7))))
+	(if (< (length cron-items) 5) ;; bad spec
+	    cron-str ;; `(,cron-str)              ;; just return the string, something downstream will fix it
+	    (let loop ((hed  (car cron-items))
+		       (tal  (cdr cron-items))
+		       (type 'min)
+		       (type-tal '(hour dayofmonth month dayofweek))
+		       (res  '()))
+	      (regex-case
+		  hed
+		(slash-rx ( _ base incr ) (let* ((basen          (string->number base))
+						 (incrn          (string->number incr))
+						 (expanded-vals  (common:expand-cron-slash basen incrn (alist-ref type max-vals)))
+						 (new-list-crons (fold (lambda (x myres)
+									 (cons (conc (if (null? res)
+											 ""
+											 (conc (string-intersperse res " ") " "))
+										     x " " (string-intersperse tal " "))
+									       myres))
+								       '() expanded-vals)))
+					    ;; (print "new-list-crons: " new-list-crons)
+					    ;; (fold (lambda (x res)
+					    ;; 	    (if (list? x)
+					    ;; 		(let ((newres (map common:cron-expand x)))
+					    ;; 		  (append x newres))
+					    ;; 		(cons x res)))
+					    ;; 	  '()
+					    (flatten (map common:cron-expand new-list-crons))))
+		;;					    (map common:cron-expand (map common:cron-expand new-list-crons))))
+		(else (if (null? tal)
+			  cron-str
+			  (loop (car tal)(cdr tal)(car type-tal)(cdr type-tal)(append res (list hed)))))))))))
+
+;;======================================================================
+;; given a cron string and the last time event was processed return #t to run or #f to not run
+;;
+;;  min    hour   dayofmonth month  dayofweek
+;; 0-59    0-23   1-31       1-12   0-6          ### NOTE: dayofweek does not include 7
+;;
+;;  #t => yes, run the job
+;;  #f => no, do not run the job
+;;
+(define (common:cron-event cron-str now-seconds-in last-done) ;; ref-seconds = #f is NOW.
+  (let* ((cron-items     (map string->number (string-split cron-str)))
+	 (now-seconds    (or now-seconds-in (current-seconds)))
+	 (now-time       (seconds->local-time now-seconds))
+	 (last-done-time (seconds->local-time last-done))
+	 (all-times      (make-hash-table)))
+    ;; (print "cron-items: " cron-items "(length cron-items): " (length cron-items))
+    (if (not (eq? (length cron-items) 5)) ;; don't even try to figure out junk strings
+	#f
+	(match-let (((     cmin chour cdayofmonth cmonth    cdayofweek)
+		     cron-items)
+		    ;; 0     1    2        3         4    5      6
+		    ((nsec nmin nhour ndayofmonth nmonth nyr ndayofweek n7 n8 n9)
+		     (vector->list now-time))
+		    ((lsec lmin lhour ldayofmonth lmonth lyr ldayofweek l7 l8 l9)
+		     (vector->list last-done-time)))
+	  ;; create all possible time slots
+	  ;; remove invalid slots due to (for example) day of week
+	  ;; get the start and end entries for the ref-seconds (current) time
+	  ;; if last-done > ref-seconds => this is an ERROR!
+	  ;; does the last-done time fall in the legit region?
+	  ;;    yes => #f  do not run again this command
+	  ;;    no  => #t  ok to run the command
+	  (for-each ;; month
+	   (lambda (month)
+	     (for-each ;; dayofmonth
+	      (lambda (dom)
+		(for-each
+		 (lambda (hr) ;; hour
+		   (for-each
+		    (lambda (minute) ;; minute
+		      (let ((copy-now (apply vector (vector->list now-time))))
+			(vector-set! copy-now 0 0) ;; force seconds to zero
+			(vector-set! copy-now 1 minute)
+			(vector-set! copy-now 2 hr)
+			(vector-set! copy-now 3 dom)  ;; dom is already corrected for zero referenced
+			(vector-set! copy-now 4 month)
+			(let* ((copy-now-secs (local-time->seconds copy-now))
+			       (new-copy      (seconds->local-time copy-now-secs))) ;; remake the time vector
+			  (if (or (not cdayofweek)
+				  (equal? (vector-ref new-copy 6)
+					  cdayofweek)) ;; if the day is specified and a match OR if the day is NOT specified
+			      (if (or (not cdayofmonth)
+				      (equal? (vector-ref new-copy 3)
+					      (+ 1 cdayofmonth))) ;; if the month is specified and a match OR if the month is NOT specified
+				  (hash-table-set! all-times copy-now-secs new-copy))))))
+		    (if cmin
+			`(,cmin)  ;; if given cmin, have to use it
+			(list (- nmin 1) nmin (+ nmin 1))))) ;; minute
+		 (if chour
+		     `(,chour)
+		     (list (- nhour 1) nhour (+ nhour 1))))) ;; hour
+	      (if cdayofmonth
+		  `(,cdayofmonth)
+		  (list (- ndayofmonth 1) ndayofmonth (+ ndayofmonth 1)))))
+	   (if cmonth
+	       `(,cmonth)
+	       (list (- nmonth 1) nmonth (+ nmonth 1))))
+	  (let ((before #f)
+		(is-in  #f))
+	    (for-each
+	     (lambda (moment)
+	       (if (and before
+			(<= before now-seconds)
+			(>= moment now-seconds))
+		   (begin
+		     ;; (print)
+		     ;; (print "Before: " (time->string (seconds->local-time before)))
+		     ;; (print "Now:    " (time->string (seconds->local-time now-seconds)))
+		     ;; (print "After:  " (time->string (seconds->local-time moment)))
+		     ;; (print "Last:   " (time->string (seconds->local-time last-done)))
+		     (if (<  last-done before)
+			 (set! is-in before))
+		     ))
+	       (set! before moment))
+	     (sort (hash-table-keys all-times) <))
+	    is-in)))))
+
+(define (common:extended-cron  cron-str now-seconds-in last-done)
+  (let ((expanded-cron (common:cron-expand cron-str)))
+    (if (string? expanded-cron)
+	(common:cron-event expanded-cron now-seconds-in last-done)
+	(let loop ((hed (car expanded-cron))
+		   (tal (cdr expanded-cron)))
+	  (if (common:cron-event hed now-seconds-in last-done)
+	      #t
+	      (if (null? tal)
+		  #f
+		  (loop (car tal)(cdr tal))))))))
+
+
 
 ;;======================================================================
 ;; misc stuff
 ;;======================================================================
 
-;; (define (debug:print . params) #f)
-;; (define (debug:print-info . params) #f)
-;; 
-;; (define (set-functions dbgp dbgpinfo)
-;;   (set! debug:print dbgp)
-;;   (set! debug:print-info dbgpinfo))
+(define (common:get-signature str)
+  (message-digest-string (md5-primitive) str))
+
+;;======================================================================
+;; hash of hashs
+;;======================================================================
+
+(define (db:hoh-set! dat key1 key2 val)
+  (let* ((subhash (hash-table-ref/default dat key1 #f)))
+    (if subhash
+	(hash-table-set! subhash key2 val)
+	(begin
+	  (hash-table-set! dat key1 (make-hash-table))
+	  (db:hoh-set! dat key1 key2 val)))))
+
+(define (db:hoh-get dat key1 key2)
+  (let* ((subhash (hash-table-ref/default dat key1 #f)))
+    (and subhash
+	 (hash-table-ref/default subhash key2 #f))))
+
+;;======================================================================
+;; when called from a wrapper I need sometimes to find the calling
+;; wrapper, this is for dashboard to find the correct megatest.
+;;
+(define (common:find-local-megatest #!optional (progname "megatest"))
+  (let ((res (filter file-exists?
+		     (map (lambda (updir)
+			    (let* ((lm  (car (argv)))
+				   (dir (pathname-directory lm))
+				   (exe (pathname-strip-directory lm)))
+			      (conc (if dir (conc dir "/") "")
+				    (case (string->symbol exe)
+				      ((dboard)    (conc updir progname))
+				      ((mtest)     (conc updir progname))
+				      ((dashboard) progname)
+				      (else exe)))))
+			  '("../../" "../")))))
+    (if (null? res)
+	(begin
+	  ;; (debug:print 0 *default-log-port* "Failed to find this executable! Using what can be found on the path")
+	  progname)
+	(car res))))
+
+(define (common:generic-ssh ssh-command proc default #!optional (msg-proc #f))
+  (let ((inp #f))
+    (handle-exceptions
+	exn
+      (begin
+	(close-input-port inp)
+	(if msg-proc
+	    (msg-proc)
+	    (debug:print 0 *default-log-port* "Command: \""ssh-command"\" failed. exn="exn))
+	default)
+      (set! inp (open-input-pipe ssh-command))
+      (with-input-from-port inp
+	(lambda ()
+	  (let ((res (proc)))
+	    (close-input-port inp)
+	    res))))))
+
+;; this is a close duplicate of:
+;;    process:alist-on-host?
+;;    process:alive
+;;
+(define (commonmod:is-test-alive host pid)
+  (let* ((same-host (equal? host (get-host-name)))
+	 (cmd (conc 
+	       (if same-host "" (conc "ssh "host" "))
+	       "pstree -A "pid)))
+    (if (and host pid
+	     (not (equal? host "n/a")))
+	
+	(let* ((output (if same-host
+			   (with-input-from-pipe cmd read-lines)
+			   (common:generic-ssh cmd read-lines '())))) ;; (with-input-from-pipe cmd read-lines)))
+	  (debug:print 2 *default-log-port* "Running " cmd " received " output)
+	  (if (eq? (length output) 0)
+	      #f
+	      #t))
+	#t))) ;; assuming bad query is about a live test is likely not the right thing to do?
+
 
 )

Index: configf.scm
==================================================================
--- configf.scm
+++ configf.scm
@@ -25,10 +25,19 @@
 (use regex regex-case matchable) ;;  directory-utils)
 (declare (unit configf))
 (declare (uses process))
 (declare (uses env))
 (declare (uses keys))
+(declare (uses debugprint))
+(declare (uses mtargs))
+(declare (uses mtargs.import))
+(declare (uses common))
+(declare (uses commonmod))
+(declare (uses commonmod.import))
+(import commonmod
+	(prefix mtargs args:)
+	debugprint)
 
 (include "common_records.scm")
 
 ;; return list (path fullpath configname)
 (define (find-config configname #!key (toppath #f))
@@ -98,10 +107,12 @@
 
 (define (configf:system ht cmd)
   (system cmd)
   )
 
+(define configf:imports "(import commonmod (prefix mtargs args:))")
+
 (define (configf:process-line l ht allow-system #!key (linenum #f))
   (let loop ((res l))
     (if (string? res)
 	(let ((matchdat (string-search configf:var-expand-regex res)))
 	  (if matchdat
@@ -111,11 +122,11 @@
 		     (poststr (list-ref matchdat 4))
 		     (result  #f)
 		     (start-time (current-seconds))
 		     (cmdsym  (string->symbol cmdtype))
 		     (fullcmd (case cmdsym
-				((scheme scm) (conc "(lambda (ht)" cmd ")"))
+				((scheme scm) (conc "(lambda (ht)(begin " configf:imports cmd "))"))
 				((system)     (conc "(lambda (ht)(configf:system ht \"" cmd "\"))"))
 				((shell sh)   (conc "(lambda (ht)(string-translate (shell \""  cmd "\") \"\n\" \" \"))"))
 				((realpath rp)(conc "(lambda (ht)(common:nice-path \"" cmd "\"))"))
 				((getenv gv)  (conc "(lambda (ht)(get-environment-variable \"" cmd "\"))"))
 				((mtrah)      (conc "(lambda (ht)"

Index: dashboard-context-menu.scm
==================================================================
--- dashboard-context-menu.scm
+++ dashboard-context-menu.scm
@@ -21,32 +21,38 @@
 ;;======================================================================
 ;; implementation of context menu that pops up on
 ;; right click on test cell in Runs & Runs Summary Tabs
 ;;======================================================================
 
+(declare (unit dashboard-context-menu))
+(declare (uses common))
+(declare (uses commonmod))
+(declare (uses db))
+(declare (uses gutils))
+(declare (uses rmt))
+(declare (uses rmtmod))
+(declare (uses ezsteps))
+;; (declare (uses sdb))
+;; (declare (uses filedb))
+(declare (uses subrun))
+
 (use format fmt)
 (require-library iup)
 (import (prefix iup iup:))
 
 (use canvas-draw)
 
 (use srfi-1 posix regex regex-case srfi-69)
 (use (prefix sqlite3 sqlite3:))
 
-(declare (unit dashboard-context-menu))
-(declare (uses common))
-(declare (uses db))
-(declare (uses gutils))
-(declare (uses rmt))
-(declare (uses ezsteps))
-;; (declare (uses sdb))
-;; (declare (uses filedb))
-(declare (uses subrun))
-
 (include "common_records.scm")
 (include "db_records.scm")
 (include "run_records.scm")
+
+(import commonmod
+	rmtmod
+	debugprint)
 
 (define (dboard:launch-testpanel run-id test-id)
   (let* ((dboardexe (common:find-local-megatest "dashboard"))
          (cmd (conc dboardexe
                     " -test " run-id "," test-id

Index: dashboard-guimonitor.scm
==================================================================
--- dashboard-guimonitor.scm
+++ dashboard-guimonitor.scm
@@ -34,10 +34,12 @@
 (declare (unit dashboard-guimonitor))
 (declare (uses common))
 (declare (uses keys))
 (declare (uses db))
 (declare (uses tasks))
+(declare (uses commonmod))
+(import commonmod)
 
 (include "common_records.scm")
 (include "db_records.scm")
 (include "run_records.scm")
 (include "task_records.scm")

Index: dashboard-tests.scm
==================================================================
--- dashboard-tests.scm
+++ dashboard-tests.scm
@@ -20,28 +20,33 @@
 
 ;;======================================================================
 ;; Test info panel
 ;;======================================================================
 
+(declare (unit dashboard-tests))
+(declare (uses common))
+(declare (uses commonmod))
+(declare (uses db))
+(declare (uses gutils))
+(declare (uses rmt))
+(declare (uses ezsteps))
+(declare (uses subrun))
+(declare (uses debugprint))
+(declare (uses rmtmod))
+
 (use format fmt)
 (require-library iup)
 (import (prefix iup iup:))
 
 (use canvas-draw)
 
 (use srfi-1 posix regex regex-case srfi-69)
 (use (prefix sqlite3 sqlite3:))
 
-(declare (unit dashboard-tests))
-(declare (uses common))
-(declare (uses db))
-(declare (uses gutils))
-(declare (uses rmt))
-(declare (uses ezsteps))
-;; (declare (uses sdb))
-;; (declare (uses filedb))
-(declare (uses subrun))
+(import commonmod
+	rmtmod
+	debugprint)
 
 (include "common_records.scm")
 (include "db_records.scm")
 (include "run_records.scm")
 
@@ -459,12 +464,11 @@
 ;;======================================================================
 ;;
 ;;======================================================================
 (define (dashboard-tests:examine-test run-id test-id) ;; run-id run-key origtest)
   (let* ((db-path       (db:dbfile-path)) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/db/" run-id ".db"))
-	 (dbstruct      #f) ;; NOT ACTUALLY USED (db:setup)) ;; (make-dbr:dbstruct path:  (db:dbfile-path #f) ;; (configf:lookup *configdat* "setup" "linktree") 
-			    ;;		   local: #t))
+	 (dbstruct      #f) ;; NOT USED
 	 (testdat        (rmt:get-test-info-by-id run-id test-id)) ;; (db:get-test-info-by-id dbstruct run-id test-id))
 	 (db-mod-time   0) ;; (file-modification-time db-path))
 	 (last-update   0) ;; (current-seconds))
 	 (request-update #t))
     (if (not testdat)

ADDED   dashboard-transport-mode.scm.template
Index: dashboard-transport-mode.scm.template
==================================================================
--- /dev/null
+++ dashboard-transport-mode.scm.template
@@ -0,0 +1,22 @@
+;;======================================================================
+;; set up transport, db cache and sync methods
+;;
+;; sync-method:        'original, 'attach or 'none
+;; cache-method:       'tmp or 'none
+;; rmt:transport-mode: 'http, 'tcp, 'nfs
+;;
+;; NOTE: NOT ALL COMBINATIONS WORK
+;;
+;;======================================================================
+
+;; uncomment this block to test without tcp or cachedb
+;; (dbfile:sync-method 'none)
+;; (dbfile:cache-method 'none)
+;; (rmt:transport-mode 'nfs)
+
+;; uncomment this block to test with tcp and cachedb
+(dbfile:sync-method 'none) ;; original was causing crash on start. 
+(dbfile:cache-method 'none)
+(rmt:transport-mode 'nfs)
+
+

Index: dashboard.scm
==================================================================
--- dashboard.scm
+++ dashboard.scm
@@ -16,26 +16,17 @@
 ;;     You should have received a copy of the GNU General Public License
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 ;;
 ;;======================================================================
 
-(use format)
-
-(require-library iup)
-(import (prefix iup iup:))
-
-(use canvas-draw)
-(import canvas-draw-iup)
-(use ducttape-lib)
-(use sqlite3 srfi-1 posix regex regex-case srfi-69 typed-records sparse-vectors) ;; defstruct
-(import (prefix sqlite3 sqlite3:))
-(import dbfile)
-
 (declare (uses common))
-(declare (uses margs))
+(declare (uses mtargs))
+(declare (uses mtargs.import))
 (declare (uses keys))
 (declare (uses items))
+(declare (uses debugprint))
+(declare (uses debugprint.import))
 (declare (uses db))
 (declare (uses configf))
 (declare (uses process))
 (declare (uses launch))
 (declare (uses runs))
@@ -44,21 +35,49 @@
 (declare (uses dcommon))
 (declare (uses dashboard-context-menu))
 (declare (uses vg))
 (declare (uses subrun))
 (declare (uses mt))
-(declare (uses dbfile))        
+(declare (uses dbmod))
+(declare (uses rmtmod))
+(declare (uses dbfile))
+(declare (uses commonmod))
+(declare (uses commonmod.import))
+
+(use format)
+
+(require-library iup)
+(import (prefix iup iup:))
+
+(use canvas-draw)
+(import canvas-draw-iup
+	(prefix sqlite3 sqlite3:))
+
+(use ducttape-lib)
+(use sqlite3 srfi-1 posix regex regex-case srfi-69 typed-records sparse-vectors) ;; defstruct
+
+(import commonmod
+	(prefix mtargs args:)
+	dbmod
+	dbfile
+	rmtmod
+	debugprint)
 
 (include "common_records.scm")
 (include "db_records.scm")
 (include "run_records.scm")
 (include "task_records.scm")
 (include "megatest-version.scm")
 (include "megatest-fossil-hash.scm")
 (include "vg_records.scm")
 
+;; set some parameters here - these need to be put in something that can be loaded from other
+;; executables such as dashboard and mtutil
+;;
+(include "dashboard-transport-mode.scm")
 (dbfile:db-init-proc db:initialize-main-db)
+(set! rmtmod:send-receive rmt:send-receive)
 
 (define help (conc 
 	      "Megatest Dashboard, documentation at http://www.kiatoa.com/fossils/megatest version " megatest-version 
               " license GPL, Copyright (C) Matt Welland 2012-2017
 
@@ -70,10 +89,11 @@
   -cols C         : set number of columns
   -start-dir dir  : start dashboard in the given directory
   -target target  : filter runs tab to given target.
   -debug  n[,n]   : set debug level(s) e.g. -debug 4 or -debug 0,9
   -repl           : Start a chicken scheme interpreter
+  -mode MODE      : tcp or nfs
 "
 ))
 
 
 ;; process args
@@ -84,10 +104,11 @@
 			"-cols"
 			"-test" ;; given a run id and test id, open only a test control panel on that test..
 			"-debug"
                         "-start-dir"
                         "-target"
+			"-mode"  ;; tcp or nfs
 			) 
                  ;; switches (don't take arguments)
 		 (list  "-h"
 			"-skip-version-check"
 			"-repl"
@@ -94,11 +115,17 @@
 			"-:p"     ;; ignore the built in chicken profiling switch
 			)
 		 args:arg-hash
 		 0))
 
-    
+(if (args:get-arg "-mode")
+    (let* ((mode (string->symbol (args:get-arg "-mode"))))
+      (rmt:transport-mode mode)))
+
+(if (args:get-arg "-test") ;; need to use tcp for test control panel
+    (rmt:transport-mode 'tcp))
+
 ;; RA => Might require revert for filters 
 ;; create a watch dog to move changes from lt/.db/*.db to megatest.db
 ;;
 ;;;(if (file-write-access? (conc *toppath* "/megatest.db"))
 ;;(debug:print-info 13 *default-log-port* "Before common:watchdog spawn")
@@ -204,11 +231,15 @@
    tabdat))
 
 ;; gets and calls updater list based on curr-tab-num
 ;;
 (define (dboard:common-run-curr-updaters commondat #!key (tab-num #f))
-  (sync-db-to-tmp (dboard:common-get-tabdat commondat tab-num: tab-num))
+  ;; (sync-db-to-tmp (dboard:common-get-tabdat commondat tab-num: tab-num)) ;; no longer applies
+
+  ;; maybe need sleep here?
+
+  
   (if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat
       (let* ((tnum     (or tab-num (dboard:commondat-curr-tab-num commondat)))
 	     (updaters (hash-table-ref/default (dboard:commondat-updaters commondat)
 					       tnum
 					       '())))
@@ -341,18 +372,18 @@
   tests-tree       ;; used in newdashboard
   )
 
 ;; register tabdat with BBpp
 ;; this is used by BBpp (Brandon's pretty printer) to convert dboard:tabdat into a composition of lists that pp will handle
-(hash-table-set! *BBpp_custom_expanders_list* TABDAT:
-                 (cons dboard:tabdat?
-                       (lambda (tabdat-item)
-                         (filter
-                          (lambda (alist-entry)
-                            (member (car alist-entry)
-                                    '(allruns-by-id allruns))) ;; FIELDS OF INTEREST
-                          (dboard:tabdat->alist tabdat-item)))))
+;; (hash-table-set! *BBpp_custom_expanders_list* TABDAT:
+;;                  (cons dboard:tabdat?
+;;                        (lambda (tabdat-item)
+;;                          (filter
+;;                           (lambda (alist-entry)
+;;                             (member (car alist-entry)
+;;                                     '(allruns-by-id allruns))) ;; FIELDS OF INTEREST
+;;                           (dboard:tabdat->alist tabdat-item)))))
 
 
 
 (define (dboard:tabdat-target-string vec)
   (let ((targ (dboard:tabdat-target vec)))
@@ -405,27 +436,29 @@
   (make-dboard:runsdat
    runs-index: (make-hash-table)
    tests-index: (make-hash-table)
    matrix-dat: (make-sparse-array)))
 
-;; used to keep the rundata from rmt:get-tests-for-run
-;; in sync. 
+;; duplicated in dcommon.scm
 ;;
-(defstruct dboard:rundat
-  run
-  tests-drawn    ;; list of id's already drawn on screen
-  tests-notdrawn ;; list of id's NOT already drawn
-  rowsused       ;; hash of lists covering what areas used - replace with quadtree
-  hierdat        ;; put hierarchial sorted list here
-  tests          ;; hash of id => testdat
-  ((tests-by-name (make-hash-table)) : hash-table) ;; hash of testfullname => testdat
-  key-vals
-  ((last-update   0)                 : number)    ;; last query to db got records from before last-update
-  ((last-db-time  0)                 : number)    ;; last timestamp on main.db
-  ((data-changed  #f)                : boolean)   
-  ((run-data-offset  0)              : number)      ;; get only 100 items per call, set back to zero when received less than 100 items
-  (db-path #f))
+;; ;; used to keep the rundata from rmt:get-tests-for-run
+;; ;; in sync. 
+;; ;;
+;; (defstruct dboard:rundat
+;;   run
+;;   tests-drawn    ;; list of id's already drawn on screen
+;;   tests-notdrawn ;; list of id's NOT already drawn
+;;   rowsused       ;; hash of lists covering what areas used - replace with quadtree
+;;   hierdat        ;; put hierarchial sorted list here
+;;   tests          ;; hash of id => testdat
+;;   ((tests-by-name (make-hash-table)) : hash-table) ;; hash of testfullname => testdat
+;;   key-vals
+;;   ((last-update   0)                 : number)    ;; last query to db got records from before last-update
+;;   ((last-db-time  0)                 : number)    ;; last timestamp on main.db
+;;   ((data-changed  #f)                : boolean)   
+;;   ((run-data-offset  0)              : number)      ;; get only 100 items per call, set back to zero when received less than 100 items
+;;   (db-path #f))
 
 ;; for the new runs view lets build up a few new record types and then consolidate later
 ;;
 ;; this is a two level deep pipeline for the incoming data:
 ;;   sql query data ==> filters ==> data for display
@@ -491,18 +524,18 @@
   duration
   )
 
 ;; register dboard:rundat with BBpp
 ;; this is used by BBpp (Brandon's pretty printer) to convert dboard:rundat into a composition of lists that pp will handle
-(hash-table-set! *BBpp_custom_expanders_list* RUNDAT:
-                 (cons dboard:rundat?
-                       (lambda (tabdat-item)
-                         (filter
-                          (lambda (alist-entry)
-                            (member (car alist-entry)
-                                    '(run run-data-offset ))) ;; FIELDS OF INTEREST
-                          (dboard:rundat->alist tabdat-item)))))
+;; (hash-table-set! *BBpp_custom_expanders_list* RUNDAT:
+;;                  (cons dboard:rundat?
+;;                        (lambda (tabdat-item)
+;;                          (filter
+;;                           (lambda (alist-entry)
+;;                             (member (car alist-entry)
+;;                                     '(run run-data-offset ))) ;; FIELDS OF INTEREST
+;;                           (dboard:rundat->alist tabdat-item)))))
 
 
 
 
 (define (dboard:rundat-make-init #!key (run #f)(key-vals #f)(tests #f));; -100 is before time began
@@ -640,11 +673,11 @@
 	 (access-mode  (dboard:tabdat-access-mode tabdat))
          (num-to-get   (string->number (or (configf:lookup *configdat* "setup" "num-tests-to-get")
                                            "200")))
 	 (states       (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat)))
 	 (statuses     (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat)))
-         (do-not-use-db-file-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab
+         (do-not-use-db-file-timestamps #f) ;; (configf:lookup *configdat* "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab
          (do-not-use-query-timestamps   #t) ;; (configf:lookup *configdat* "setup" "do-not-use-query-timestamps")) ;; this no longer troubles runs-summary-tab
 	 (sort-info    (get-curr-sort))
 	 (sort-by      (vector-ref sort-info 1))
 	 (sort-order   (vector-ref sort-info 2))
 	 (bubble-type  (if (member sort-order '(testname))
@@ -663,12 +696,12 @@
                         (dboard:rundat-last-update run-dat)))
 	 (last-db-time (if do-not-use-db-file-timestamps
 			   0
 			   (dboard:rundat-last-db-time run-dat)))
 	 (db-path      (or (dboard:rundat-db-path run-dat)
-			   (let* ((db-dir (common:get-db-tmp-area))
-				  (db-pth (conc db-dir "/.megatest/main.db")))
+			   (let* ((db-dir (get-environment-variable "MT_RUN_AREA_HOME")) ;;  (common:get-db-tmp-area))
+				  (db-pth (conc db-dir "/.mtdb/main.db")))
 			     (dboard:rundat-db-path-set! run-dat db-pth)
 			     db-pth)))
 	 (db-mod-time  (common:lazy-sqlite-db-modification-time db-path))
 	 (db-modified  (>= db-mod-time last-db-time))
 	 (multi-get    (> (dboard:rundat-run-data-offset run-dat) 0))  ;; multi-get in progress
@@ -1076,11 +1109,11 @@
 ;;  - not appropriate for where all the runs are needed
 ;;
 (define (update-buttons tabdat uidat numruns numtests)
   (let* ((runs        (if (> (length (dboard:tabdat-allruns tabdat)) numruns)
 			  (take-right (dboard:tabdat-allruns tabdat) numruns)
-			  (pad-list (dboard:tabdat-allruns tabdat) numruns)))
+			  (pad-list   (dboard:tabdat-allruns tabdat) numruns)))
 	 (lftcol      (dboard:uidat-get-lftcol uidat))
 	 (tableheader (dboard:uidat-get-header uidat))
 	 (table       (dboard:uidat-get-runsvec uidat))
 	 (coln        0)
 	 (all-test-names (make-hash-table))
@@ -3100,21 +3133,18 @@
   (hash-table-ref/default (dboard:tabdat-last-db-update tabdat) context 0))
 
 (define (dboard:set-last-db-update! tabdat context newtime)
   (hash-table-set! (dboard:tabdat-last-db-update tabdat) context newtime))
 
-;; DOES NOT WORK RELIABLY WITH /tmp WAL mode files. Timestamps only change when the db
-;; is closed (I think). If db dir starts with /tmp always return true
 ;;
 (define (dashboard:database-changed? commondat tabdat #!key (context-key 'default))
   (let* ((run-update-time (current-seconds))
-	 (dbdir           (dboard:tabdat-dbdir tabdat))
+         (dbdir           *toppath*)
 	 (modtime         (dashboard:get-youngest-run-db-mod-time dbdir))
 	 (recalc          (dashboard:recalc modtime 
 					    (dboard:commondat-please-update commondat) 
 					    (dboard:get-last-db-update tabdat context-key))))
-    ;; (dboard:tabdat-last-db-update tabdat))))
     (if recalc 
 	(dboard:set-last-db-update! tabdat context-key run-update-time))
     (dboard:commondat-please-update-set! commondat #f)
     recalc))
 
@@ -3792,11 +3822,11 @@
 (stop-the-train)
 
 (define (main)
   ;; (print "Starting dashboard main")
     
-  (let* ((mtdb-path (conc *toppath* "/.megatest/main.db"))
+  (let* ((mtdb-path (conc *toppath* "/.mtdb/main.db"))
          (target (args:get-arg "-target"))
          (commondat       (dboard:commondat-make)))
     (if target
         (begin
           (args:remove-arg-from-ht "-target")
@@ -3889,11 +3919,11 @@
 
 
 ;; Sync to tmp only if in read-only mode.
 
 (define (sync-db-to-tmp tabdat)
-  (let* ((db-file "./.megatest/main.db"))
+  (let* ((db-file "./.mtdb/main.db"))
     (if (and (not (file-write-access? db-file)) ( > (current-seconds) (+ last-copy-time 5)))
       (begin
         (db:multi-db-sync (db:setup #f) 'old2new)
         (set! last-copy-time (current-seconds))
       )

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -22,13 +22,28 @@
 ;; Database access
 ;;======================================================================
 
 ;; dbstruct vector containing all the relevant dbs like main.db, megatest.db, run.db etc
 
+(declare (unit db))
+(declare (uses common))
+(declare (uses debugprint))
+(declare (uses dbmod))
+(declare (uses dbfile))
+(declare (uses keys))
+(declare (uses ods))
+(declare (uses mt))
+(declare (uses commonmod))
+(declare (uses mtargs))
+(declare (uses rmtmod))
+
+(import commonmod
+	(prefix mtargs args:))
+
 (use (srfi 18)
      extras
-     tcp
+     ;; tcp
      stack
      (prefix sqlite3 sqlite3:)
      srfi-1
      posix
      regex
@@ -44,39 +59,37 @@
      z3
      typed-records
      matchable
      files)
 
-(declare (unit db))
-(declare (uses common))
-(declare (uses dbmod))
-;; (declare (uses debugprint))
-(declare (uses dbfile))
-(declare (uses keys))
-(declare (uses ods))
-(declare (uses client))
-(declare (uses mt))
-
 (include "common_records.scm")
 (include "db_records.scm")
 (include "key_records.scm")
 (include "run_records.scm")
 
 (define *number-of-writes* 0)
 (define *number-non-write-queries* 0)
 
-(import dbmod)
+(import debugprint)
 (import dbfile)
+(import dbmod)
+(import rmtmod)
 
 ;; record for keeping state,status and count for doing roll-ups in
 ;; iterated tests
 ;;
 (defstruct dbr:counts
   (state #f)
   (status #f)
   (count  0)) 
 
+;; (define (db:with-db dbstruct run-id r/w proc . params)
+;;   (case (rmt:transport-mode)
+;;     ((http)(dbfile:with-db dbstruct run-id r/w proc params))
+;;     ((tcp) (dbmod:with-db dbstruct run-id r/w proc params))
+;;     ((nfs) (dbmod:with-db dbstruct run-id r/w proc params))
+;;     (else (assert #f "FATAL: db:with-db called with non-existant transport mode"))))
 
 ;;======================================================================
 ;; hash of hashs
 ;;======================================================================
 
@@ -92,19 +105,10 @@
 (define (db:hoh-get dat key1 key2)
   (let* ((subhash (hash-table-ref/default dat key1 #f)))
     (and subhash
 	 (hash-table-ref/default subhash key2 #f))))
 
-(define (db:get-cache-stmth dbdat run-id db stmt)
-  (let* (;; (dbdat       (dbfile:get-dbdat dbstruct run-id))
-	 (stmt-cache  (dbr:dbdat-stmt-cache dbdat))
-	 (stmth       (db:hoh-get stmt-cache db stmt)))
-    (or stmth
-	(let* ((newstmth (sqlite3:prepare db stmt)))
-	  (db:hoh-set! stmt-cache db stmt newstmth)
-	  newstmth))))
-
 ;;======================================================================
 ;; SQLITE3 HELPERS
 ;;======================================================================
 
 (define (db:general-sqlite-error-dump exn stmt . params)
@@ -133,10 +137,83 @@
   (assert *toppath* "FATAL: db:setup called before launch:setup has been run.")
   (let* ((tmpdir (common:get-db-tmp-area)))
     (if (not *dbstruct-dbs*)
 	(dbfile:setup do-sync *toppath* tmpdir)
 	*dbstruct-dbs*)))
+
+;; moved from dbfile
+;;
+;; ADD run-id SUPPORT
+;;
+(define (db:create-all-triggers dbstruct)
+  (db:with-db
+   dbstruct #f #t
+   (lambda (dbdat db)
+     (db:create-triggers db))))
+
+(define (db:create-triggers db)
+    (for-each (lambda (key)
+              (sqlite3:execute db (cadr key)))
+          db:trigger-list))
+
+(define (db:drop-all-triggers dbstruct)
+  (db:with-db
+   dbstruct #f #t
+   (lambda (dbdat db)
+     (db:drop-triggers db))))
+
+(define (db:have-incompletes? dbstruct run-id ovr-deadtime)
+  (let* ((incompleted '())
+	 (oldlaunched '())
+	 (toplevels   '())
+	 ;; (deadtime-str (configf:lookup *configdat* "setup" "deadtime"))
+	 (deadtime  (or ovr-deadtime 72000))) ;; twenty hours
+    (db:with-db
+     dbstruct run-id #f
+     (lambda (dbdat db)
+       
+       ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
+       ;;
+       ;; HOWEVER: this code in run:test seems to work fine
+       ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
+       ;;                     (db:test-get-run_duration testdat)))
+       ;;                    600) 
+       ;; (db:delay-if-busy dbdat)
+       (sqlite3:for-each-row 
+        (lambda (test-id run-dir uname testname item-path)
+          (if (and (equal? uname "n/a")
+                   (equal? item-path "")) ;; this is a toplevel test
+              ;; what to do with toplevel? call rollup?
+              (begin
+                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels)))
+                ;; (print-info "Found old toplevel test in RUNNING state, test-id=" test-id))
+              (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))
+        (db:get-cache-stmth dbdat db
+        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');")
+        run-id deadtime)
+
+       ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
+       ;;
+       ;; (db:delay-if-busy dbdat)
+       (sqlite3:for-each-row
+        (lambda (test-id run-dir uname testname item-path)
+          (if (and (equal? uname "n/a")
+                   (equal? item-path "")) ;; this is a toplevel test
+              ;; what to do with toplevel? call rollup?
+              (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+              (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))
+        (db:get-cache-stmth dbdat db
+        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');")
+        run-id)
+       
+       ;; (print-info "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")
+       (if (and (null? incompleted)
+                (null? oldlaunched)
+                (null? toplevels))
+           #f
+           #t)))))
+
 
 ;; looks up subdb and returns it, if not found then set up
 ;; and then return it.
 ;;
 #;(define (db:get-db dbstruct run-id)
@@ -150,12 +227,12 @@
 
 ;; Get/open a database
 ;;    if run-id => get run specific db
 ;;    if #f     => get main db
 ;;    if run-id is a string treat it as a filename
-;;    if db already open - return inmem
-;;    if db not open, open inmem, rundb and sync then return inmem
+;;    if db already open - return cachedb
+;;    if db not open, open cachedb, rundb and sync then return cachedb
 ;;    inuse gets set automatically for rundb's
 ;;
 ;; (define db:get-db db:get-subdb)
 
 ;; (define (db:get-db subdb #;dbstruct run-id) ;; RENAME TO db:get-dbh
@@ -359,13 +436,15 @@
 (define (db:cache-for-read-only source target #!key (use-last-update #f))
   (if (and (hash-table-ref/default *global-db-store* target #f)
 	   (>= (file-modification-time target)(file-modification-time source)))
       (hash-table-ref *global-db-store* target)
       (let* ((toppath   (launch:setup))
-	     (targ-db-last-mod (if (common:file-exists? target)
-				   (file-modification-time target)
-				   0))
+	     (targ-db-last-mod (db:get-sqlite3-mod-time target))
+;;	      (if (common:file-exists? target)
+;; BUG: This needs to include wal mode stuff .shm etc.
+;;				   (file-modification-time target)
+;;				   0))
 	     (cache-db  (or (hash-table-ref/default *global-db-store* target #f)
 			    (db:open-megatest-db path: target)))
 	     (source-db (db:open-megatest-db path: source))
 	     (curr-time (current-seconds))
 	     (res      '())
@@ -373,95 +452,99 @@
 	(db:sync-tables (db:sync-main-list source-db) last-update source-db cache-db)
 	(db:sync-tables db:sync-tests-only last-update source-db cache-db)
 	(hash-table-set! *global-db-store* target cache-db)
 	cache-db)))
 
-;; ;; call a proc with a cached db
-;; ;;
-;; (define (db:call-with-cached-db proc . params)
-;;   ;; first cache the db in /tmp
-;;   (let* ((cname-part (conc "megatest_cache/" (common:get-testsuite-name)))
-;; 	 (fname      (conc  (common:get-area-path-signature) ".db"))
-;; 	 (cache-dir  (common:get-create-writeable-dir
-;; 		      (list (conc "/tmp/" (current-user-name) "/" cname-part)
-;; 			    (conc "/tmp/" (current-user-name) "-" cname-part)
-;; 			     (conc "/tmp/" (current-user-name) "_" cname-part))))
-;; 	 (megatest-db (conc *toppath* "/megatest.db")))
-;;     ;; (debug:print-info 0 *default-log-port* "Using cache dir " cache-dir)
-;;     (if (not cache-dir)
-;; 	(begin
-;; 	  (debug:print 0 *default-log-port* "ERROR: Failed to find an area to write the cache db")
-;; 	  (exit 1))
-;; 	(let* ((th1      (make-thread
-;; 			  (lambda ()
-;; 			    (if (and (common:file-exists? megatest-db)
-;; 				     (file-write-access? megatest-db))
-;; 				(begin
-;; 				  (db:sync-to-megatest.db dbstruct 'timestamps) ;; internally mutexes on *db-local-sync*
-;; 				  (debug:print-info 2 *default-log-port* "Done syncing to megatest.db")))) 
-;; 			  "call-with-cached-db sync-to-megatest.db"))
-;; 	       (cache-db (db:cache-for-read-only
-;; 			  megatest-db
-;; 			  (conc cache-dir "/" fname)
-;; 			  use-last-update: #t)))
-;; 	  (thread-start! th1)
-;; 	  (apply proc cache-db params)
-;; 	  ))))
-
-
-
-
-(define (db:all-db-sync dbstruct)
-  (let* ((dbdat (db:open-db dbstruct #f db:initialize-main-db))
-	 (data-synced       0) ;; count of changed records
-    (tmp-area       (common:get-db-tmp-area))
-    (dbfiles        (glob (conc tmp-area"/.megatest/*.db")))
-    (sync-durations (make-hash-table))
-    (no-sync-db        (db:open-no-sync-db)))
+(define (db:get-sqlite3-mod-time fname)
+  (let* ((wal-file (conc fname "-wal"))
+	 (shm-file (conc fname "-shm"))
+	 (get-mtime (lambda (f)
+		      (if (and (file-exists? f)
+			       (file-read-access? f))
+			  (file-modification-time f)
+			  0))))
+    (max (get-mtime fname)
+	 (get-mtime wal-file)
+	 (get-mtime shm-file))))
+	 
+;; (define (db:all-db-sync dbstruct)
+;;   (let* ((dbdat (db:open-db dbstruct #f db:initialize-main-db))
+;; 	 (data-synced       0) ;; count of changed records
+;;     (tmp-area       (common:get-db-tmp-area))
+;;     (dbfiles        (glob (conc tmp-area"/.mtdb/*.db")))
+;;     (sync-durations (make-hash-table))
+;;     (no-sync-db        (db:open-no-sync-db)))
+;;     (for-each
+;;      (lambda (file) ;; tmp db file
+;;        (debug:print-info 3 *default-log-port* "file: " file)
+;;        (let* ((fname       (conc (pathname-file file) ".db")) ;; fname is tmp db file
+;;               (wal-file (conc fname "-wal"))
+;;               (shm-file (conc fname "-shm"))
+;; 	      (fulln       (conc *toppath*"/,mtdb/"fname)) ;; fulln is nfs db name
+;;               (wal-time     (if (file-exists? wal-file)             
+;; 			       (file-modification-time wal-file)
+;;                                0))
+;;               (shm-time     (if (file-exists? shm-file)             
+;; 			       (file-modification-time shm-file)
+;;                                0))
+;; 	      (time1        (db:get-sqlite3-mod-time file))
+;; ;;	       (if (file-exists? file)              ;; time1 is the max itime of the tmp db, -wal and -shm files.
+;; ;;			       (max (file-modification-time file) wal-time shm-time)
+;; ;;			       (begin
+;; ;;				 (debug:print-info 2 *default-log-port* "Sync - I do not see file "file)
+;; ;;			   1)))
+;; 	      (time2       (db:get-sqlite3-mod-time fulln))
+;; ;;	      (if (file-exists? fulln)             ;; time2 is nfs file time
+;; ;;			       (file-modification-time fulln)
+;; ;;			       (begin
+;; ;;				 (debug:print-info 2 *default-log-port* "Sync - I do not see file "fulln)
+;; ;;				 0)))
+;; 	      (changed      (> (- time1 time2) (+ (random 5) 1)))  ;; it has been at some few seconds since last synced
+;; 	      (changed10    (> (- time1 time2) 10)) ;; it has been at least ten seconds since sync'd
+;; 	      (jfile-exists (file-exists? (conc file"-journal"))) ;; i.e. are we busy?
+;; 	      (do-cp        (cond
+;; 			     ((not (file-exists? fulln)) ;; shouldn't happen, but this might recover
+;; 			      (cons #t (conc "File "fulln" not found! Copying "fname" to "fulln)))
+;; 			     ((and (not jfile-exists) changed)
+;; 			      (cons #t "not busy, changed")) ;; not busy and changed
+;; 			     ((and jfile-exists changed10)
+;; 			      (cons #t "busy but not synced in a while")) ;; busy but not sync'd in over 10 seconds
+;; 			     ((and changed *time-to-exit*)
+;; 			      (cons #t "Time to exit, forced final sync")) ;; last sync
+;; 			     (else
+;; 			      (cons #f "No sync needed")))))
+;; 	 (if (car do-cp)
+;; 	     (let* ((start-time (current-milliseconds))
+;; 		    (fname (pathname-file file))
+;; 		    (runid (if (string= fname "main") #f (string->number fname))))
+;; 	       (debug:print-info 3 *default-log-port* "db:all-db-sync: fname: "
+;; 				 fname", delta: " (- time1 time2) " seconds, reason: "(cdr do-cp))
+;; 	       (db:lock-and-delta-sync no-sync-db dbstruct fname runid (db:get-keys dbstruct) db:initialize-main-db)
+;; 	       (hash-table-set! sync-durations (conc fname".db")
+;; 				(- (current-milliseconds) start-time)))
+;; 	     (debug:print-info 3 *default-log-port* "skipping sync. " file " is up to date")
+;;          )))
+;;      dbfiles)
+;;     ;; WHY does the dbdat need to be added back?
+;;     (if dbdat (dbfile:add-dbdat dbstruct #f dbdat))
+;;     )
+;;   #t)
+
+(define (db:kill-servers)
+  (let* ((servers (server:choose-server *toppath* 'all-valid))) ;; (server:get-list *toppath*))
     (for-each
-     (lambda (file)
-       (debug:print-info 3 *default-log-port* "file: " file)
-       (let* ((fname       (conc (pathname-file file) ".db"))
-	      (fulln       (conc *toppath*"/.megatest/"fname))
-	      (time1       (if (file-exists? file)
-			       (file-modification-time file)
-			       (begin
-				 (debug:print-info 2 *default-log-port* "Sync - I do not see file "file)
-			   1)))
-	      (time2       (if (file-exists? fulln)
-			       (file-modification-time fulln)
-			       (begin
-				 (debug:print-info 2 *default-log-port* "Sync - I do not see file "fulln)
-				 0)))
-	      (changed      (> (- time1 time2) (+ (random 5) 1)))  ;; it has been at some few seconds since last synced
-	      (changed10    (> (- time1 time2) 10)) ;; it has been at least ten seconds since sync'd
-	      (jfile-exists (file-exists? (conc file"-journal"))) ;; i.e. are we busy?
-	      (do-cp        (cond
-			     ((not (file-exists? fulln)) ;; shouldn't happen, but this might recover
-			      (cons #t (conc "File "fulln" not found! Copying "fname" to "fulln)))
-			     ((and (not jfile-exists) changed)
-			      (cons #t "not busy, changed")) ;; not busy and changed
-			     ((and jfile-exists changed10)
-			      (cons #t "busy but not synced in a while")) ;; busy but not sync'd in over 10 seconds
-			     ((and changed *time-to-exit*)
-			      (cons #t "Time to exit, forced final sync")) ;; last sync
-			     (else
-			      (cons #f "No sync needed")))))
-	 (if (car do-cp)
-	     (let* ((start-time (current-milliseconds))
-		    (fname (pathname-file file))
-		    (runid (if (string= fname "main") #f (string->number fname))))
-	       (debug:print-info 3 *default-log-port* "db:all-db-sync: fname: "
-				 fname", delta: " (- time1 time2) " seconds, reason: "(cdr do-cp))
-	       (db:lock-and-delta-sync no-sync-db dbstruct fname runid (db:get-keys dbstruct) db:initialize-main-db)
-	       (hash-table-set! sync-durations (conc fname".db")
-				(- (current-milliseconds) start-time)))
-	     (debug:print-info 3 *default-log-port* "skipping sync. " file " is up to date")
-         )))
-     dbfiles)
-    (if dbdat (dbfile:add-dbdat dbstruct #f dbdat)))
-  #t)
+     (lambda (server)
+       (handle-exceptions
+           exn
+         (begin 
+           (debug:print-info 0 *default-log-port*  "Unable to get host and/or port from " server ", exn=" exn)     
+           #f)
+	 (match-let (((mod-time host port start-time server-id pid) server))
+		    (if (and host pid)
+			(tasks:kill-server host pid)))))
+     servers)
+    (delete-file* (common:get-sync-lock-filepath))))
 
 ;; options:
 ;;
 ;;  'killservers  - kills all servers
 ;;  'dejunk       - removes junk records
@@ -473,117 +556,97 @@
 ;;  run-ids: '(1 2 3 ...) or #f (for all)
 ;;
 (define (db:multi-db-sync dbstruct . options)
   (let* (;; (dbdat       (db:open-db dbstruct #f dbfile:db-init-proc))
 	 (data-synced 0) ;; count of changed records
-    (tmp-area       (common:get-db-tmp-area))
-    (old2new (member 'old2new options))
-    (dejunk (member 'dejunk options))
-    (killservers (member 'killservers options))
-    (servers (server:choose-server *toppath* 'all-valid)) ;; (server:get-list *toppath*))
-    (src-area (if old2new *toppath* tmp-area))
-    (dest-area (if old2new tmp-area *toppath*))
-    (dbfiles        (if old2new (glob (conc *toppath* "/.megatest/*.db")) (glob (conc tmp-area "/.megatest/*.db"))))
-    (keys (db:get-keys dbstruct))
-    (sync-durations (make-hash-table)))
-
-
-    (if killservers
-      (begin
-       	  (for-each
-	   (lambda (server)
-             (handle-exceptions
-             exn
-             (begin 
-               (debug:print-info 0 *default-log-port*  "Unable to get host and/or port from " server ", exn=" exn)     
-               #f)
-	     (match-let (((mod-time host port start-time server-id pid) server))
-	       (if (and host pid)
-		   (tasks:kill-server host pid)))))
-	   servers)
-          (delete-file* (common:get-sync-lock-filepath))
-      )
-    )
-    (for-each
-     (lambda (srcfile)
-       (debug:print-info 3 *default-log-port* "file: " srcfile)
-       (let* ((fname (conc (pathname-file srcfile) ".db"))
-              (basename (pathname-file srcfile))
-              (run-id (if (string= basename "main") #f (string->number basename)))
-	      (destfile (conc dest-area "/.megatest/" fname))
-              (dest-directory  (conc dest-area "/.megatest/"))
-              (dummy (debug:print-info 2 *default-log-port* "destfile = " destfile))
-              (dummy2 (debug:print-info 2 *default-log-port* "dejunk = " dejunk))
-	      (time1 (file-modification-time srcfile))
-              (time2 (if (file-exists? destfile)
-                         (begin
-                            (debug:print-info 2 *default-log-port* "destfile " destfile " exists")
-			    (file-modification-time destfile)
-                         )
-			 (begin
-			   (debug:print-info 0 *default-log-port* "Sync - I do not see file " destfile)
-			   0)))
-	      (changed ( < (- time2 time1) 6.0)) ;; dest db not updated within last 6 seconds
-
-      (do-cp (cond
-		      ((not (file-exists? destfile)) ;; shouldn't happen, but this might recover
-		       (debug:print-info 2 *default-log-port* "File " destfile " not found. Copying "srcfile" to "destfile)
-                       (system (conc "/bin/mkdir -p " dest-directory))
-                       (system (conc "/bin/cp " srcfile " " destfile))
-		       #t)
-		      (changed ;; (and changed
-		       ;; (> (- (current-seconds) time1) 3)) ;; if file is changed and three seconds have passed.
-		       #t)
-		      ((and changed *time-to-exit*) ;; last sync
-		       #t)
-		      (else
-		       #f))))
-          (if (or dejunk do-cp)
-	     (let* (
-                    (start-time (current-milliseconds))
-
-                    (subdb (or (dbfile:get-subdb dbstruct run-id) (dbfile:init-subdb dbstruct run-id dbfile:db-init-proc)))
-                    (mtdb      (dbr:subdb-mtdbdat subdb))
-                    (tmpdb     (dbfile:open-db dbstruct run-id dbfile:db-init-proc))
-
-                    )
-	       (debug:print-info 2 *default-log-port* "delta syncing file: " srcfile ", time diff: " (- time1 time2) " seconds")
-
-               (if old2new
-                 (begin
-                   (if dejunk (db:clean-up run-id mtdb))
-		   (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) #f mtdb tmpdb)
-                 )
-                 (begin
-                   (if dejunk (db:clean-up run-id tmpdb))
-		   (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) #f tmpdb mtdb)
-                 )
-               )
-	       (hash-table-set! sync-durations (conc srcfile ".db") (- (current-milliseconds) start-time)))
-	     (debug:print-info 2 *default-log-port* "skipping delta sync. " srcfile " is up to date")
-          )
-       )
-     )
-     dbfiles
-    )
-    data-synced
-  )
-)
+	 (tmp-area       (common:get-db-tmp-area))
+	 (old2new (member 'old2new options))
+	 (dejunk (member 'dejunk options))
+	 (killservers (member 'killservers options))
+	 (src-area (if old2new *toppath* tmp-area))
+	 (dest-area (if old2new tmp-area *toppath*))
+	 (dbfiles        (if old2new (glob (conc *toppath* "/.mtdb/*.db")) (glob (conc tmp-area "/.mtdt/*.db"))))
+	 (keys (db:get-keys dbstruct))
+	 (sync-durations (make-hash-table)))
+
+    ;; kill servers
+    (if killservers (db:kill-servers))
+    
+    (if (not dbfiles)
+	(debug:print-error 0 *default-log-port* "no dbfiles found in " (conc *toppath* "/.mtdb"))
+	(for-each
+	 (lambda (srcfile)
+	   (debug:print-info 3 *default-log-port* "file: " srcfile)
+	   (let* ((fname    (conc (pathname-file srcfile) ".db"))
+		  (basename (pathname-file srcfile))
+		  (run-id   (if (string= basename "main") #f (string->number basename)))
+		  (destfile (conc dest-area "/.mtdb/" fname))
+		  (dest-directory  (conc dest-area "/.mtdb/"))
+		  (time1    (file-modification-time srcfile))
+		  (time2    (if (file-exists? destfile)
+				(begin
+				  (debug:print-info 2 *default-log-port* "destfile " destfile " exists")
+				  (file-modification-time destfile))
+				(begin
+				  (debug:print-info 0 *default-log-port* "Sync - I do not see file " destfile)
+				  0)))
+		  (changed   ( < (- time2 time1) 6.0)) ;; dest db not updated within last 6 seconds
+
+		  (do-cp     (cond
+			      ((not (file-exists? destfile)) ;; shouldn't happen, but this might recover
+			       (debug:print-info 2 *default-log-port* "File " destfile " not found. Copying "srcfile" to "destfile)
+			       ;; TODO: Need to fix this for WAL mod. Can't just copy.
+			       (system (conc "/bin/mkdir -p " dest-directory))
+			       (system (conc "/bin/cp " srcfile " " destfile))
+			       #t)
+			      (changed ;; (and changed
+			       #t)
+			      ((and changed *time-to-exit*) ;; last sync
+			       #t)
+			      (else
+			       #f))))
+             (if (or dejunk do-cp)
+		 (let* ((start-time (current-milliseconds))
+			;; subdb is misnamed - should be dbdat (I think...)
+			(subdb    (dbfile:open-db dbstruct run-id dbfile:db-init-proc))
+			;;        (or (dbfile:get-subdb dbstruct run-id)
+			;;            (dbfile:init-subdb dbstruct run-id dbfile:db-init-proc)))
+			(mtdb      (dbr:subdb-mtdbdat subdb))
+			;;
+			;; BUG: -mrw- I think this next line is wrong. run-id should be the path to .mtdb/<runid>.db
+			;; 
+			(tmpdb     (dbfile:open-db dbstruct run-id dbfile:db-init-proc)))
+		   
+		   (debug:print-info 2 *default-log-port* "delta syncing file: " srcfile ", time diff: " (- time1 time2) " seconds")
+		   (if old2new
+                       (begin
+			 (if dejunk (db:clean-up run-id mtdb))
+			 (db:sync-tables (db:sync-all-tables-list
+					  dbstruct
+					  (db:get-keys dbstruct))
+					 #f mtdb tmpdb))
+                       (begin
+			 (if dejunk (db:clean-up run-id tmpdb))
+			 (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) #f tmpdb mtdb)))
+		   (hash-table-set! sync-durations (conc srcfile ".db") (- (current-milliseconds) start-time)))
+		 (debug:print-info 2 *default-log-port* "skipping delta sync. " srcfile " is up to date"))))
+	 dbfiles))
+    data-synced))
 
 ;; Sync all changed db's
 ;;
 (define (db:tmp->megatest.db-sync dbstruct run-id last-update)
   (let* ((subdbs (hash-table-values (dbr:dbstruct-subdbs dbstruct)))
 	 (res    '()))
     (for-each
      (lambda (subdb)
-       (let* ((dbname (db:run-id->dbname run-id))
-	      (mtdb   (dbr:subdb-mtdb subdb))
+       (let* ((mtdb   (dbr:subdb-mtdb subdb))
 	      (tmpdb  (db:get-subdb dbstruct run-id))
 	      (refndb (dbr:subdb-refndb subdb))
 	      (newres (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) last-update tmpdb refndb mtdb)))
 	 ;; (stack-push! (dbr:subdb-dbstack subdb) tmpdb)
+	 ;; BUG: verify this is really needed
 	 (dbfile:add-dbdat dbstruct run-id tmpdb)
 	 (set! res (cons newres res))))
      subdbs)
     res))
 
@@ -648,13 +711,23 @@
       ;; exn
       ;; (begin
       ;;   (debug:print 0 "ERROR: Failed to create tables. Look at your [fields] section, should be: fieldname TEXT DEFAULT 'yourdefault'")
       ;;   (exit))
 	(sqlite3:execute db "CREATE TABLE IF NOT EXISTS keys (id INTEGER PRIMARY KEY, fieldname TEXT, fieldtype TEXT, CONSTRAINT keyconstraint UNIQUE (fieldname));")
-	(for-each (lambda (key)
-		    (sqlite3:execute db "INSERT OR REPLACE INTO keys (fieldname,fieldtype) VALUES (?,?);" key "TEXT"))
-		  keys)
+	(for-each
+	 (lambda (key)
+	   (let* ((fieldname #f)
+		  (fieldtype #f))
+	     (sqlite3:for-each-row
+	      (lambda (fn ft)
+		(set! fieldname fn)
+		(set! fieldtype ft))
+	      db
+	      "SELECT fieldname,fieldtype FROM keys WHERE fieldname=?" key)
+	     (if (not fieldname)
+		 (sqlite3:execute db "INSERT INTO keys (fieldname,fieldtype) VALUES (?,?);" key "TEXT"))))
+	 keys)
 	(sqlite3:execute db (conc 
 			     "CREATE TABLE IF NOT EXISTS runs (id INTEGER PRIMARY KEY, \n			 " 
 			     fieldstr (if havekeys "," "") "
 			 runname    TEXT DEFAULT 'norun',
                          contour    TEXT DEFAULT '',
@@ -745,11 +818,21 @@
 	(sqlite3:execute db "CREATE TABLE IF NOT EXISTS metadat (id INTEGER PRIMARY KEY, var TEXT, val TEXT,
                                   CONSTRAINT metadat_constraint UNIQUE (var));")
 	(sqlite3:execute db "CREATE TABLE IF NOT EXISTS access_log (id INTEGER PRIMARY KEY, user TEXT, accessed TIMESTAMP, args TEXT);")
 	;; Must do this *after* running patch db !! No more. 
 	;; cannot use db:set-var since it will deadlock, hardwire the code here
-	(sqlite3:execute db "INSERT OR REPLACE INTO metadat (var,val) VALUES (?,?);" "MEGATEST_VERSION" (common:version-signature))
+	(let* ((prev-version #f)
+	       (curr-version (common:version-signature)))
+	  (sqlite3:for-each-row
+	   (lambda (ver)
+	     (set! prev-version ver))
+	   db
+	   "SELECT val FROM metadat WHERE var='MEGATEST_VERSION';")
+	  (if prev-version
+	      (if (not (equal? prev-version curr-version))
+		  (sqlite3:execute db "UPDATE metadat SET val=? WHERE var=?;" curr-version "MEGATEST_VERSION"))
+	      (sqlite3:execute db "INSERT OR REPLACE INTO metadat (var,val) VALUES (?,?);" "MEGATEST_VERSION" curr-version) ))
 	(debug:print-info 11 *default-log-port* "db:initialize END") ;; ))))
 
 	;;======================================================================
 	;; R U N   S P E C I F I C   D B 
 	;;======================================================================
@@ -840,19 +923,21 @@
                               test_id      INTEGER,
                               update_time  TIMESTAMP,
                               cpuload      INTEGER DEFAULT -1,
                               diskfree     INTEGER DEFAULT -1,
                               diskusage    INTGER DEFAULT -1,
-                              run_duration INTEGER DEFAULT 0);")
+                              run_duration INTEGER DEFAULT 0,
+ last_update INTEGER DEFAULT (strftime('%s','now')));")
 	(sqlite3:execute db "CREATE TABLE IF NOT EXISTS archives (
                               id           INTEGER PRIMARY KEY,
                               test_id      INTEGER,
                               state        TEXT DEFAULT 'new',
                               status       TEXT DEFAULT 'n/a',
                               archive_type TEXT DEFAULT 'bup',
                               du           INTEGER,
-                              archive_path TEXT);")))
+                              archive_path TEXT,
+ last_update INTEGER DEFAULT (strftime('%s','now')));")))
         (db:create-triggers db)    
      db)) ;; )
 
 ;;======================================================================
 ;; A R C H I V E S
@@ -886,10 +971,11 @@
 	  "SELECT d.id,d.archive_area_name,disk_path,last_df,last_df_time FROM archive_disks AS d
              INNER JOIN archive_blocks AS b ON d.id=b.archive_disk_id
              WHERE b.id IN (" (string-intersperse (map conc res) ",") ") AND
          last_df > ?;")
 	 dneeded))
+    ;; BUG: Verfify this is really needed
     (dbfile:add-dbdat dbstruct #f dbdat)
     blocks))
     
 ;; returns id of the record, register a disk allocated to archiving and record it's last known
 ;; available space
@@ -952,11 +1038,11 @@
 ;;
 (define (db:test-set-archive-block-id dbstruct run-id test-id archive-block-id)
   (db:with-db
    dbstruct
    run-id
-   #f
+   #t
    (lambda (dbdat db)
      (sqlite3:execute db "UPDATE tests SET archived=? WHERE id=?;"
 		      archive-block-id test-id))))
  
 ;; Look up the archive block info given a block-id
@@ -990,220 +1076,64 @@
 
 ;;======================================================================
 ;; M A I N T E N A N C E
 ;;======================================================================
 
-(define (db:have-incompletes? dbstruct run-id ovr-deadtime)
-  (let* ((incompleted '())
-	 (oldlaunched '())
-	 (toplevels   '())
-	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime"))
-	 (deadtime     (if (and deadtime-str
-				(string->number deadtime-str))
-			   (string->number deadtime-str)
-			   72000))) ;; twenty hours
-    (db:with-db
-     dbstruct run-id #f
-     (lambda (dbdat db)
-       (if (number? ovr-deadtime)(set! deadtime ovr-deadtime))
-       
-       ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
-       ;;
-       ;; HOWEVER: this code in run:test seems to work fine
-       ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
-       ;;                     (db:test-get-run_duration testdat)))
-       ;;                    600) 
-       ;; (db:delay-if-busy dbdat)
-       (sqlite3:for-each-row 
-        (lambda (test-id run-dir uname testname item-path)
-          (if (and (equal? uname "n/a")
-                   (equal? item-path "")) ;; this is a toplevel test
-              ;; what to do with toplevel? call rollup?
-              (begin
-                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
-                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
-              (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))
-        db
-        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');"
-        run-id deadtime)
-
-       ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
-       ;;
-       ;; (db:delay-if-busy dbdat)
-       (sqlite3:for-each-row
-        (lambda (test-id run-dir uname testname item-path)
-          (if (and (equal? uname "n/a")
-                   (equal? item-path "")) ;; this is a toplevel test
-              ;; what to do with toplevel? call rollup?
-              (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
-              (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))
-        db
-        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');"
-        run-id)
-       
-       (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")
-       (if (and (null? incompleted)
-                (null? oldlaunched)
-                (null? toplevels))
-           #f
-           #t)))))
-
-(define (db:get-status-from-final-status-file run-dir)
-  (let ((infile (conc run-dir "/.final-status")))
-    ;; first verify we are able to write the output file
-    (if (not (file-read-access? infile))
-        (begin 
-	  (debug:print 2 *default-log-port* "ERROR: cannot read " infile)
-          (debug:print 2 *default-log-port* "ERROR: run-dir is " run-dir)
-          #f
-          )
-        (with-input-from-file infile read-lines)
-	)))
-
-;;  select end_time-now from
-;;      (select testname,item_path,event_time+run_duration as
-;;                          end_time,strftime('%s','now') as now from tests where state in
-;;      ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));
-
-(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
-  (let* ((incompleted '())
-	 (oldlaunched '())
-	 (toplevels   '())
-          ;; The default running-deadtime is 720 seconds = 12 minutes.
-          ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30))
-         (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime")))
-         (server-start-allowance 200)
-         (server-overloaded-budget 200)
-         (launch-monitor-off-time (or (configf:lookup-number *configdat* "setup" "test-stats-update-period") 30))
-         (launch-monitor-on-time-budget 30)
-         (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget))
-         (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30))
-         (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default))
-         (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))
-         (running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period)
-         )
-    (debug:print-info 4  *default-log-port* "running-deadtime = " running-deadtime)
-    (debug:print-info 4  *default-log-port* "deadtime-trim = " deadtime-trim)
-
-    (db:with-db 
-     dbstruct run-id #f
-     (lambda (dbdat db)
-       (let* ((stmth1 (db:get-cache-stmth
-		       dbdat run-id db
-		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
-                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
-                                          AND state IN ('RUNNING');"))
-	      (stmth2 (db:get-cache-stmth
-		       dbdat run-id db
-		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
-                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
-                                          AND state IN ('REMOTEHOSTSTART');"))
-	      (stmth3 (db:get-cache-stmth
-		       dbdat run-id db
-		       "SELECT id,rundir,uname,testname,item_path FROM tests
-                           WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400
-                                          AND state IN ('LAUNCHED');")))
-	 ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
-	 ;;
-	 ;; HOWEVER: this code in run:test seems to work fine
-	 ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
-	 ;;                     (db:test-get-run_duration testdat)))
-	 ;;                    600) 
-	 ;; (db:delay-if-busy dbdat)
-	 (sqlite3:for-each-row 
-	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
-	    (if (and (equal? uname "n/a")
-		     (equal? item-path "")) ;; this is a toplevel test
-		;; what to do with toplevel? call rollup?
-		(begin
-		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
-		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
-		(begin
-		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
-		  (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
-				    test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
-				    " event-time="event-time" run-duration="run-duration))))
-	  stmth1
-	  run-id running-deadtime) ;; default time 720 seconds
-       
-	 (sqlite3:for-each-row 
-	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
-	    (if (and (equal? uname "n/a")
-		     (equal? item-path "")) ;; this is a toplevel test
-		;; what to do with toplevel? call rollup?
-		(begin
-		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
-		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
-		(begin
-		  (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
-				    " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
-				    " run-duration="run-duration)
-		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
-	  stmth2
-	  run-id remotehoststart-deadtime) ;; default time 230 seconds
-	 
-	 ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
-	 ;;
-	 ;; (db:delay-if-busy dbdat)
-	 (sqlite3:for-each-row
-	  (lambda (test-id run-dir uname testname item-path)
-	    (if (and (equal? uname "n/a")
-		     (equal? item-path "")) ;; this is a toplevel test
-		;; what to do with toplevel? call rollup?
-		(set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
-		(begin
-		  (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id
-				    " 1 day since event_time marked")
-                (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))))
-	  stmth3
-	  run-id)
-       
-	 (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, "
-			   (length toplevels) " old LAUNCHED toplevel tests and "
-			   (length incompleted) " tests marked RUNNING but apparently dead."))
-
-	 ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
-       ;;
-	 ;; (db:delay-if-busy dbdat)
-	 (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all
-		(all-ids             (append min-incompleted-ids (map car oldlaunched))))
-	   (if (> (length all-ids) 0)
-	       (begin
-		 ;; (launch:is-test-alive "localhost" 435)
-		 (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ")
-			      " as DEAD")
-		 (for-each
-                  (lambda (test-id)
-                    (let* (;; (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id))
-			   (tinfo   (db:get-test-info-by-id dbstruct run-id test-id))
-			   (run-dir (db:test-get-rundir     tinfo))
-			   (host    (db:test-get-host       tinfo))
-			   (pid     (db:test-get-process_id tinfo))
-			   (result (db:get-status-from-final-status-file run-dir)))
-		      (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) 
-			  (begin
-			    (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD")
-			    (db:set-state-status-and-roll-up-items
-			     dbstruct run-id test-id 'foo "COMPLETED" "PASS"
-			     "Test stopped responding but it has PASSED; marking it PASS in the DB."))
-			  (let ((is-alive (and (not (eq? pid 0))  ;; 0 is default in re-used field "attemptnum" where pid stored.
-					       (launch:is-test-alive host pid))))
-			    (if is-alive
-				(debug:print 0 *default-log-port* "INFO: test " test-id " on host " host
-					     " has a process on pid " pid ", NOT setting to DEAD.")
-				(begin
-				  (debug:print 0 *default-log-port* "INFO: test " test-id
-					       " final state/status is not COMPLETED/PASS. It is " result)
-				  (db:set-state-status-and-roll-up-items
-				   dbstruct run-id test-id 'foo "COMPLETED" "DEAD"
-				   "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
-		  ;; call end of eud of run detection for posthook - from merge, is it needed?
-		  ;; (launch:end-of-run-check run-id)
-		  all-ids)
-		 ;;call end of eud of run detection for posthook
-		 (launch:end-of-run-check run-id)
-		 )))))))
+;; (define (db:have-incompletes? dbstruct run-id ovr-deadtime)
+;;   (let* ((incompleted '())
+;; 	 (oldlaunched '())
+;; 	 (toplevels   '())
+;; 	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime"))
+;; 	 (deadtime     (if (and deadtime-str
+;; 				(string->number deadtime-str))
+;; 			   (string->number deadtime-str)
+;; 			   72000))) ;; twenty hours
+;;     (db:with-db
+;;      dbstruct run-id #f
+;;      (lambda (dbdat db)
+;;        (if (number? ovr-deadtime)(set! deadtime ovr-deadtime))
+;;        
+;;        ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
+;;        ;;
+;;        ;; HOWEVER: this code in run:test seems to work fine
+;;        ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
+;;        ;;                     (db:test-get-run_duration testdat)))
+;;        ;;                    600) 
+;;        ;; (db:delay-if-busy dbdat)
+;;        (sqlite3:for-each-row 
+;;         (lambda (test-id run-dir uname testname item-path)
+;;           (if (and (equal? uname "n/a")
+;;                    (equal? item-path "")) ;; this is a toplevel test
+;;               ;; what to do with toplevel? call rollup?
+;;               (begin
+;;                 (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+;;                 (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+;;               (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))
+;;         (db:get-cache-stmth dbdat db
+;;         "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');")
+;;         run-id deadtime)
+;; 
+;;        ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
+;;        ;;
+;;        ;; (db:delay-if-busy dbdat)
+;;        (sqlite3:for-each-row
+;;         (lambda (test-id run-dir uname testname item-path)
+;;           (if (and (equal? uname "n/a")
+;;                    (equal? item-path "")) ;; this is a toplevel test
+;;               ;; what to do with toplevel? call rollup?
+;;               (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+;;               (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))
+;;         (db:get-cache-stmth dbdat db
+;;         "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');")
+;;         run-id)
+;;        
+;;        (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")
+;;        (if (and (null? incompleted)
+;;                 (null? oldlaunched)
+;;                 (null? toplevels))
+;;            #f
+;;            #t)))))
 
 ;; BUG: Probably broken - does not explicitly use run-id in the query
 ;;
 (define (db:top-test-set-per-pf-counts dbstruct run-id test-name)
   (db:general-call dbstruct run-id 'top-test-set-per-pf-counts (list test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name)))
@@ -1317,141 +1247,56 @@
     ;; (db:find-and-mark-incomplete db)
     ;; (db:delay-if-busy dbdat)
     (sqlite3:execute db "VACUUM;")
     dead-runs))
 
-;;======================================================================
-;; M E T A   G E T   A N D   S E T   V A R S
-;;======================================================================
-
-;; returns number if string->number is successful, string otherwise
-;; also updates *global-delta*
-;;
-(define (db:get-var dbstruct var)
-  (let* ((res      #f))
-    (db:with-db
-     dbstruct #f #f  ;; for the moment vars are only stored in main.db
-     (lambda (dbdat db)
-       (sqlite3:for-each-row
-        (lambda (val)
-          (set! res val))
-        db
-        "SELECT val FROM metadat WHERE var=?;" var)
-       ;; convert to number if can
-       (if (string? res)
-           (let ((valnum (string->number res)))
-             (if valnum (set! res valnum))))
-       res))))
-
-(define (db:inc-var dbstruct var)
-  (db:with-db dbstruct #f #t 
-	      (lambda (dbdat db)
-		(sqlite3:execute db "UPDATE metadat SET val=val+1 WHERE var=?;" var))))
-
-(define (db:dec-var dbstruct var)
-  (db:with-db dbstruct #f #t 
-	      (lambda (dbdat db)
-		(sqlite3:execute db "UPDATE metadat SET val=val-1 WHERE var=?;" var))))
-
-;; This was part of db:get-var. It was used to estimate the load on
-;; the database files.
-;;
-;; scale by 10, average with current value.
-;;     (set! *global-delta* (/ (+ *global-delta* (* (- (current-milliseconds) start-ms)
-;; 						 (if throttle throttle 0.01)))
-;; 			    2))
-;;     (if (> (abs (- *last-global-delta-printed* *global-delta*)) 0.08) ;; don't print all the time, only if it changes a bit
-;; 	(begin
-;; 	  (debug:print-info 4 *default-log-port* "launch throttle factor=" *global-delta*)
-;; 	  (set! *last-global-delta-printed* *global-delta*)))
-
-(define (db:set-var dbstruct var val)
-  (db:with-db dbstruct #f #t 
-	      (lambda (dbdat db)
-		(sqlite3:execute db "INSERT OR REPLACE INTO metadat (var,val) VALUES (?,?);" var val))))
-
-(define (db:add-var dbstruct var val)
-  (db:with-db dbstruct #f #t 
-	      (lambda (dbdat db)
-		(sqlite3:execute db "UPDATE metadat SET val=val+? WHERE var=?;" val var))))
-
-(define (db:del-var dbstruct var)
-  (db:with-db dbstruct #f #t 
-	      (lambda (dbdat db)
-		(sqlite3:execute db "DELETE FROM metadat WHERE var=?;" var))))
-
 ;;======================================================================
 ;; no-sync.db - small bits of data to be shared between servers
 ;;======================================================================
 
-(define (db:no-sync-db db-in)
-  (if db-in
-      db-in
-      (if *no-sync-db*
-	  *no-sync-db*
-	  (begin
-	    (mutex-lock! *db-access-mutex*)
-	    (let ((dbpath (common:get-db-tmp-area))
-		  (db     (dbfile:open-no-sync-db dbpath)))
-	      (set! *no-sync-db* db)
-	      (mutex-unlock! *db-access-mutex*)
-	      db)))))
-
-(define (with-no-sync-db proc)
-  (let* ((db  (db:no-sync-db *no-sync-db*)))
-    (proc db)))
-
+(define (db:get-dbsync-path)
+  (case (rmt:transport-mode)
+    ((http)(common:get-db-tmp-area))
+    ((tcp) (conc *toppath*"/.mtdb"))
+    ((nfs) (conc *toppath*"/.mtdb"))
+    (else "/tmp/dunno-this-gonna-exist")))
+
+;; This is needed for api.scm
 (define (db:open-no-sync-db)
-  (dbfile:open-no-sync-db (db:dbfile-path)))
-
-(define (db:no-sync-close-db db stmt-cache)
-  (db:safely-close-sqlite3-db db stmt-cache))
-
-
-;; use a global for some primitive caching, it is just silly to
-;; re-read the db over and over again for the keys since they never
-;; change
-
+   (dbfile:open-no-sync-db (db:get-dbsync-path)))
+ 
 ;; why get the keys from the db? why not get from the *configdat*
 ;; using keys:config-get-fields?
 
 (define (db:get-keys dbstruct)
-  (keys:config-get-fields *configdat*)
-)
-
-;;  (if *db-keys* *db-keys* 
-;;      (let ((res '()))
-;;	(db:with-db dbstruct #f #f
-;;		    (lambda (dbdat db)
-;;		      (sqlite3:for-each-row 
-;;		       (lambda (key)
-;;			 (set! res (cons key res)))
-;;		       db
-;;		       "SELECT fieldname FROM keys ORDER BY id DESC;")))
-;;	(set! *db-keys* res)
-;;	res)))
+  (keys:config-get-fields *configdat*))
 
 ;; extract index number given a header/data structure
 (define (db:get-index-by-header header field)
   (list-index (lambda (x)(equal? x field)) header))
 
 ;; look up values in a header/data structure
 (define (db:get-value-by-header row header field)
-  (if (or (null? header) (not row))
-      #f
-      (let loop ((hed (car header))
-                 (tal (cdr header))
-                 (n   0))
-        (if (equal? hed field)
-            (handle-exceptions
-             exn
-             (begin
-               (debug:print 0 *default-log-port* "WARNING: attempt to read non-existant field, row="
-			    row " header=" header " field=" field ", exn=" exn)
-               #f)
-             (vector-ref row n))
-	    (if (null? tal) #f (loop (car tal)(cdr tal)(+ n 1)))))))
+  (let ((len (if (vector? row)
+		 (vector-length row)
+		 0)))
+    (if (or (null? header) (not row))
+	#f
+	(let loop ((hed (car header))
+		   (tal (cdr header))
+		   (n   0))
+	  (if (equal? hed field)
+	      (handle-exceptions
+	       exn
+	       (begin
+		 (debug:print 4 *default-log-port* "WARNING: attempt to read non-existant field, row="
+			      row " header=" header " field=" field ", exn=" exn)
+		 #f)
+	       (if (>= n len)
+		   #f
+		   (vector-ref row n)))
+	      (if (null? tal) #f (loop (car tal)(cdr tal)(+ n 1))))))))
 
 ;; Accessors for the header/data structure
 ;; get rows and header from 
 (define (db:get-header vec)(vector-ref vec 0))
 (define (db:get-rows   vec)(vector-ref vec 1))
@@ -1458,33 +1303,26 @@
 
 ;;======================================================================
 ;;  R U N S
 ;;======================================================================
 
-
-
-
-
 (define (db:get-run-times dbstruct run-patt target-patt)
 (let ((res `())
-           (qry 	(conc "select runname, (max(end_time)-min(event_time))/60 as runtime, target from (select runname, run_id,tests.event_time,tests.event_time+run_duration AS end_time, " (string-join (db:get-keys dbstruct) " || '/' || ") " as target from tests inner join runs on tests.run_id = runs.id where runs.runname like ? and target like ?) group by run_id ;")))
-;(print qry)
-(db:with-db 
+      (qry 	(conc "select runname, (max(end_time)-min(event_time))/60 as runtime, target from (select runname, run_id,tests.event_time,tests.event_time+run_duration AS end_time, " (string-join (db:get-keys dbstruct) " || '/' || ") " as target from tests inner join runs on tests.run_id = runs.id where runs.runname like ? and target like ?) group by run_id ;")))
+					;(print qry)
+  (db:with-db 
    dbstruct
    #f ;; this is for the main runs db
    #f ;; does not modify db
    (lambda (dbdat db)
-            (sqlite3:for-each-row
-	(lambda (runname runtime target )
-	  (set! res (cons (vector runname runtime target) res)))
-	db
-        qry 
-	run-patt target-patt)
-       
-       res))))
-
-
+     (sqlite3:for-each-row
+      (lambda (runname runtime target )
+	(set! res (cons (vector runname runtime target) res)))
+      db
+      qry 
+      run-patt target-patt)
+     res))))
 
 (define (db:get-run-name-from-id dbstruct run-id)
   (db:with-db 
    dbstruct
    #f ;; this is for the main runs db
@@ -1551,11 +1389,11 @@
     ;; (debug:print 0 *default-log-port* "Got here 0.")
     (debug:print 3 *default-log-port* "keys: " keys " allvals: " allvals " keyvals: " keyvals " key=?str is " key=?str)
     (debug:print 2 *default-log-port* "NOTE: using target " (string-intersperse (map cadr keyvals) "/") " for this run")
     (if (and runname (null? (filter (lambda (x)(not x)) keyvals))) ;; there must be a better way to "apply and"
 	(db:with-db
-	 dbstruct #f #f
+	 dbstruct #f #t
 	 (lambda (dbdat db)
 	   ;; (debug:print 0 *default-log-port* "Got here 1.")
 	   (let ((res #f))
 	     (apply sqlite3:execute db
 		    (conc "INSERT OR IGNORE INTO runs (runname,state,status,owner,event_time,contour"
@@ -1572,10 +1410,92 @@
 	     res))) 
 	(begin
 	  (debug:print-error 0 *default-log-port* "Called without all necessary keys")
 	  #f))))
 
+(define (db:get-run-id dbstruct runname target)
+  (let ((runs (db:simple-get-runs dbstruct runname #f #f target #f))) ;; runpatt count offset target last-update
+    (if (null? runs)
+	#f
+	(simple-run-id (car runs)))))
+
+;; called with run-id=#f so will operate on main.db
+;;
+(define (db:insert-run dbstruct target runname run-meta)
+  (let* ((keys (db:get-keys dbstruct))
+     	 (runs (db:simple-get-runs dbstruct runname #f #f target #f))) ;; runpatt count offset target last-update
+    ;; need to insert run based on target and runname
+    (let* ((targvals (string-split target "/"))
+	   (keystr   (string-intersperse keys ","))
+	   (key?str  (string-intersperse (make-list (length targvals) "?") ","))
+	   (qrystr   (conc "INSERT INTO runs (runname,"keystr") VALUES (?,"key?str")"))
+	   (get-var  (lambda (db qrystr)
+		       (let* ((res #f))
+			 (sqlite3:for-each-row
+			  (lambda row
+			    (set res (car row)))
+			  db qrystr runname)
+			 res))))
+      (if (null? runs)
+	  (db:create-initial-run-record dbstruct runname target))
+      (let* ((run-id (db:get-run-id dbstruct runname target)))
+	(db:with-db
+	 dbstruct
+	 #f #t
+	 (lambda (dbdat db)
+	   (for-each
+	    (lambda (keyval)
+	      (let* ((fieldname (car keyval))
+		     (getqry    (conc "SELECT "fieldname" FROM runs WHERE id=?;"))
+		     (setqry    (conc "UPDATE runs SET "fieldname"=? WHERE id=?;"))
+		     (val       (cdr keyval))
+		     (valnum    (if (number? val)
+				    val
+				    (if (string? val)
+					(string->number val)
+					#f))))
+		(if (not (member fieldname (cons "runname" keys))) ;; don't attempt to tweak these
+		    (let* ((curr-val (get-var db getqry))
+			   (have-it  (or (equal? curr-val val)
+					 (equal? curr-val valnum))))
+		      (if (not have-it)
+			  (sqlite3:execute db setqry (or valnum val) run-id))))))
+	    run-meta)))
+	run-id))))
+  
+(define (db:create-initial-run-record dbstruct runname target)	  
+  (let* ((keys     (db:get-keys dbstruct))
+     	 (targvals (string-split target "/"))
+	 (keystr   (string-intersperse keys ","))
+	 (key?str  (string-intersperse (make-list (length targvals) "?") ","))
+	 (qrystr   (conc "INSERT INTO runs (runname,"keystr") VALUES (?,"key?str")")))
+    (db:with-db
+     dbstruct #f #t
+     (lambda (dbdat db)
+       (apply sqlite3:execute db qrystr runname targvals)))))
+
+(define (db:insert-test dbstruct run-id test-rec)
+  (let* ((testname  (alist-ref "testname" test-rec equal?))
+	 (item-path (alist-ref "item_path" test-rec equal?))
+	 (id        (db:get-test-id dbstruct run-id testname item-path))
+	 (fieldvals (filter (lambda (x)(not (member (car x) '("id" "last_update")))) test-rec))
+	 (setqry    (conc "UPDATE tests SET "(string-intersperse
+					      (map (lambda (dat)
+						     (conc (car dat)"=?"))
+						   fieldvals)
+					      ",")" WHERE id=?;"))
+	 (insqry   (conc "INSERT INTO tests ("(string-intersperse (map (lambda (x) (car x)) fieldvals) ",")
+			 ") VALUES ("(string-intersperse (make-list (length fieldvals) "?") ",")");")))
+    (debug:print 0 *default-log-port* "id: "id"\nset: "setqry"\ninsqry: "insqry)
+    (db:with-db
+     dbstruct
+     run-id #t
+     (lambda (dbdat db)
+       (if id
+	   (apply sqlite3:execute db setqry (append (map cdr fieldvals) (list id)))
+	   (apply sqlite3:execute db insqry (map cdr fieldvals)))))))
+
 ;; replace header and keystr with a call to runs:get-std-run-fields
 ;;
 ;; keypatts: ( (KEY1 "abc%def")(KEY2 "%") )
 ;; runpatts: patt1,patt2 ...
 ;;
@@ -1615,17 +1535,13 @@
 		   qrystr
 		   )))
     (debug:print-info 11 *default-log-port* "db:get-runs END qrystr: " qrystr " keypatts: " keypatts " offset: " offset " limit: " count)
     (vector header res)))
 
-
-(define-record simple-run target id runname state status owner event_time)
-(define-record-printer (simple-run x out)
-  (fprintf out "#,(simple-run ~S ~S ~S ~S)"
-	   (simple-run-target x) (simple-run-id x) (simple-run-runname x) (time->string (seconds->local-time (simple-run-event_time x) ))))
-
 ;; simple get-runs
+;;
+;;  records used defined in dbfile
 ;;
 (define (db:simple-get-runs dbstruct runpatt count offset target last-update)
     (let* ((res       '())
 	   (keys       (db:get-keys dbstruct))
 	   (runpattstr (db:patt->like "runname" runpatt))
@@ -1658,23 +1574,23 @@
 		   qrystr
 		   )))
     (debug:print-info 11 *default-log-port* "db:get-runs END qrystr: " qrystr " target: " target " offset: " offset " limit: " count)
     res))
 
-;; TODO: Switch this to use max(update_time) from each run db? Then if using a server there is no disk traffic (using inmem db)
+;; TODO: Switch this to use max(update_time) from each run db? Then if using a server there is no disk traffic (using cachedb db) ???
 ;;
 ;; NOTE: This DOESN'T (necessarily) get the real run ids, but the number of the <number>.db!!
 
 (define (db:get-changed-run-ids since-time)
   (let* ((dbdir      (db:dbfile-path)) ;; (configf:lookup *configdat* "setup" "dbdir"))
-	 (alldbs     (glob (conc dbdir "/.megatest/[0-9]*.db")))
+	 (alldbs     (glob (conc dbdir "/.mtdb/[0-9]*.db*")))
 	 (changed    (filter (lambda (dbfile)
 			       (> (file-modification-time dbfile) since-time))
 			     alldbs)))
     (delete-duplicates
      (map (lambda (dbfile)
-	    (let* ((res (string-match ".*\\/(\\d\\d)\\.db" dbfile)))
+	    (let* ((res (string-match ".*\\/(\\d\\d)\\.db*" dbfile)))
 	      (if res
 		  (string->number (cadr res))
 		  (begin
 		    (debug:print 2 *default-log-port* "WARNING: Failed to process " dbfile " for run-id")
 		    0))))
@@ -1782,12 +1698,11 @@
 (define (db:update-run-stats dbstruct run-id stats)
   ;; (mutex-lock! *db-transaction-mutex*)
   (db:with-db
    dbstruct
    #f
-   #f
-
+   #t
    (lambda (dbdat db)
      ;; remove previous data
      
      (let* ((stmt1 (sqlite3:prepare db "DELETE FROM run_stats WHERE run_id=? AND state=? AND status=?;"))
 	    (stmt2 (sqlite3:prepare db "INSERT INTO run_stats (run_id,state,status,count) VALUES (?,?,?,?);"))
@@ -1942,19 +1857,20 @@
     (debug:print-info 4 *default-log-port* "runs:get-runs-by-patt qry=" qry-str " " runnamepatt)
     ;(print "runs:get-runs-by-patt qry=" qry-str " " runnamepatt)
 
     (vector header 
             (reverse
-             (db:with-db dbstruct #f #f ;; reads db, does not write to it.
-                         (lambda (dbdat db)
-                           (sqlite3:fold-row
-                            (lambda (res . r)
-                              (cons (list->vector r) res))
-                            '()
-                            db
-                            qry-str
-                            runnamepatt)))))))
+             (db:with-db
+	      dbstruct #f #f ;; reads db, does not write to it.
+              (lambda (dbdat db)
+                (sqlite3:fold-row
+                 (lambda (res . r)
+                   (cons (list->vector r) res))
+                 '()
+                 db
+                 qry-str
+                 runnamepatt)))))))
 
 ;; use (get-value-by-header (db:get-header runinfo)(db:get-rows runinfo))
 ;; NOTE: Does NOT return a list of rows (or one row) for the first slot of the vector
 ;;       this is inconsistent with get-runs but it makes some sense.
 ;;
@@ -1991,11 +1907,11 @@
 		      run-id))))
 
 ;; does not (obviously!) removed dependent data. But why not!!?
 (define (db:delete-run dbstruct run-id)
   (db:with-db
-   dbstruct #f #f
+   dbstruct #f #t
    (lambda (dbdat db)
      (sqlite3:with-transaction
       db
       (lambda ()
         (sqlite3:execute db "DELETE FROM test_steps WHERE test_id IN (SELECT id FROM tests WHERE run_id=?);" run-id)
@@ -2023,34 +1939,38 @@
 			user (conc newlockval " " run-id))
        (debug:print-info 1 *default-log-port* "" newlockval " run number " run-id)))))
 
 (define (db:set-run-status dbstruct run-id status msg)
   (db:with-db
-   dbstruct #f #f
+   dbstruct #f #t
    (lambda (dbdat db)
        (if msg
          (sqlite3:execute db "UPDATE runs SET status=?,comment=? WHERE id=?;" status msg run-id)
          (sqlite3:execute db "UPDATE runs SET status=? WHERE id=?;" status run-id)))))
 
+(define (db:set-run-state-status-db dbdat db run-id state status )
+  (sqlite3:execute
+   (db:get-cache-stmth
+    dbdat db "UPDATE runs SET status=?,state=? WHERE id=?;") status state run-id))
+
 (define (db:set-run-state-status dbstruct run-id state status )
   (db:with-db
-   dbstruct #f #f
+   dbstruct #f #t
    (lambda (dbdat db)
-          (sqlite3:execute db "UPDATE runs SET status=?,state=? WHERE id=?;" status state run-id))))
-
-
-
+     (db:set-run-state-status-db dbdat db run-id state status))))
+     
 (define (db:get-run-status dbstruct run-id)
   (let ((res "n/a"))
     (db:with-db
      dbstruct #f #f
      (lambda (dbdat db)
        (sqlite3:for-each-row 
 	(lambda (status)
 	  (set! res status))
-	db
-	"SELECT status FROM runs WHERE id=?;" 
+	(db:get-cache-stmth
+	 dbdat db
+	 "SELECT status FROM runs WHERE id=?;" )
 	run-id)
        res))))
 
 (define (db:get-run-state dbstruct run-id)
   (let ((res "n/a"))
@@ -2058,12 +1978,27 @@
      dbstruct #f #f
      (lambda (dbdat db)
        (sqlite3:for-each-row 
 	(lambda (status)
 	  (set! res status))
-	db
-	"SELECT state FROM runs WHERE id=?;" 
+	(db:get-cache-stmth
+	 dbdat db
+	 "SELECT state FROM runs WHERE id=?;" )
+	run-id)
+       res))))
+
+(define (db:get-run-state-status dbstruct run-id)
+  (let ((res (cons "n/a" "n/a")))
+    (db:with-db
+     dbstruct #f #f
+     (lambda (dbdat db)
+       (sqlite3:for-each-row 
+	(lambda (state status)
+	  (set! res (cons state status)))
+	(db:get-cache-stmth
+	 dbdat db
+	 "SELECT state,status FROM runs WHERE id=?;" )
 	run-id)
        res))))
 
 
 ;;======================================================================
@@ -2306,12 +2241,12 @@
 		  (sqlite3:for-each-row
 		   (lambda (run-id testname item-path state status)
 		     ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment
 		     (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-")))
 		   db 
-		   "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=?;" 
-		   test-id)))
+		   "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;" 
+		   test-id run-id)))
     res))
 
 ;; get a useful subset of the tests data (used in dashboard
 ;; use db:mintest-get-{id ,run_id,testname ...}
 ;;
@@ -2338,11 +2273,11 @@
 
 (define (db:delete-test-records dbstruct run-id test-id)
   (db:general-call dbstruct run-id 'delete-test-step-records (list test-id))
   (db:general-call dbstruct run-id 'delete-test-data-records (list test-id))
   (db:with-db
-   dbstruct run-id #f
+   dbstruct run-id #t
    (lambda (dbdat db)
      (sqlite3:execute db "UPDATE tests SET state='DELETED',status='n/a',comment='' WHERE id=?;" test-id))))
 
 ;; 
 (define (db:delete-old-deleted-test-records dbstruct)
@@ -2402,25 +2337,28 @@
 ;;      NOTE: run-id is not used
 ;; ;;
 (define (db:test-set-state-status dbstruct run-id test-id newstate newstatus newcomment)
   (db:with-db
    dbstruct
-   run-id
-   #t
+   run-id #t
    (lambda (dbdat db)
-     (cond
-      ((and newstate newstatus newcomment)
-       (sqlite3:execute db "UPDATE tests SET state=?,status=?,comment=? WHERE id=?;" newstate newstatus newcomment ;; (sdb:qry 'getid newcomment)
-			test-id))
-      ((and newstate newstatus)
-       (sqlite3:execute db "UPDATE tests SET state=?,status=? WHERE id=?;" newstate newstatus test-id))
-      (else
-       (if newstate   (sqlite3:execute db "UPDATE tests SET state=?   WHERE id=?;" newstate   test-id))
-       (if newstatus  (sqlite3:execute db "UPDATE tests SET status=?  WHERE id=?;" newstatus  test-id))
-       (if newcomment (sqlite3:execute db "UPDATE tests SET comment=? WHERE id=?;" newcomment ;; (sdb:qry 'getid newcomment)
-				       test-id))))))
-  (mt:process-triggers dbstruct run-id test-id newstate newstatus))
+     (db:test-set-state-status-db db run-id test-id newstate newstatus newcomment))))
+
+(define (db:test-set-state-status-db db run-id test-id newstate newstatus newcomment)
+  (cond
+   ((and newstate newstatus newcomment)
+    (sqlite3:execute db "UPDATE tests SET state=?,status=?,comment=? WHERE id=?;" newstate newstatus newcomment ;; (sdb:qry 'getid newcomment)
+		     test-id))
+   ((and newstate newstatus)
+    (sqlite3:execute db "UPDATE tests SET state=?,status=? WHERE id=?;" newstate newstatus test-id))
+   (else
+    (if newstate   (sqlite3:execute db "UPDATE tests SET state=?   WHERE id=?;" newstate   test-id))
+    (if newstatus  (sqlite3:execute db "UPDATE tests SET status=?  WHERE id=?;" newstatus  test-id))
+    (if newcomment (sqlite3:execute db "UPDATE tests SET comment=? WHERE id=?;" newcomment ;; (sdb:qry 'getid newcomment)
+				    test-id))))
+  ;; (mt:process-triggers dbstruct run-id test-id newstate newstatus)) ;; NOTE: Moved into calling function
+  )
 
 ;; NEW BEHAVIOR: Count tests running in all runs!
 ;;
 (define (db:get-count-tests-running dbstruct run-id) ;; fastmode)
   (let* ((qry ;; (if fastmode
@@ -2429,11 +2367,11 @@
   (db:with-db
    dbstruct
    run-id
    #f
    (lambda (dbdat db)
-     (let* ((stmth (db:get-cache-stmth dbdat run-id db qry)))
+     (let* ((stmth (db:get-cache-stmth dbdat db qry)))
        (sqlite3:first-result stmth))))))
 
 ;; NEW BEHAVIOR: Count tests running in only one run!
 ;;
 (define (db:get-count-tests-actually-running dbstruct run-id)
@@ -2459,11 +2397,11 @@
     (db:with-db
      dbstruct
      run-id
      #f
      (lambda (dbdat db)
-       (let* ((stmth (db:get-cache-stmth dbdat run-id db qry)))
+       (let* ((stmth (db:get-cache-stmth dbdat db qry)))
 	 (sqlite3:first-result stmth run-id))))))
 
 ;; For a given testname how many items are running? Used to determine
 ;; probability for regenerating html
 ;;
@@ -2472,11 +2410,11 @@
    dbstruct
    run-id
    #f
    (lambda (dbdat db)
      (let* ((stmt "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? AND NOT (uname = 'n/a' AND item_path = '') AND testname=?;")
-	    (stmth (db:get-cache-stmth dbdat run-id db stmt)))
+	    (stmth (db:get-cache-stmth dbdat db stmt)))
        (sqlite3:first-result
 	stmth run-id testname)))))
 
 (define (db:get-not-completed-cnt dbstruct run-id)
 (db:with-db
@@ -2552,11 +2490,11 @@
 ;;
 (define (db:test-set-top-process-pid dbstruct run-id test-id pid)
   (db:with-db
    dbstruct
    run-id
-   #f
+   #t
    (lambda (dbdat db)
      (sqlite3:execute db "UPDATE tests SET attemptnum=? WHERE id=?;"
 		      pid test-id))))
 
 (define (db:test-get-top-process-pid dbstruct run-id test-id)
@@ -2565,13 +2503,13 @@
    run-id
    #f
    (lambda (dbdat db)
      (db:first-result-default 
       db
-      "SELECT attemptnum FROM tests WHERE id=?;"
+      "SELECT attemptnum FROM tests WHERE id=? AND run_id=?;"
       #f
-      test-id))))
+      test-id run-id))))
 
 (define db:test-record-fields '("id"           "run_id"        "testname"  "state"      "status"      "event_time"
 				"host"         "cpuload"       "diskfree"  "uname"      "rundir"      "item_path"
                                 "run_duration" "final_logf"    "comment"   "shortdir"   "attemptnum"  "archived" "last_update"))
 
@@ -2591,11 +2529,11 @@
 
 (define db:test-record-qry-selector (string-intersperse db:test-record-fields ","))
 
 (define (db:update-tesdata-on-repilcate-db dbstruct old-lt new-lt)
   (db:with-db
-   dbstruct   #f   #f
+   dbstruct   #f   #t
    (lambda (dbdat db)
      (sqlite3:execute db "UPDATE tests SET rundir= replace(rundir,?,?), shortdir=replace(shortdir,?,?);"
 		      old-lt new-lt  old-lt new-lt))))
 
 ;; NOTE: Use db:test-get* to access records
@@ -2672,11 +2610,11 @@
      (lambda (run-id)
        (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)))
 	 (db:prep-megatest.db-adj-test-ids (dbr:dbdat-dbh mtdb) run-id testrecs)))
      run-ids)))
 
-;; Get test data using test_id, run-id is not used
+;; Get test data using test_id
 ;; 
 (define (db:get-test-info-by-id dbstruct run-id test-id)
   (db:with-db
    dbstruct
    run-id
@@ -2686,12 +2624,32 @@
        (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test
 	(lambda (id run-id testname state status event-time host cpuload diskfree uname rundir-id item-path run_duration final-logf-id comment short-dir-id attemptnum archived last-update)
 	  ;;                0    1       2      3      4        5       6      7        8     9     10      11          12          13           14         15          16
 	  (set! res (vector id run-id testname state status event-time host cpuload diskfree uname rundir-id item-path run_duration final-logf-id comment short-dir-id attemptnum archived last-update)))
 	db
-	(conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id=?;")
-	test-id)
+	;; (db:get-cache-stmth dbdat db
+	;; 		    (conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id=? AND run_id=?;"))
+	(conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id=? AND run_id=?;")
+	test-id run-id)
+       res))))
+
+;; Get test state, status using test_id
+;; 
+(define (db:get-test-state-status-by-id dbstruct run-id test-id)
+  (db:with-db
+   dbstruct
+   run-id
+   #f
+   (lambda (dbdat db)
+     (let ((res   (cons #f #f)))
+;;	   (stmth (db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=?;")))
+       (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test
+	(lambda (state status)
+	  (cons state status))
+	db
+	"SELECT state,status FROM tests WHERE id=? AND run_id=?;" ;; stmth try not compiling this one - yes, this fixed the bind issue
+	test-id run-id)
        res))))
 
 ;; Use db:test-get* to access
 ;; Get test data using test_ids. NB// Only works within a single run!!
 ;;
@@ -2709,36 +2667,54 @@
 	db
 	(conc "SELECT " db:test-record-qry-selector " FROM tests WHERE id in ("
 	      (string-intersperse (map conc test-ids) ",") ");"))
        res))))
 
+;; try every second until tries times proc
+;;
+(define (db:keep-trying-until-true proc params tries)
+  (let* ((res (apply proc params)))
+    (if res
+	res
+	(if (> tries 0)
+	    (begin
+	      (thread-sleep! 1)
+	      (db:keep-trying-until-true proc params (- tries 1)))
+	    (begin
+	      ;; (debug:print-info 0 *default-log-port* "proc never returned true, params="params)
+	      (print"db:keep-trying-until-true proc never returned true, proc = " proc " params =" params " tries = " tries)
+	      #f)))))
+  
 (define (db:get-test-info dbstruct run-id test-name item-path)
   (db:with-db
    dbstruct
    run-id
    #f
    (lambda (dbdat db)
-     (let ((res #f))
-       (sqlite3:for-each-row
-	(lambda (a . b)
-	  (set! res (apply vector a b)))
-	db
-	(conc "SELECT " db:test-record-qry-selector " FROM tests WHERE testname=? AND item_path=? AND run_id=?;")
-	test-name item-path run-id)
-       res))))
+     (db:get-test-info-db db run-id test-name item-path))))
+
+(define (db:get-test-info-db db run-id test-name item-path)
+  (let ((res #f))
+    (sqlite3:for-each-row
+     (lambda (a . b)
+       (set! res (apply vector a b)))
+     db
+     (conc "SELECT " db:test-record-qry-selector " FROM tests WHERE testname=? AND item_path=? AND run_id=?;")
+     test-name item-path run-id)
+    res))
 
 (define (db:test-get-rundir-from-test-id dbstruct run-id test-id)
   (db:with-db
    dbstruct
    run-id
    #f
    (lambda (dbdat db)
      (db:first-result-default
       db
-      "SELECT rundir FROM tests WHERE id=?;"
+      "SELECT rundir FROM tests WHERE id=? AND run_id=?;"
       #f ;; default result
-      test-id))))
+      test-id run-id))))
 
 (define (db:get-test-times dbstruct run-name target)
   (let ((res `())
         (qry 	(conc "select testname, item_path, run_duration, "
 		      (string-join (db:get-keys dbstruct) " || '/' || ")
@@ -2843,11 +2819,11 @@
     (db:with-db
      dbstruct
      run-id
      #f
      (lambda (dbdat db)
-       (let* ((stmth (db:get-cache-stmth dbdat #f db stmt))
+       (let* ((stmth (db:get-cache-stmth dbdat db stmt))
 	      (res   (sqlite3:fold-row
 		      (lambda (res id test-id  category variable value expected tol units comment status type last-update)
 			(vector id test-id  category variable value expected tol units comment status type last-update))
 		      (vector #f #f #f #f #f #f #f #f #f #f #f #f)
 		      stmth
@@ -2861,11 +2837,11 @@
 ;;    if one or more are fail (any case) then set test status to PASS, non "pass" or "fail" are ignored
 (define (db:test-data-rollup dbstruct run-id test-id status)
   (let* ((fail-count 0)
 	 (pass-count 0))
     (db:with-db
-     dbstruct run-id #f
+     dbstruct run-id #t
      (lambda (dbdat db)
        (sqlite3:for-each-row
 	(lambda (fcount pcount)
 	  (set! fail-count fcount)
 	  (set! pass-count pcount))
@@ -2960,11 +2936,11 @@
 ;; EOF
 
 (define (db:csv->test-data dbstruct run-id test-id csvdata)
   (debug:print 4 *default-log-port* "test-id " test-id ", csvdata: " csvdata)
   (db:with-db
-   dbstruct #f #f
+   dbstruct #f #t
    (lambda (dbdat db)
      (let* ((csvlist (csv->list (make-csv-reader
 				 (open-input-string csvdata)
 				 '((strip-leading-whitespace? #t)
 				   (strip-trailing-whitespace? #t)))))) ;; (csv->list csvdata)))
@@ -3168,163 +3144,171 @@
 (define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment)
   ;; establish info on incoming test followed by info on top level test
   ;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met
   (let* ((testdat      (if (number? test-name)
 			   (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id
-			   (db:get-test-info       dbstruct run-id test-name item-path)))
+			   (db:keep-trying-until-true ;; in our threaded stuff this call could happen before the test is registered (maybe?)
+			    db:get-test-info
+			    (list dbstruct run-id test-name item-path)
+			    10)))
 	 (test-id      (db:test-get-id testdat))
 	 (test-name    (if (number? test-name)
 			   (db:test-get-testname testdat)
 			   test-name))
 	 (item-path    (db:test-get-item-path testdat))
          (tl-testdat   (db:get-test-info dbstruct run-id test-name ""))
          (tl-test-id   (if tl-testdat
 			   (db:test-get-id tl-testdat)
-			   #f)))
+			   #f))
+	 (new-state-eh #f)
+	 (new-status-eh #f))
     (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) 
 	(db:general-call dbstruct run-id 'set-test-start-time (list test-id)))
     (mutex-lock! *db-transaction-mutex*)
     (db:with-db
-     dbstruct run-id #f
+     dbstruct run-id #t
      (lambda (dbdat db)
        (let ((tr-res
               (sqlite3:with-transaction
                db
                (lambda ()
                  ;; NB// Pass the db so it is part fo the transaction
-                 (db:test-set-state-status db run-id test-id state status comment) ;; this call sets the item state/status
+                 (db:test-set-state-status-db db run-id test-id state status comment) ;; this call sets the item state/status
                  (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item
-                     (let* ((state-status-counts  (db:get-all-state-status-counts-for-test db run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test
-			    (state-statuses        (db:roll-up-rules state-status-counts state status))
-                          (newstate (car state-statuses))
-                          (newstatus (cadr state-statuses)))
+                     (let* ((state-status-counts (db:get-all-state-status-counts-for-test db run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test
+			    (state-statuses      (db:roll-up-rules state-status-counts state status))
+                            (newstate            (car state-statuses))
+                            (newstatus           (cadr state-statuses)))
+		       (set! new-state-eh newstate)
+		       (set! new-status-eh newstatus)
                        (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts)  " state-status-counts: "
-							(apply conc
-                  (map (lambda (x)
-                     (conc
-                     		(with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | "))
-                              state-status-counts))); end debug:print
-   
-                       (if tl-test-id
-			   (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct
+				    (apply conc
+					   (map (lambda (x)
+						  (conc
+                     				   (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | "))
+						state-status-counts))); end debug:print
+		       (if tl-test-id
+			   (db:test-set-state-status-db db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct
 		       ))))))
          (mutex-unlock! *db-transaction-mutex*)
          (if (and test-id state status (equal? status "AUTO")) 
              (db:test-data-rollup dbstruct run-id test-id status))
+	 (if new-state-eh ;; moved from db:test-set-state-status
+	      (mt:process-triggers dbstruct run-id test-id new-state-eh new-status-eh))
          tr-res)))))
 
 (define (db:roll-up-rules state-status-counts state status)
-  (let* ((running     (length (filter (lambda (x)
-					(member (dbr:counts-state x) *common:running-states*))
-				      state-status-counts)))
-	 (bad-not-started      (length (filter (lambda (x)
-						 (and (equal? (dbr:counts-state x) "NOT_STARTED") 
-						      (not (member (dbr:counts-status x)  *common:not-started-ok-statuses*))))
-					       state-status-counts)))
-	 (all-curr-states      (common:special-sort  ;; worst -> best (sort of)
-				(delete-duplicates
-				 (if (and state (not (member state *common:dont-roll-up-states*)))
-				     (cons state (map dbr:counts-state state-status-counts))
-				     (map dbr:counts-state state-status-counts)))
-				*common:std-states* >))
-	 (all-curr-statuses    (common:special-sort  ;; worst -> best
-				(delete-duplicates
-				 (if (and state status (not (member state *common:dont-roll-up-states*)))
-				     (cons status (map dbr:counts-status state-status-counts))
-				     (map dbr:counts-status state-status-counts)))
-				*common:std-statuses* >))
-	 (non-completes        (filter (lambda (x)
-					 (not (member x (cons "COMPLETED" *common:dont-roll-up-states*))))
-				       all-curr-states))
-	 (preq-fails        (filter (lambda (x)
-				      (equal? x "PREQ_FAIL"))
-				    all-curr-statuses))
-	 (num-non-completes (length non-completes))
-	 (newstate          (cond
-			     ((> running 0)           "RUNNING")            ;; anything running, call the situation running
-			     ((> (length preq-fails) 0) "NOT_STARTED")
-			     ((> bad-not-started 0)   "COMPLETED")          ;; we have an ugly situation, it is completed in the sense we cannot do more.
-			     ((> num-non-completes 0) (car non-completes))  ;;  (remove (lambda (x)(equal? "COMPLETED" x)) all-curr-states))) ;; only rollup DELETED if all DELETED
-			     (else                    (car all-curr-states))))
-	 (newstatus         (cond
-			     ((> (length preq-fails) 0)  "PREQ_FAIL")
-			     ((or (> bad-not-started 0)
-				  (and (equal? newstate "NOT_STARTED")
-				       (> num-non-completes 0)))
-			      "STARTED")
-			     (else (car all-curr-statuses)))))
-    (debug:print-info 2 *default-log-port*
-		      "\n--> probe db:set-state-status-and-roll-up-items: "
-		      "\n--> state-status-counts: "(map dbr:counts->alist state-status-counts)
-		      "\n--> running:             "running
-		      "\n--> bad-not-started:     "bad-not-started
-		      "\n--> non-non-completes:   "num-non-completes
-		      "\n--> non-completes:       "non-completes
-		      "\n--> all-curr-states:     "all-curr-states
-		      "\n--> all-curr-statuses:     "all-curr-statuses
-		      "\n--> newstate              "newstate
-		      "\n--> newstatus            "newstatus
-		      "\n\n")
-    
-    ;; NB// Pass the db so it is part of the transaction
-    (list newstate newstatus)))
+  (if (null? state-status-counts)
+      '(#f #f)
+      (let* ((running     (length (filter (lambda (x)
+					    (member (dbr:counts-state x) *common:running-states*))
+					  state-status-counts)))
+	     (bad-not-started      (length (filter (lambda (x)
+						     (and (equal? (dbr:counts-state x) "NOT_STARTED") 
+							  (not (member (dbr:counts-status x)  *common:not-started-ok-statuses*))))
+						   state-status-counts)))
+	     (all-curr-states      (common:special-sort  ;; worst -> best (sort of)
+				    (delete-duplicates
+				     (if (and state (not (member state *common:dont-roll-up-states*)))
+					 (cons state (map dbr:counts-state state-status-counts))
+					 (map dbr:counts-state state-status-counts)))
+				    *common:std-states* >))
+	     (all-curr-statuses    (common:special-sort  ;; worst -> best
+				    (delete-duplicates
+				     (if (and state status (not (member state *common:dont-roll-up-states*)))
+					 (cons status (map dbr:counts-status state-status-counts))
+					 (map dbr:counts-status state-status-counts)))
+				    *common:std-statuses* >))
+	     (non-completes        (filter (lambda (x)
+					     (not (member x (cons "COMPLETED" *common:dont-roll-up-states*))))
+					   all-curr-states))
+	     (preq-fails        (filter (lambda (x)
+					  (equal? x "PREQ_FAIL"))
+					all-curr-statuses))
+	     (num-non-completes (length non-completes))
+	     (newstate          (cond
+				 ((> running 0)           "RUNNING")            ;; anything running, call the situation running
+				 ((> (length preq-fails) 0) "NOT_STARTED")
+				 ((> bad-not-started 0)   "COMPLETED")          ;; we have an ugly situation, it is completed in the sense we cannot do more.
+				 ((> num-non-completes 0) (car non-completes))  ;;  (remove (lambda (x)(equal? "COMPLETED" x)) all-curr-states))) ;; only rollup DELETED if all DELETED
+				 (else                    (car all-curr-states))))
+	     (newstatus         (cond
+				 ((> (length preq-fails) 0)  "PREQ_FAIL")
+				 ((or (> bad-not-started 0)
+				      (and (equal? newstate "NOT_STARTED")
+					   (> num-non-completes 0)))
+				  "STARTED")
+				 (else (car all-curr-statuses)))))
+	(debug:print-info 2 *default-log-port*
+			  "\n--> probe db:set-state-status-and-roll-up-items: "
+			  "\n--> state-status-counts: "(map dbr:counts->alist state-status-counts)
+			  "\n--> running:             "running
+			  "\n--> bad-not-started:     "bad-not-started
+			  "\n--> non-non-completes:   "num-non-completes
+			  "\n--> non-completes:       "non-completes
+			  "\n--> all-curr-states:     "all-curr-states
+			  "\n--> all-curr-statuses:     "all-curr-statuses
+			  "\n--> newstate              "newstate
+			  "\n--> newstatus            "newstatus
+			  "\n\n")
+	
+	;; NB// Pass the db so it is part of the transaction
+	(list newstate newstatus))))
 
 (define (db:set-state-status-and-roll-up-run dbstruct run-id curr-state curr-status)
     (mutex-lock! *db-transaction-mutex*)
     (db:with-db
-     dbstruct run-id #f
+     dbstruct run-id #t
      (lambda (dbdat db)
        (let ((tr-res
               (sqlite3:with-transaction
                db
                (lambda ()
-                   (let* ((state-status-counts (db:get-all-state-status-counts-for-run db run-id))
+                   (let* ((state-status-counts (db:get-all-state-status-counts-for-run-db dbdat db run-id))
 			  (state-statuses      (db:roll-up-rules state-status-counts #f #f ))
                           (newstate            (car state-statuses))
                           (newstatus           (cadr state-statuses))) 
 		     (if (or (not (eq? newstate curr-state)) (not (eq?  newstatus curr-status)))
-			 (db:set-run-state-status db run-id newstate newstatus )))))))
+			 (db:set-run-state-status-db dbdat db run-id newstate newstatus )))))))
          (mutex-unlock! *db-transaction-mutex*)
          tr-res))))
+
+(define (db:get-all-state-status-counts-for-run-db dbdat db run-id)
+  (sqlite3:map-row
+   (lambda (state status count)
+     (make-dbr:counts state: state status: status count: count))
+   (db:get-cache-stmth
+    dbdat db
+    "SELECT state,status,count(id) FROM tests WHERE run_id=?  GROUP BY state,status;")
+   run-id ))
 
 (define (db:get-all-state-status-counts-for-run dbstruct run-id)
- (let* ((test-count-recs (db:with-db
-                          dbstruct #f #f
-                          (lambda (dbdat db)
-                            (sqlite3:map-row
-                             (lambda (state status count)
-                               (make-dbr:counts state: state status: status count: count))
-                             db
-                             "SELECT state,status,count(id) FROM tests WHERE run_id=?  GROUP BY state,status;"
-                             run-id )))))
-   test-count-recs))
-
+  (db:with-db
+   dbstruct #f #f
+   (lambda (dbdat db)
+     (db:get-all-state-status-counts-for-run-db dbdat db run-id))))
 
 ;; BBnote: db:get-all-state-status-counts-for-test returns dbr:counts object aggregating state and status of items of a given test, *not including rollup state/status*
 ;;
 ;; NOTE: This is called within a transaction
 ;;
-(define (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path item-state-in item-status-in)
-  (let* ((test-info   (db:get-test-info dbstruct run-id test-name item-path))
+(define (db:get-all-state-status-counts-for-test db run-id test-name item-path item-state-in item-status-in)
+  (let* ((test-info   (db:get-test-info-db db run-id test-name item-path))
          (item-state  (or item-state-in (db:test-get-state test-info))) 
          (item-status (or item-status-in (db:test-get-status test-info)))
-         (other-items-count-recs (db:with-db
-                                  dbstruct run-id #f
-                                  (lambda (dbdat db)
-                                    (sqlite3:map-row
-                                     (lambda (state status count)
-                                       (make-dbr:counts state: state status: status count: count))
-                                     db
-                                     ;; ignore current item because we have changed its value in the current transation so this select will see the old value.
-                                     "SELECT state,status,count(id) FROM tests WHERE run_id=? AND testname=? AND item_path != '' AND item_path !=? GROUP BY state,status;"
-                                     run-id test-name item-path))))
-
+         (other-items-count-recs (sqlite3:map-row
+                                  (lambda (state status count)
+                                    (make-dbr:counts state: state status: status count: count))
+                                  db
+                                  ;; ignore current item because we have changed its value in the current transation so this select will see the old value.
+                                  "SELECT state,status,count(id) FROM tests WHERE run_id=? AND testname=? AND item_path != '' AND item_path !=? GROUP BY state,status;"
+                                  run-id test-name item-path))
          ;; add current item to tally outside of sql query
-         (match-countrec-lambda (lambda (countrec) 
-                                  (and (equal? (dbr:counts-state  countrec) item-state)
+	 (match-countrec-lambda (lambda (countrec) 
+				  (and (equal? (dbr:counts-state  countrec) item-state)
                                        (equal? (dbr:counts-status countrec) item-status))))
-
+	 
          (already-have-count-rec-list
           (filter match-countrec-lambda other-items-count-recs)) ;; will have either 0 or 1 count recs depending if another item shares this item's state/status
          
          (updated-count-rec    (if (null? already-have-count-rec-list)
                                    (make-dbr:counts state: item-state status: item-status count: 1)
@@ -3334,11 +3318,10 @@
 
          (nonmatch-countrec-lambda (lambda (countrec) (not (match-countrec-lambda countrec))))
          
          (unrelated-rec-list   
           (filter nonmatch-countrec-lambda other-items-count-recs)))
-    
     (cons updated-count-rec unrelated-rec-list)))
 
 ;; (define (db:get-all-item-states db run-id test-name)
 ;;   (sqlite3:map-row 
 ;;    (lambda (a) a)
@@ -3542,19 +3525,21 @@
    
     (else
     (hash-table-set! *logged-in-clients* client-signature (current-seconds))
     '(#t "successful login"))))
 
+;; NO WAY TO KNOW IF IT MODIFIES THE DB BUT NEARLY ALL ARE UPDATES/INSERTS
+;;
 (define (db:general-call dbstruct run-id stmtname params)
   ;; Why is db:lookup-query above not used here to get the query?
   (let ((query (let ((q (alist-ref (if (string? stmtname)
 				       (string->symbol stmtname)
 				       stmtname)
 				   db:queries)))
  		 (if q (car q) #f))))
     (db:with-db
-     dbstruct run-id #f
+     dbstruct run-id #t
      (lambda (dbdat db)
        (apply sqlite3:execute db query params)
        #t))))
 
 ;; get a summary of state and status counts to calculate a rollup
@@ -3762,19 +3747,19 @@
 	testname)
        res))))
 
 ;; create a new record for a given testname
 (define (db:testmeta-add-record dbstruct testname)
-  (db:with-db dbstruct #f #f 
+  (db:with-db dbstruct #f #t
 	      (lambda (dbdat db)
 		(sqlite3:execute 
 		 db
 		 "INSERT OR IGNORE INTO test_meta (testname,author,owner,description,reviewed,iterated,avg_runtime,avg_disk,tags) VALUES (?,'','','','','','','','');" testname))))
 
 ;; update one of the testmeta fields
 (define (db:testmeta-update-field dbstruct testname field value)
-  (db:with-db dbstruct #f #f 
+  (db:with-db dbstruct #f #t
 	      (lambda (dbdat db)
 		(sqlite3:execute 
 		 db
 		 (conc "UPDATE test_meta SET " field "=? WHERE testname=?;") value testname))))
 
@@ -4032,12 +4017,10 @@
 ;; To sync individual run
 ;;======================================================================
 (define (db:get-run-record-ids dbstruct target run keynames test-patt)
    (let* ((backcons (lambda (lst item)(cons item lst)))
          (all_tests '())
-         (all_test_steps '())
-         (all_test_data '())
          (keystr (string-intersperse 
 	                  (map (lambda (key val)
 			    (conc key " like '" val "'"))
 			     keynames 
 			     (string-split target "/"))
@@ -4063,39 +4046,15 @@
                   )
                 )
                ) all_tests
               )
             )
-            (set! all_test_steps 
-              (append 
-                (map (lambda (x) (cons x run_id))
-                  (db:with-db dbstruct run_id #f 
-                    (lambda (dbdat db)
-                      (sqlite3:fold-row backcons '() db (conc "SELECT id FROM test_steps  WHERE test_id in (" test-qry ")"))
-                    )
-                  )
-                ) all_test_steps
-              )
-            )
-            (set! all_test_data 
-              (append 
-                (map (lambda (x) (cons x run_id))
-                  (db:with-db dbstruct run_id #f 
-                    (lambda (dbdat db)
-                      (sqlite3:fold-row backcons '() db (conc "SELECT id FROM test_data  WHERE test_id in (" test-qry ")"))
-                    )
-                  )
-                ) all_test_data
-              )
-            )
           )
           run_ids
         )
       `((runs       . ,run_ids)
         (tests      . ,all_tests)
-        (test_steps . ,all_test_steps)
-        (test_data  . ,all_test_data)
        )
      
    )
 )
 
@@ -4103,17 +4062,27 @@
 ;; Just for sync, procedures to make sync easy
 ;;======================================================================
 
 ;; get an alist of run ids and test/run, test_step/run pairs changed since time since-time
 ;;   '((runs . (1 2 3 ...))(tests . ((5 . 1) (6 . 3) (6 . 2) (7 . 1)  ...
+
+;; Retrieves record IDs from the database based on the timestamp of their last update.
+
+;; The function takes two arguments: dbstruct, which represents the database structure, and since-time, which is a timestamp indicating the time of the last update.
+;; The function first defines a few helper functions, including backcons, which takes a list and an item and adds the item to the front of the list. 
+;; It then initializes several variables to empty lists: all_tests, all_test_steps, all_test_data, all_run_ids, and all_test_ids.
+;; The function then retrieves a list of IDs for runs that have been changed since since-time using the db:get-changed-run-ids function. 
+;; It then filters the full list of run IDs to only include those that match the changed run IDs based on their modulo 100.
+;; For each changed run ID, the function retrieves a list of test IDs, test step IDs, and test data IDs that have been updated since since-time. 
+;; It appends these IDs to the appropriate lists (all_tests, all_test_steps, and all_test_data) using the append and map functions.
+;; The function then retrieves a list of run stat IDs that have been updated since since-time.
+;; Finally, the function returns a list of associations between record types and their corresponding IDs: runs, tests, test_steps, test_data, and run_stats.                                
 ;;
 (define (db:get-changed-record-ids dbstruct since-time)
   ;; no transaction, allow the db to be accessed between the big queries
   (let* ((backcons (lambda (lst item)(cons item lst)))
          (all_tests '())
-         (all_test_steps '())
-         (all_test_data '())
          (changed_run_dbs (db:get-changed-run-ids since-time)) ;; gets the rundb numbers
          (all_run_ids 
           (db:with-db dbstruct #f #f 
             (lambda (dbdat db)
               (sqlite3:fold-row backcons '() db "SELECT id FROM runs"))
@@ -4124,16 +4093,10 @@
           (db:with-db dbstruct #f #f 
             (lambda (dbdat db)
               (sqlite3:fold-row backcons '() db "SELECT id FROM runs  WHERE last_update>=?" since-time))
           )
          )
-         (run_stat_ids
-          (db:with-db dbstruct #f #f 
-            (lambda (dbdat db)
-              (sqlite3:fold-row backcons '() db "SELECT id FROM run_stats  WHERE last_update>=?" since-time))
-          )
-         )
         )
         (for-each
           (lambda (run_id)
            (set! all_tests 
              (append 
@@ -4144,43 +4107,18 @@
                   )
                 )
                ) all_tests
               )
             )
-            (set! all_test_steps 
-              (append 
-                (map (lambda (x) (cons x run_id))
-                  (db:with-db dbstruct run_id #f 
-                    (lambda (dbdat db)
-                      (sqlite3:fold-row backcons '() db "SELECT id FROM test_steps WHERE last_update>=?" since-time)
-                    )
-                  )
-                ) all_test_steps
-              )
-            )
-            (set! all_test_data 
-              (append 
-                (map (lambda (x) (cons x run_id))
-                  (db:with-db dbstruct run_id #f 
-                    (lambda (dbdat db)
-                      (sqlite3:fold-row backcons '() db "SELECT id FROM test_data  WHERE last_update>=?" since-time)
-                    )
-                  )
-                ) all_test_data
-              )
-            )
           )
           changed_run_ids
         )
         (debug:print 2 *default-log-port*  "run_ids = " run_ids)
         (debug:print 2 *default-log-port*  "all_tests = " all_tests)
 
       `((runs       . ,run_ids)
         (tests      . ,all_tests)
-        (test_steps . ,all_test_steps)
-        (test_data  . ,all_test_data)
-        (run_stats  . ,run_stat_ids)
        )
   )
 )
 
 ;;======================================================================
@@ -4370,17 +4308,17 @@
 
 ;; sync for filesystem local db writes
 ;;
 (define (db:run-lock-and-sync no-sync-db)
   (let* ((tmp-area       (common:get-db-tmp-area))
-	 (dbfiles        (glob (conc tmp-area"/.megatest/*.db")))
+	 (dbfiles        (glob (conc tmp-area"/.mtdb/*.db")))
 	 (sync-durations (make-hash-table)))
     ;; (debug:print-info 0 *default-log-port* "lock-and-sync, dbfiles: "dbfiles)
     (for-each
      (lambda (file)
        (let* ((fname (conc (pathname-file file) ".db"))
-	      (fulln (conc *toppath*"/.megatest/"fname))
+	      (fulln (conc *toppath*"/.mtdb/"fname))
 	      (time1 (if (file-exists? file)
 			 (file-modification-time file)
 			 (begin
 			   (debug:print-info 0 *default-log-port* "Sync - I do not see file "file)
 			   1)))
@@ -4427,11 +4365,10 @@
  	(last-time          (current-seconds))     ;; last time through the sync loop
  	(no-sync-db         (db:open-no-sync-db))
  	(sync-duration      0)  ;; run time of the sync in milliseconds
 	(tmp-area           (common:get-db-tmp-area)))
     ;; Sync moved to http-transport keep-running loop
-    (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls
     (debug:print-info 2 *default-log-port* "Periodic copy-based sync thread started. syncer is copy-sync, tmp-area is " tmp-area)
     (debug:print-info 3 *default-log-port* "watchdog starting. syncer is copy-sync pid="(current-process-id));;  " this-wd-num="this-wd-num)
     
     (if (and legacy-sync (not *time-to-exit*))
  	(begin
@@ -4470,11 +4407,10 @@
  	(last-time    (current-seconds))
  	(no-sync-db   (db:open-no-sync-db))
  	(stmt-cache   #f) ;; (dbr:dbstruct-stmt-cache dbstruct))
  	(sync-duration 0) ;; run time of the sync in milliseconds
        (subdbs       (hash-table-values (dbr:dbstruct-subdbs dbstruct))))
-   (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls
    (debug:print-info 2 *default-log-port* "Periodic sync thread started.")
    (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)  );;  " this-wd-num="this-wd-num)
    
    (if (and legacy-sync (not *time-to-exit*))
  	(begin
@@ -4612,15 +4548,20 @@
 					(begin
 					  (sqlite3:interrupt! db)
 					  (sqlite3:finalize! db #t)
 					  ;; (vector-set! *task-db* 0 #f)
 					  (set! *task-db* #f)))))
+			      (if (and *no-sync-db*
+				       (sqlite3:database? *no-sync-db*))
+				  (sqlite3:finalize! *no-sync-db* #t))
 			      (if (and (not (args:get-arg "-server"))
-				       *runremote*)
+				       *runremote*
+				       (eq? (rmt:transport-mode) 'http))
 				  (begin
 				    (debug:print-info 0 *default-log-port* "Closing all client connections...")
-				    (http-client#close-all-connections!)))
+				    (http-transport:close-connections *runremote*)
+				    #;(http-client#close-all-connections!)))
                               ;; (if (and *runremote*
                               ;;          (remote-conndat *runremote*))
                               ;;     (begin
                               ;;       (http-client#close-all-connections!))) ;; for http-client
                               (if (not (eq? *default-log-port* (current-error-port)))

Index: db_records.scm
==================================================================
--- db_records.scm
+++ db_records.scm
@@ -16,76 +16,10 @@
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;======================================================================
 ;; dbstruct
 ;;======================================================================
-
-;;
-;; -path-|-megatest.db
-;;       |-db-|-main.db
-;;            |-monitor.db
-;;            |-sdb.db
-;;            |-fdb.db
-;;            |-1.db
-;;            |-<N>.db
-;;
-;;
-;; Accessors for a dbstruct
-;;
-
-;; (define-inline (dbr:dbstruct-main    vec)    (vector-ref  vec 0)) ;; ( db path )
-;; (define-inline (dbr:dbstruct-strdb   vec)    (vector-ref  vec 1)) ;; ( db path )
-;; (define-inline (dbr:dbstruct-path    vec)    (vector-ref  vec 2)) 
-;; (define-inline (dbr:dbstruct-local   vec)    (vector-ref  vec 3))
-;; (define-inline (dbr:dbstruct-rundb   vec)    (vector-ref  vec 4)) ;; ( db path )
-;; (define-inline (dbr:dbstruct-inmem   vec)    (vector-ref  vec 5)) ;; ( db #f )
-;; (define-inline (dbr:dbstruct-mtime   vec)    (vector-ref  vec 6))
-;; (define-inline (dbr:dbstruct-rtime   vec)    (vector-ref  vec 7))
-;; (define-inline (dbr:dbstruct-stime   vec)    (vector-ref  vec 8))
-;; (define-inline (dbr:dbstruct-inuse   vec)    (vector-ref  vec 9))
-;; (define-inline (dbr:dbstruct-refdb   vec)    (vector-ref  vec 10)) ;; ( db path )
-;; (define-inline (dbr:dbstruct-locdbs  vec)    (vector-ref  vec 11))
-;; (define-inline (dbr:dbstruct-olddb   vec)    (vector-ref  vec 12)) ;; ( db path )
-;; ;; (define-inline (dbr:dbstruct-main-path vec)  (vector-ref  vec 13))
-;; ;; (define-inline (dbr:dbstruct-rundb-path vec) (vector-ref  vec 14))
-;; ;; (define-inline (dbr:dbstruct-run-id  vec)    (vector-ref  vec 13))
-;; 
-;; (define-inline (dbr:dbstruct-main-set!   vec val)(vector-set! vec 0 val))
-;; (define-inline (dbr:dbstruct-strdb-set!  vec val)(vector-set! vec 1 val))
-;; (define-inline (dbr:dbstruct-path-set!   vec val)(vector-set! vec 2 val))
-;; (define-inline (dbr:dbstruct-local-set!  vec val)(vector-set! vec 3 val))
-;; (define-inline (dbr:dbstruct-rundb-set!  vec val)(vector-set! vec 4 val))
-;; (define-inline (dbr:dbstruct-inmem-set!  vec val)(vector-set! vec 5 val))
-;; (define-inline (dbr:dbstruct-mtime-set!  vec val)(vector-set! vec 6 val))
-;; (define-inline (dbr:dbstruct-rtime-set!  vec val)(vector-set! vec 7 val))
-;; (define-inline (dbr:dbstruct-stime-set!  vec val)(vector-set! vec 8 val))
-;; (define-inline (dbr:dbstruct-inuse-set!  vec val)(vector-set! vec 9 val))
-;; (define-inline (dbr:dbstruct-refdb-set!  vec val)(vector-set! vec 10 val))
-;; (define-inline (dbr:dbstruct-locdbs-set! vec val)(vector-set! vec 11 val))
-;; (define-inline (dbr:dbstruct-olddb-set!  vec val)(vector-set! vec 12 val))
-;; (define-inline (dbr:dbstruct-main-path-set! vec val)(vector-set! vec 13 val))
-;; (define-inline (dbr:dbstruct-rundb-path-set! vec val)(vector-set! vec 14 val))
-;; 
-; (define-inline (dbr:dbstruct-run-id-set! vec val)(vector-set! vec 13 val))
-
-;; constructor for dbstruct
-;;
-;; (define (make-dbr:dbstruct #!key (path #f)(local #f))
-;;   (let ((v (make-vector 15 #f)))
-;;     (dbr:dbstruct-path-set! v path)
-;;     (dbr:dbstruct-local-set! v local)
-;;     (dbr:dbstruct-locdbs-set! v (make-hash-table))
-;;     v))
-
-;; Returns the database for a particular run-id fron the dbstruct:localdbs
-;;
-(define (dbr:dbstruct-localdb v run-id)
-  (hash-table-ref/default (dbr:dbstruct-locdbs v) run-id #f))
-
-(define (dbr:dbstruct-localdb-set! v run-id db)
-  (hash-table-set! (dbr:dbstruct-locdbs v) run-id db))
-
 
 (define (make-db:test)(make-vector 20))
 (define-inline (db:test-get-id           vec) (vector-ref vec 0))
 (define-inline (db:test-get-run_id       vec) (vector-ref vec 1))
 (define-inline (db:test-get-testname     vec) (vector-ref vec 2))

Index: dbfile.scm
==================================================================
--- dbfile.scm
+++ dbfile.scm
@@ -15,35 +15,56 @@
 ;; 
 ;;     You should have received a copy of the GNU General Public License
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;======================================================================
+
+(use srfi-18)
 
 (declare (unit dbfile))
-;; (declare (uses debugprint))
+(declare (uses debugprint))
 (declare (uses commonmod))
 
 (module dbfile
 	*
 	
   (import scheme
 	  chicken
 	  data-structures
 	  extras
-	  matchable)
-  
-(import (prefix sqlite3 sqlite3:)
-	posix typed-records srfi-18 srfi-1
-	srfi-69
-	stack
-	files
-	ports
-
-	commonmod
-	)
-
-;; (import debugprint)
+	  matchable
+  
+	  (prefix sqlite3 sqlite3:)
+	  posix typed-records
+
+	  srfi-18
+	  srfi-1
+	  srfi-69
+	  stack
+	  files
+	  ports
+	  
+	  commonmod
+	  debugprint
+	  )
+
+;; parameters
+;;
+(define dbfile:testsuite-name (make-parameter #f))
+
+(define keep-age-param        (make-parameter 10))      ;; qif file age, if over move to attic
+(define num-run-dbs           (make-parameter 10))      ;; number of db's in .mtdb
+(define dbfile:sync-method    (make-parameter 'attach)) ;; 'attach or 'original
+(define dbfile:cache-method   (make-parameter 'cachedb))  ;; 'direct
+(define dbcache-mode (make-parameter 'tmp)) ;; 'cachedb, 'tmp (changes what open cachedb routine does)
+
+
+;; 'original     - use old condition code
+;; 'suicide-mode - create mtrah/stop-the-train with info on what went wrong
+;; else use no condition code (should be production mode)
+;;
+(define no-condition-db-with-db (make-parameter 'suicide-mode))
 
 ;;======================================================================
 ;;  R E C O R D S
 ;;======================================================================
 
@@ -54,21 +75,32 @@
   (areapath  #f)
   (homehost  #f)
   (tmppath   #f)
   (read-only #f)
   (subdbs (make-hash-table))
+  ;;
+  ;; for the cachedb approach (see dbmod.scm)
+  ;; this is one db per server
+  (cachedb     #f)  ;; handle for the in memory copy
+  (dbfile    #f)  ;; path to the db file on disk
+  (dbfname   #f)  ;; short name of db file on disk (used to validate accessing correct db)
+  (ondiskdb  #f)  ;; handle for the on-disk file
+  (dbtmpname #f)  ;; path to db file in /tmp (non-imem method)
+  (dbdat     #f)  ;; create a dbdat for the downstream calls such as db:with-db
+  (last-update 0)
+  (sync-proc #f)
   )
 
 ;; NOTE: Need one dbr:subdb per main.db, 1.db ...
 ;;
 (defstruct dbr:subdb
-  (dbname      #f) ;; .megatest/1.db
-  (mtdbfile    #f) ;; mtrah/.megatest/1.db
+  (dbname      #f) ;; .mtdb/1.db
+  (mtdbfile    #f) ;; mtrah/.mtdb/1.db
   (mtdbdat     #f) ;; only need one of these for syncing
   ;; (dbdats      (make-hash-table))  ;; id => dbdat 
-  (tmpdbfile   #f) ;; /tmp/.../.megatest/1.db
-  ;; (refndbfile  #f) ;; /tmp/.../.megatest/1.db_ref
+  (tmpdbfile   #f) ;; /tmp/.../.mtdb/1.db
+  ;; (refndbfile  #f) ;; /tmp/.../.mtdb/1.db_ref
   (dbstack     (make-stack)) ;; stack for tmp dbr:dbdat,
   (homehost    #f) ;; not used yet
   (on-homehost #f) ;; not used yet
   (read-only   #f)
   (last-sync   0)
@@ -81,10 +113,16 @@
   (dbh         #f)    
   (stmt-cache  (make-hash-table))
   (read-only   #f)
   (birth-sec   (current-seconds)))
 
+;; used in simple-get-runs (thanks Brandon!)
+(define-record simple-run target id runname state status owner event_time)
+(define-record-printer (simple-run x out)
+  (fprintf out "#,(simple-run ~S ~S ~S ~S)"
+	   (simple-run-target x) (simple-run-id x) (simple-run-runname x) (time->string (seconds->local-time (simple-run-event_time x) ))))
+
 (define *dbstruct-dbs* #f)
 (define *db-open-mutex* (make-mutex))
 (define *db-access-mutex* (make-mutex)) ;; used in common.scm
 (define *no-sync-db*   #f)
 (define *db-sync-in-progress* #f)
@@ -92,10 +130,13 @@
 (define *max-api-process-requests* 0)
 (define *api-process-request-count* 0)
 (define *db-write-access*     #t)
 (define *db-last-sync*        0)                 ;; last time the sync to megatest.db happened
 (define *db-multi-sync-mutex* (make-mutex))      ;; protect access to *db-sync-in-progress*, *db-last-sync*
+(define *db-last-access*      (current-seconds))
+
+(define *db-transaction-mutex* (make-mutex))
 
 (define (db:generic-error-printout exn . message)
   (print-call-chain (current-error-port))
   (apply dbfile:print-err message)
   (dbfile:print-err
@@ -157,51 +198,37 @@
           )
           #f
   )
 )
 
-;; ;; set up a single db (e.g. main.db, 1.db ... etc.)
-;; ;;
-;; (define (db:setup-db dbstruct areapath run-id)
-;;   (let* ((dbname   (db:run-id->dbname run-id))
-;; 	 (dbstruct (hash-table-ref/default dbstructs dbname #f)))
-;;     (if dbstruct
-;; 	dbstruct
-;; 	(let* ((dbstruct-new (make-dbr:dbstruct)))
-;; 	  (db:open-db dbstruct-new run-id areapath: areapath do-sync: #t)
-;; 	  (hash-table-set! dbstructs dbname dbstruct-new)
-;; 	  dbstruct-new))))
-    
-;; ; Returns the dbdat for a particular dbfile inside the area
-;; ;;
-;; (define (dbr:dbstruct-get-dbdat dbstruct dbfile)
-;;   (hash-table-ref/default (dbr:dbstruct-dbdats dbstruct) dbfile #f))
-;; 
-;; (define (dbr:dbstruct-dbdat-put! dbstruct dbfile db)
-;;   (hash-table-set! (dbr:dbstruct-dbdats dbstruct) dbfile db))
-;; 
-;; (define (db:run-id->first-num run-id)
-;;   (let* ((s (number->string run-id))
-;; 	 (l (string-length s)))
-;;     (substring s (- l 1) l)))
-
-;; 1234 => 4/1234.db
-;;   #f => 0/main.db
-;;   (abandoned the idea of num/db)
-;; 
+(define (dbfile:make-tmpdir-name areapath)
+  (let* ((dname (conc "/tmp/"(current-user-name)"/" (string-translate areapath "/" "."))))
+    (create-directory dname #t)
+    dname))
+
 (define (dbfile:run-id->path apath run-id)
   (conc apath"/"(dbfile:run-id->dbname run-id)))
 
 (define (db:dbname->path apath dbname)
   (conc apath"/"dbname))
 
-;; POTENTIAL BUG: this implementation could produce a db file if run-id is neither #f or a number
-(define (dbfile:run-id->dbname run-id)
+(define (dbfile:run-id->dbnum run-id)
   (cond
-   ((number? run-id) (conc ".megatest/" (modulo run-id 100) ".db"))
-   ((not run-id)     (conc ".megatest/main.db"))
-   (else             run-id)))
+   ((number? run-id)
+    (modulo run-id (num-run-dbs)))
+   ((not run-id) "main")   ;; 0 or main? No, not 0. 
+   (else
+    (assert #f "FATAL: run-id is required to be a number or #f"))))
+
+;; just the filename
+(define (dbfile:run-id->dbfname run-id)
+  (conc (dbfile:run-id->dbnum run-id)".db"))
+
+;; the path in MTRAH with the filename
+(define (dbfile:run-id->dbname run-id)
+  (conc ".mtdb/"(dbfile:run-id->dbfname run-id)))
+
 
 ;; Make the dbstruct, setup up auxillary db's and call for main db at least once
 ;;
 ;; called in http-transport and replicated in rmt.scm for *local* access. 
 ;;
@@ -209,14 +236,12 @@
   (cond
    (*dbstruct-dbs*
     (dbfile:print-err "WARNING: dbfile:setup called when *dbstruct-dbs* is already initialized")
     *dbstruct-dbs*) ;; TODO: when multiple areas are supported, this optimization will be a hazard
    (else
-    (let* ((dbstruct (make-dbr:dbstruct)))
+    (let* ((dbstruct (make-dbr:dbstruct areapath: areapath tmppath: tmppath)))
       (set! *dbstruct-dbs* dbstruct)
-      (dbr:dbstruct-areapath-set! dbstruct areapath)
-      (dbr:dbstruct-tmppath-set!  dbstruct tmppath)
       dbstruct))))
 
 (define (dbfile:get-subdb dbstruct run-id)
   (let* ((dbfname (dbfile:run-id->dbname run-id)))
     (hash-table-ref/default (dbr:dbstruct-subdbs dbstruct) dbfname #f)))
@@ -224,16 +249,19 @@
 (define (dbfile:set-subdb dbstruct run-id subdb)
   (hash-table-set! (dbr:dbstruct-subdbs dbstruct) (dbfile:run-id->dbname run-id) subdb))
 
 ;; (define *dbfile:num-handles-in-use* 0)
 
-;; Get/open a database
+;; Get/open a database.
+;;
+;;    NOTE: most usage should call dbfile:open-db to get a dbdat
+;;
 ;;    if run-id => get run specific db
 ;;    if #f     => get main db
-;;    if run-id is a string treat it as a filename
-;;    if db already open - return inmem
-;;    if db not open, open inmem, rundb and sync then return inmem
+;;    if run-id is a string treat it as a filename - DON'T use this - we'll get rid of it.
+;;    if db already open - return cachedb
+;;    if db not open, open cachedb, rundb and sync then return cachedb
 ;;    inuse gets set automatically for rundb's
 ;;
 (define (dbfile:get-dbdat dbstruct run-id)
   (let* ((subdb (dbfile:get-subdb dbstruct run-id)))
     (if (stack-empty? (dbr:subdb-dbstack subdb))
@@ -241,12 +269,16 @@
 	(begin
 	  (stack-pop! (dbr:subdb-dbstack subdb))))))
 
 ;; return a previously opened db handle to the stack of available handles
 (define (dbfile:add-dbdat dbstruct run-id dbdat)
-  (let* ((subdb (dbfile:get-subdb dbstruct run-id)))
-    (stack-push! (dbr:subdb-dbstack subdb) dbdat)
+  (let* ((subdb (dbfile:get-subdb dbstruct run-id))
+	 (dbstk (dbr:subdb-dbstack subdb))
+	 (count (stack-count dbstk)))
+    (if (> count 15)
+	(dbfile:print-err "WARNING: stack for "run-id".db is "count"."))
+    (stack-push! dbstk dbdat)
     dbdat))
 
 ;; set up a subdb
 ;;
 (define (dbfile:init-subdb dbstruct run-id init-proc)
@@ -326,21 +358,21 @@
 (define (dbfile:print-err . params)
   (with-output-to-port
       (current-error-port)
     (lambda ()
       (apply print params))))
-    
+
 (define (dbfile:cautious-open-database fname init-proc sync-mode journal-mode #!optional (tries-left 500))
-  (let* ((busy-file  (conc fname"-journal"))
+  (let* ((busy-file  (conc fname "-journal"))
 	 (delay-time (* (- 51 tries-left) 1.1))
       	 (write-access (file-write-access? fname))
          (dir-access (file-write-access? (pathname-directory fname)))
          (retry      (lambda ()
 		       (thread-sleep! delay-time)
 		       (if (> tries-left 0)
 			   (dbfile:cautious-open-database fname init-proc
-							  sync-mode: sync-mode journal-mode: journal-mode
+							  sync-mode journal-mode
 							  (- tries-left 1))))))
     (assert (>= tries-left 0) (conc "FATAL: too many attempts in dbfile:cautious-open-database of "fname", giving up."))
     
     (if (and (file-write-access? fname)
 	     (file-exists? busy-file))
@@ -351,11 +383,11 @@
 	  (thread-sleep! 1)
 	  (if (eq? tries-left 2)
 	      (begin
 	  	(dbfile:print-err "INFO: forcing journal rollup "busy-file)
 	  	(dbfile:brute-force-salvage-db fname)))
-	  (dbfile:cautious-open-database fname init-proc sync-mode: sync-mode journal-mode: journal-mode (- tries-left 1)))
+	  (dbfile:cautious-open-database fname init-proc sync-mode journal-mode (- tries-left 1)))
 	
 	(let* ((result (condition-case
 		        (if dir-access
 			    (dbfile:with-simple-file-lock
 			     (conc fname ".lock")
@@ -402,37 +434,81 @@
 		    "cp "backupfname" "fname)))
     (dbfile:print-err "WARNING: attempting recovery of file "fname" by running commands:\n"
 		      "  "cmd)
     (system cmd)))
 
+;; opens and returns handle and nothing else
+;;
+;; NOTE: this is already protected by mutex *no-sync-db-mutex*
+;;
+(define (dbfile:raw-open-no-sync-db dbpath)
+  (if (not (file-exists? dbpath))
+      (create-directory dbpath #t))
+  (debug:print-info 0 *default-log-port* "Opening "dbpath"/no-sync.db")
+  (let* ((dbname    (conc dbpath "/no-sync.db"))
+	 (db-exists (file-exists? dbname))
+	 (init-proc (lambda (db)
+		      (sqlite3:with-transaction
+		       db
+		       (lambda ()
+			 ;; I have been having trouble with init of no-sync.db so
+			 ;; doing the init in a transaction every time (no gating
+			 ;; on file existance.
+			  (for-each
+			   (lambda (stmt)
+			     (sqlite3:execute db stmt))
+			   (list
+			    "CREATE TABLE IF NOT EXISTS no_sync_metadat
+                                (var TEXT,
+                                 val TEXT,
+                                   CONSTRAINT no_sync_metadat_constraint UNIQUE (var));"
+			    "CREATE TABLE IF NOT EXISTS no_sync_locks 
+                                (key TEXT,
+                                 val TEXT,
+                                   CONSTRAINT no_sync_metadat_constraint UNIQUE (key));"))))))
+	 (on-tmp      (equal? (car (string-split dbpath "/")) "tmp"))
+	 (db        (if on-tmp
+			(dbfile:cautious-open-database dbname init-proc 0 "WAL")
+			(dbfile:cautious-open-database dbname init-proc 0 #f)
+			;; (sqlite3:open-database dbname)
+			)))
+    (if on-tmp	      ;; done in cautious-open-database
+	(begin
+	  (sqlite3:execute db "PRAGMA synchronous = 0;")
+	  (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000))))
+    db))
+
+(define (dbfile:with-no-sync-db dbpath proc)
+  (mutex-lock! *no-sync-db-mutex*)
+  (let* ((already-open *no-sync-db*)
+	 (db  (or already-open (dbfile:raw-open-no-sync-db dbpath)))
+	 (res (proc db)))
+    (if (not already-open)
+	(sqlite3:finalize! db))
+    (mutex-unlock! *no-sync-db-mutex*)
+    res))
+
+(define *no-sync-db-mutex* (make-mutex))
 
 (define (dbfile:open-no-sync-db dbpath)
-  (if *no-sync-db*
-      *no-sync-db*
-      (begin
-	(if (not (file-exists? dbpath))
-	    (create-directory dbpath #t))
-	(let* ((dbname    (conc dbpath "/no-sync.db"))
-	       (db-exists (file-exists? dbname))
-	       (init-proc (lambda (db)
-			    (if (not db-exists)
-				(begin
-				  (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));"))
-				)))
-	       (db        (dbfile:cautious-open-database dbname init-proc 0 "WAL"))) ;; (sqlite3:open-database dbname)))
-	  ;; (sqlite3:execute db "PRAGMA synchronous = 0;")
-	  ;; (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)) ;; done in cautious-open-database
-	  (set! *no-sync-db* db)
-	  db))))
+  (mutex-lock! *no-sync-db-mutex*)
+  (let* ((res (if *no-sync-db*
+		  *no-sync-db*
+		  (let* ((db (dbfile:raw-open-no-sync-db dbpath)))
+		    (set! *no-sync-db* db)
+		    db))))
+    (mutex-unlock! *no-sync-db-mutex*)
+    res))
 
 (define (db:no-sync-set db var val)
   (sqlite3:execute db "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES (?,?);" var val))
 
 (define (db:no-sync-del! db var)
   (sqlite3:execute db "DELETE FROM no_sync_metadat WHERE var=?;" var))
 
 (define (db:no-sync-get/default db var default)
+  (assert (sqlite3:database? db) "FATAL: db:no-sync-get/default called with a bad db handle:" db)
   (let ((res default))
     (sqlite3:for-each-row
      (lambda (val)
        (set! res val))
      db
@@ -445,10 +521,72 @@
           (if newres
               newres
               res))
         res)))
 
+;; timestring+identifier+payload
+;; locks are unique on identifier, payload is informational
+(define (db:extract-time-identifier instr)
+  (let ((tokens (string-split instr "+")))
+    (match tokens
+      ((t i)(cons (string->number t) i))
+      ((t)  (cons (string->number t) #f))
+      (else
+       (assert #f "FATAL: db:extract-time-identifier handed bad data "instr)))))
+
+;; transaction protected lock aquisition
+;; either:
+;;    fails    returns  (#f lock-creation-time identifier)
+;;    succeeds (returns (#t lock-creation-time identifier)
+;; use (db:no-sync-del! db keyname) to release the lock
+;;
+(define (db:no-sync-get-lock-with-id db keyname identifier)
+  (sqlite3:with-transaction
+   db
+   (lambda ()
+     (condition-case
+      (let* ((curr-val (db:no-sync-get/default db keyname #f)))
+	(if curr-val
+	    (match (db:extract-time-identifier curr-val) ;; result->timestamp, identifier
+	       ((timestamp . ident)
+		(cons (equal? ident identifier) timestamp))
+	       (else (cons #f 'malformed-lock)))  ;; lock malformed
+	    (let ((curr-sec (current-seconds))
+		  (lock-value (if identifier
+				  (conc (current-seconds)"+"identifier)
+				  (current-seconds))))
+	      (sqlite3:execute db "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-value)
+	      (cons #t curr-sec))))
+      (exn (io-error)  (dbfile:print-err "ERROR: i/o error with no-sync db. Check permissions, disk space etc. and try again."))
+      (exn (corrupt)   (dbfile:print-err "ERROR: database no-sync db is corrupt. Repair it to proceed."))
+      (exn (busy)      (dbfile:print-err "ERROR: database no-sync db is locked. Try copying to another location, remove original and copy back."))
+      (exn (permission)(dbfile:print-err "ERROR: database no-sync db has some permissions problem."))
+      (exn () ;; (status done) ;; I don't know how to detect status done but no data!
+	   (dbfile:print-err "ERROR: Unknown error with database no-sync db message: exn="(condition->list exn)", \n"
+			     ((condition-property-accessor 'exn 'message) exn))
+	   (cons #f #f))))))
+
+(define (db:no-sync-check-lock db keyname identifier)
+  (let* ((curr-val (db:no-sync-get/default db keyname #f)))
+    (match (db:extract-time-identifier curr-val) ;; result->timestamp, identifier
+      ((timestamp . ident)
+       (cons (equal? ident identifier) ident))
+      (else  (cons #f 'no-lock)))))
+
+;; get the lock, wait 0.25 seconds and verify still have it.
+;; this should not be necessary given the use of transaction in
+;; db:no-sync-get-lock-with-id but it does seem to be needed
+;;
+(define (db:no-sync-lock-and-check db keyname identifier)
+  (let* ((result  (db:no-sync-get-lock-with-id db keyname identifier))
+	 (gotlock (car result)))
+    (if gotlock
+	(begin
+	  (thread-sleep! 0.25)
+	  (db:no-sync-check-lock db keyname identifier))
+	result)))
+    
 ;; transaction protected lock aquisition
 ;; either:
 ;;    fails    returns  (#f . lock-creation-time)
 ;;    succeeds (returns (#t . lock-creation-time)
 ;; use (db:no-sync-del! db keyname) to release the lock
@@ -513,11 +651,12 @@
 	  (db:sync-touched dbstruct runid keys dbinit)
 	  (set! *db-sync-in-progress* #f)
 	  (delete-file* lock-file)
 	  #t)
         (begin
-          (dbfile:print-err "INFO: could not get lock for " from-db-file ", sync likely in progress.")
+	  (if (common:low-noise-print 120 (conc "no lock "from-db-file))
+	      (dbfile:print-err "INFO: could not get lock for " from-db-file ", sync likely in progress."))
 	  #f
 	  ))))
 
 ;; ;; Get a lock from the no-sync-db for the from-db, then delta sync the from-db to the to-db, otherwise return #f
 ;; ;;
@@ -556,11 +695,11 @@
          (tmpdb     (db:open-db dbstruct run-id dbinit)) ;; sqlite3-db tmpdbfile #f))
 	 (start-t   (current-seconds)))
     (mutex-lock! *db-multi-sync-mutex*)
     (let ((update_info (cons "last_update" (if force-sync 0 *db-last-sync*) )))
       (mutex-unlock! *db-multi-sync-mutex*)
-      (db:sync-tables (db:sync-all-tables-list dbstruct keys) update_info tmpdb mtdb))
+      (db:sync-tables (db:sync-all-tables-list keys) update_info tmpdb mtdb))
     (mutex-lock! *db-multi-sync-mutex*)
     (set! *db-last-sync* start-t)
     (set! *db-last-access* start-t)
     (mutex-unlock! *db-multi-sync-mutex*)
     (dbfile:add-dbdat dbstruct run-id tmpdb)
@@ -619,12 +758,12 @@
 	 '("type"           #f)
          '("last_update"    #f))))
 
 ;; needs db to get keys, this is for syncing all tables
 ;;
-(define (db:sync-main-list dbstruct keys)
-  (let ((keys  keys)) ;; (db:get-keys dbstruct)))
+(define (db:sync-main-list keys)
+  (let ((keys  keys))
     (list
      (list "keys"
 	   '("id"        #f)
 	   '("fieldname" #f)
 	   '("fieldtype" #f))
@@ -658,14 +797,29 @@
 	   '("reviewed"       #f)
 	   '("iterated"       #f)
 	   '("avg_runtime"    #f)
 	   '("avg_disk"       #f)
 	   '("tags"           #f)
-	   '("jobgroup"       #f)))))
+	   '("jobgroup"       #f))
+
+
+     (list "tasks_queue"
+           '("id"            #f)
+           '("action"        #f)
+           '("owner"         #f) 
+           '("state"         #f)
+           '("target"        #f)
+           '("name"          #f)
+           '("testpatt"      #f)
+           '("keylock"       #f)
+           '("params"        #f)
+           '("creation_time" #f)
+           '("execution_time" #f))
+     )))
 
-(define (db:sync-all-tables-list dbstruct keys)
-  (append (db:sync-main-list dbstruct keys)
+(define (db:sync-all-tables-list keys)
+  (append (db:sync-main-list keys)
 	  db:sync-tests-only))
 
 ;; tbls is ( ("tablename" ( "field1" [#f|proc1] ) ( "field2" [#f|proc2] ) .... ) )
 ;; db's are dbdat's
 ;;
@@ -785,22 +939,27 @@
 	     fields)
 
 	    ;; read the source table
             ;; store a list of all rows in the table in fromdat, up to batch-len.
             ;; Then add fromdat to the fromdats list, clear fromdat and repeat.
-	    (sqlite3:for-each-row
-	     (lambda (a . b)
-	       (set! fromdat (cons (apply vector a b) fromdat))
-	       (if (> (length fromdat) batch-len)
-		   (begin
-		     (set! fromdats (cons fromdat fromdats))
-		     (set! fromdat  '())
-		     (set! totrecords (+ totrecords 1)))
-               )
-             )
-	     (dbr:dbdat-dbh fromdb)
-	     full-sel)
+            (sqlite3:with-transaction 
+              (dbr:dbdat-dbh fromdb)
+              (lambda ()
+	        (sqlite3:for-each-row
+	          (lambda (a . b)
+	            (set! fromdat (cons (apply vector a b) fromdat))
+	            (if (> (length fromdat) batch-len)
+		      (begin
+		        (set! fromdats (cons fromdat fromdats))
+		        (set! fromdat  '())
+		        (set! totrecords (+ totrecords 1)))
+                    )
+                 )
+	         (dbr:dbdat-dbh fromdb)
+	         full-sel)
+              )
+            )
 
              ;; Count less than batch-len as a record
              (if (> (length fromdat) 0)
                  (set! totrecords (+ totrecords 1)))
 
@@ -870,11 +1029,12 @@
           )
         )
 	tbls)
        (let* ((runtime      (- (current-milliseconds) start-time))
 	      (should-print (or ;; (debug:debug-mode 12)
-				(common:low-noise-print 120 "db sync" (> runtime 500))))) ;; low and high sync times treated as separate.
+			     (common:low-noise-print 120 "db sync")
+			     (> runtime 500)))) ;; low and high sync times treated as separate.
 	 (for-each 
 	  (lambda (dat)
 	    (let ((tblname (car dat))
 		  (count   (cdr dat)))
 	      (set! tot-count (+ tot-count count))
@@ -915,30 +1075,10 @@
                              FOR EACH ROW
                                BEGIN 
                                  UPDATE test_data SET last_update=(strftime('%s','now'))
                                    WHERE id=old.id;
                                END;" )))
-;;
-;; ADD run-id SUPPORT
-;;
-(define (db:create-all-triggers dbstruct)
-  (db:with-db
-   dbstruct #f #f
-   (lambda (dbdat db)
-     (db:create-triggers db))))
-
-(define (db:create-triggers db)
-    (for-each (lambda (key)
-              (sqlite3:execute db (cadr key)))
-          db:trigger-list))
-
-(define (db:drop-all-triggers dbstruct)
-  (db:with-db
-   dbstruct #f #f
-   (lambda (dbdat db)
-     (db:drop-triggers db))))
-
 (define (db:is-trigger-dropped db tbl-name)
   (let* ((trigger-name (if (equal? tbl-name "test_steps")
 			   "update_teststeps_trigger" 
                            (conc "update_" tbl-name "_trigger")))
 	 (res          #f))
@@ -988,58 +1128,144 @@
     ;; (mutex-unlock! *db-open-mutex*)
     dbdat))
 
 (define dbfile:db-init-proc (make-parameter #f))
 
-;; (db:with-db dbstruct run-id sqlite3:exec "select blah fgrom blaz;")
-;; r/w is a flag to indicate if the db is modified by this query #t = yes, #f = no
+;; in xmaxima this gives a curve close to what I want:
+;;    plot2d ((exp(x/1.2)-1)/300, [x, 0, 10])$
+;;    plot2d ((exp(x/1.5)-1)/40, [x, 0, 10])$
+;;    plot2d ((exp(x/5)-1)/40, [x, 0, 20])$
+(define (dbfile:droop x)
+  (/ (- (exp (/ x 5)) 1) 40))
+  ;; (* numqrys (/ 1 (qif-slope))))
+
+;; create a dropping near the db file in a qif dir
+;; use count of such files to gate queries (queries in flight)
 ;;
-(define (db:with-db dbstruct run-id r/w proc . params)
-  (let* ((use-mutex (> *api-process-request-count* 25)) ;; risk of db corruption
-	 (have-struct (dbr:dbstruct? dbstruct))
-         (dbdat     (if have-struct                ;; this stuff just allows us to call with a db handle directly
-			(db:open-db dbstruct run-id (dbfile:db-init-proc)) ;; (dbfile:get-subdb dbstruct run-id)
-			#f))
-	 (db        (if have-struct                ;; this stuff just allows us to call with a db handle directly
-			(dbr:dbdat-dbh dbdat)
-			dbstruct))
-	 (fname     (if dbdat
-			(dbr:dbdat-dbfile dbdat)
-			"nofilenameavailable"))
-	 (jfile     (conc fname"-journal"))
-	 #;(subdb     (if have-struct
-			(dbfile:get-subdb dbstruct run-id)
-			#f))
-	 ) ;; was 25
-    (if (file-exists? jfile)
-	(begin
-	  (dbfile:print-err "INFO: "jfile" exists, delaying to reduce database load")
-	  (thread-sleep! 0.2)))
-    (if (and use-mutex
-	     (common:low-noise-print 120 "over-50-parallel-api-requests"))
-	(dbfile:print-err *api-process-request-count* " parallel api requests being processed in process "
-			  (current-process-id))) ;;  ", throttling access"))
-    (condition-case
-	(begin
-	  (if use-mutex (mutex-lock! *db-with-db-mutex*))
-	  (let ((res (apply proc dbdat db params))) ;; the actual call is here.
-	    (if use-mutex (mutex-unlock! *db-with-db-mutex*))
-	    ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
-	    (if dbdat
-		(dbfile:add-dbdat dbstruct run-id dbdat))
-	    res))
-      (exn (io-error)
-	   (db:generic-error-printout exn "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again."))
-      (exn (corrupt)
-	   (db:generic-error-printout exn "ERROR: database " fname " is corrupt. Repair it to proceed."))
-      (exn (busy)
-	   (db:generic-error-printout exn "ERROR: database " fname
-				      " is locked. Try copying to another location, remove original and copy back."))
-      (exn (permission)(db:generic-error-printout exn "ERROR: database " fname " has some permissions problem."))
-      (exn ()
-	   (db:generic-error-printout exn "ERROR: Unknown error with database " fname " message: "
-				      ((condition-property-accessor 'exn 'message) exn))))))
+(define (dbfile:wait-for-qif fname run-id params)
+  (let* ((thedir  (pathname-directory fname))
+	 (dbnum   (dbfile:run-id->dbnum run-id))
+	 (destdir (conc thedir"/qif-"dbnum))
+	 (uniqn   (get-area-path-signature (conc dbnum params)))
+	 (crumbn  (conc destdir"/"(current-seconds)"-"uniqn"."(current-process-id))))
+    (if (not (file-exists? destdir))(create-directory (conc destdir"/attic") #t))
+    (let loop ((count 0))
+      (let* ((currlks (glob (conc destdir"/*")))
+	     (numqrys (length currlks))
+	     (delayval (cond ;; do a droopish curve
+			((> numqrys 25)
+			 (for-each
+			  (lambda (f)
+			    (if (> (- (current-seconds)
+				      (handle-exceptions
+					  exn
+					(current-seconds) ;; file is likely gone, just fake out
+					(file-modification-time f)))
+				   (keep-age-param))
+				(let* ((basedir (pathname-directory f))
+				       (filen   (pathname-file f))
+				       (destf   (conc basedir"/attic/"filen)))
+				  (dbfile:print-err "Moving qif file "f" older than 10 seconds to "destf)
+				  ;; (delete-file* f)
+				  (handle-exceptions
+				      exn
+				    #t
+				    (file-move f destf #t)))))
+			  currlks)
+			 4)
+			((> numqrys 0)  (dbfile:droop numqrys)) ;; slope of 1/100
+			(else #f))))
+	(if (and delayval
+		 (< count 5))
+	    (begin
+	      (thread-sleep! delayval)
+	      (loop (+ count 1))))))
+    (with-output-to-file crumbn
+      (lambda ()
+	(print fname" run-id="run-id" params="params)
+	))
+    crumbn))
+
+;; ;; (db:with-db dbstruct run-id sqlite3:exec "select blah fgrom blaz;")
+;; ;; r/w is a flag to indicate if the db is modified by this query #t = yes, #f = no
+;; ;;
+;; ;; Used only with http - to be removed
+;; ;;
+;; (define (dbfile:with-db dbstruct run-id r/w proc params)
+;;   (assert dbstruct "FATAL: db:with-db called with dbstruct "#f)
+;;   (assert (dbr:dbstruct? dbstruct) "FATAL: dbstruct is "dbstruct)
+;;   ;; Testing 2023, March 14th. I went from full time use of the mutext to no use at all and
+;;   ;; didn't see much change in the frequency of the messages:
+;;   ;; Warning (#<thread: thread14974>): in thread: (bind!) bad parameter or other API misuse
+;;   ;; allowing request count to go up to 1000 and other crashes showed up:
+;;   ;; Warning (#<thread: thread1889>): in thread: (deserialize) unexpected end of input: #<input port "(tcp)">
+;;   ;;
+;;   ;; leave it fully on for now, test later if there is a performance issue
+;;   ;;
+;;   (let* ((use-mutex   #t) ;;(> *api-process-request-count* 50)) ;; risk of db corruption
+;; 	 (have-struct (dbr:dbstruct? dbstruct))
+;;          (dbdat       (if have-struct                ;; this stuff just allows us to call with a db handle directly
+;; 			  (db:open-db dbstruct run-id (dbfile:db-init-proc)) ;; (dbfile:get-subdb dbstruct run-id)
+;; 			  #f))
+;; 	 (db          (if have-struct                ;; this stuff just allows us to call with a db handle directly
+;; 			  (dbr:dbdat-dbh dbdat)
+;; 			  dbstruct))
+;; 	 (fname       (if dbdat
+;; 			  (dbr:dbdat-dbfile dbdat)
+;; 			  "nofilenameavailable"))
+;; 	 (jfile       (conc fname"-journal"))
+;; 	 (qryproc     (lambda ()
+;; 			(if use-mutex (mutex-lock! *db-with-db-mutex*))
+;; 			(let ((res (apply proc dbdat db params))) ;; the actual call is here.
+;; 			  (if use-mutex (mutex-unlock! *db-with-db-mutex*))
+;; 			  ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
+;; 			  (if dbdat
+;; 			      (dbfile:add-dbdat dbstruct run-id dbdat))
+;; 			  ;; (delete-file* crumbfile)
+;; 			  res)))
+;; 	 (stop-train  (conc (dbr:dbstruct-areapath dbstruct)"/stop-the-train")))
+;; 
+;;     (assert (sqlite3:database? db) "FATAL: db:with-db, db is not a database, db="db
+;; 	    ", fname="fname)
+;;     (if (file-exists? jfile)
+;; 	(begin
+;; 	  (dbfile:print-err "INFO: "jfile" exists, delaying to reduce database load")
+;; 	  (thread-sleep! 0.2)))
+;;     (if (and use-mutex
+;; 	     (common:low-noise-print 120 "over-50-parallel-api-requests"))
+;; 	(dbfile:print-err *api-process-request-count*
+;; 			  " parallel api requests being processed in process "
+;; 			  (current-process-id))) ;;  ", throttling access"))
+;;     (case (no-condition-db-with-db)
+;;       ((production)(qryproc))
+;;       ((suicide-mode)
+;;        (handle-exceptions
+;; 	exn
+;; 	(with-output-to-file stop-train
+;; 	  (lambda ()
+;; 	    (db:generic-error-printout exn "Stop train mode, run-id: "run-id
+;; 				       " params: "params" proc: "proc)))
+;; 	(qryproc)))
+;;       (else
+;;        (condition-case
+;; 	(qryproc)
+;; 	(exn (io-error)
+;; 	     (db:generic-error-printout exn "ERROR: i/o error with "fname
+;; 					". Check permissions, disk space etc. and try again."))
+;; 	(exn (corrupt)
+;; 	     (db:generic-error-printout exn "ERROR: database "fname
+;; 					" is corrupt. Repair it to proceed."))
+;; 	(exn (busy)
+;; 	     (db:generic-error-printout exn "ERROR: database "fname
+;; 					" is locked. Try copying to another location,"
+;; 					" remove original and copy back."))
+;; 	(exn (permission)(db:generic-error-printout exn "ERROR: database "fname
+;; 						    " has some permissions problem."))
+;; 	(exn ()
+;; 	     (db:generic-error-printout exn "ERROR: Unknown error with database "fname
+;; 					" message: "
+;; 					((condition-property-accessor 'exn 'message) exn))))))))
 
 ;;======================================================================
 ;; another attempt at a transactionized queue
 ;;======================================================================
 
@@ -1112,34 +1338,45 @@
 (define (dbfile:simple-file-lock fname #!key (expire-time 300))
   (let ((fmod-time (handle-exceptions
 		       ext
 		     (current-seconds)
 		     (file-modification-time fname))))
+
+    ;; if the file exists, if it has expired, delete it and call this function recursively.
     (if (file-exists? fname)
 	(if (> (- (current-seconds) fmod-time) expire-time)
 	    (begin
+              (dbfile:print-err "simple-file-lock: removing expired file: " fname)
 	      (handle-exceptions exn #f (delete-file* fname))	
 	      (dbfile:simple-file-lock fname expire-time: expire-time))
-	    #f)
-	(let ((key-string (conc (get-host-name) "-" (current-process-id)))
+	    #f
+        )
+
+        ;; If it doesn't exist, write the host name and process id to the file
+	(let ((key-string (conc (get-host-name) "-" (current-process-id) ": " (argv)))
 	      (oup        (open-output-file fname)))
 	  (with-output-to-port
 	      oup
 	    (lambda ()
 	      (print key-string)))
 	  (close-output-port oup)
-	  #;(with-output-to-file fname ;; bizarre. with-output-to-file does not seem to be cleaning up after itself.
-	    (lambda ()
-	  (print key-string)))
+
+
+          ;; sleep 3 seconds and make sure it still exists and contains the same host/process id.
+          ;; if not, return #f
 	  (thread-sleep! 0.25)
 	  (if (file-exists? fname)
 	      (handle-exceptions exn
                 #f 
                 (with-input-from-file fname
 	  	  (lambda ()
 		    (equal? key-string (read-line)))))
-	      #f)
+              (begin
+                 (dbfile:print-err "dbfile:simple-file-lock created " fname " but it was gone 3 seconds later")
+	         #f
+              )
+          )
        )
     )
   )
 )
 
@@ -1159,13 +1396,44 @@
       exn
       #f ;; I don't really care why this failed (at least for now)
     (delete-file* fname)))
 
 (define (dbfile:with-simple-file-lock fname proc #!key (expire-time 300))
-  (let ((gotlock (dbfile:simple-file-lock-and-wait fname expire-time: expire-time)))
+  (let ((start-time (current-seconds))
+        (gotlock (dbfile:simple-file-lock-and-wait fname expire-time: expire-time))
+        (end-time (current-seconds))
+        )
     (if gotlock
 	(let ((res (proc)))
 	  (dbfile:simple-file-release-lock fname)
 	  res)
-	(assert #t "FATAL: simple file lock never got a lock."))))
-  
+        (begin
+          (dbfile:print-err "dbfile:with-simple-file-lock: " fname " is locked by "
+			    (with-input-from-file fname
+			      (lambda ()
+				(dbfile:print-err (read-line)))))
+          (dbfile:print-err "wait time = " (- end-time start-time))
+	  (dbfile:print-err "ERROR: simple file lock could not get a lock for " fname " in " expire-time " seconds")
+          #f
+        )
+    )
+  )
+)
+
+
+(define *get-cache-stmth-mutex* (make-mutex))
+
+(define (db:get-cache-stmth dbdat db stmt)
+  (mutex-lock! *get-cache-stmth-mutex*)
+  (let* (;; (dbdat       (dbfile:get-dbdat dbstruct run-id))
+	 (stmt-cache  (dbr:dbdat-stmt-cache dbdat))
+	 ;; (stmth       (db:hoh-get stmt-cache db stmt))
+	 (stmth       (hash-table-ref/default stmt-cache stmt #f))
+	 (result      (or stmth
+			  (let* ((newstmth (sqlite3:prepare db stmt)))
+			    ;; (db:hoh-set! stmt-cache db stmt newstmth)
+			    (hash-table-set! stmt-cache stmt newstmth)
+			    newstmth))))
+    (mutex-unlock! *get-cache-stmth-mutex*)
+    result))
+
 )

Index: dbmod.scm
==================================================================
--- dbmod.scm
+++ dbmod.scm
@@ -1,5 +1,6 @@
+
 ;;======================================================================
 ;; Copyright 2017, Matthew Welland.
 ;; 
 ;; This file is part of Megatest.
 ;; 
@@ -17,40 +18,801 @@
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;======================================================================
 
 (declare (unit dbmod))
+(declare (uses dbfile))
+(declare (uses commonmod))
+(declare (uses debugprint))
 
 (module dbmod
 	*
 	
-(import scheme chicken data-structures extras)
-(import (prefix sqlite3 sqlite3:)
-	posix typed-records srfi-18
-	srfi-69)
-
-(define (db:run-id->dbname run-id)
-  (cond
-   ((number? run-id)(conc run-id ".db"))
-   ((not run-id)    "main.db")
-   (else            run-id)))
-
-
-;;======================================================================
-;; hash of hashs
-;;======================================================================
-
-
-(define (db:hoh-set! dat key1 key2 val)
-  (let* ((subhash (hash-table-ref/default dat key1 #f)))
-    (if subhash
-	(hash-table-set! subhash key2 val)
-	(begin
-	  (hash-table-set! dat key1 (make-hash-table))
-	  (db:hoh-set! dat key1 key2 val)))))
-
-(define (db:hoh-get dat key1 key2)
-  (let* ((subhash (hash-table-ref/default dat key1 #f)))
-    (and subhash
-	 (hash-table-ref/default subhash key2 #f))))
-
+(import scheme
+	chicken
+	data-structures
+	extras
+	files
+
+	(prefix sqlite3 sqlite3:)
+	matchable
+	posix
+	typed-records
+	srfi-1
+	srfi-18
+	srfi-69
+
+	commonmod
+	dbfile
+	debugprint
+	)
+
+;; NOTE: This returns only the name "1.db", "main.db", not the path
+;;
+(define (dbmod:run-id->dbfname run-id)
+  (conc (dbfile:run-id->dbnum run-id)".db"))
+
+(define (dbmod:get-dbdir dbstruct)
+  (let* ((areapath (dbr:dbstruct-areapath dbstruct))
+	 (dbdir    (conc areapath"/.mtdb")))
+    (if (and (file-write-access? areapath)
+	     (not (file-exists? dbdir)))
+	(create-directory dbdir))
+    dbdir))
+
+(define (dbmod:run-id->full-dbfname dbstruct run-id)
+  (conc (dbmod:get-dbdir dbstruct
+
+			 run-id
+
+			 )"/"(dbmod:run-id->dbfname run-id)))
+
+;;======================================================================
+;; Read-only cachedb cached direct from disk method
+;;======================================================================
+
+(define *dbmod:nfs-db-handles* (make-hash-table)) ;; dbfname -> dbstruct
+
+;; called in rmt.scm nfs-transport-handler
+(define (dbmod:nfs-get-dbstruct run-id keys init-proc areapath)
+  (assert areapath "FATAL: dbmod:nfs-get-dbstruct called without areapath set.")
+  (let* ((dbfname  (dbmod:run-id->dbfname run-id))
+	 (dbstruct (hash-table-ref/default *dbmod:nfs-db-handles* dbfname #f)))
+    (if dbstruct
+	dbstruct
+	(let* ((newdbstruct (dbmod:open-dbmoddb areapath run-id dbfname init-proc keys syncdir: 'fromdisk)))
+	  (hash-table-set! *dbmod:nfs-db-handles* dbfname newdbstruct)
+	  newdbstruct))))
+
+;;======================================================================
+;; The cachedb one-db file per server method goes in here
+;;======================================================================
+
+;; NOTE: the r/w is now w/r, #t=db modified by query, #f=db NOT modified by query
+(define (dbmod:with-db dbstruct run-id w/r proc params)
+  (let* ((use-mutex (or (and w/r ;; use the mutex on queries that modify the db and for sync to disk
+			     (> *api-process-request-count* 5)) ;; when writes are happening throttle more
+			(> *api-process-request-count* 50)))
+	 (dbdat     (dbmod:open-db dbstruct run-id (dbfile:db-init-proc)))
+	 (dbh       (dbr:dbdat-dbh dbdat)) ;; this will be the cachedb handle
+	 (dbfile    (dbr:dbdat-dbfile dbdat)))
+    ;; if nfs mode do a sync if delta > 2
+    (let* ((last-update (dbr:dbstruct-last-update dbstruct))
+	   (sync-proc   (dbr:dbstruct-sync-proc dbstruct))
+	   (curr-secs   (current-seconds)))
+      (if (and (not (eq? (dbfile:cache-method) 'none)) ;; used by dashboard, no need for sync
+	       (> (- curr-secs last-update) 5))
+	  (begin
+	    (sync-proc last-update)
+
+	    ;; MOVE THIS CALL TO INSIDE THE sync-proc CALL
+	    (dbr:dbstruct-last-update-set! dbstruct curr-secs)
+	    )))
+    (assert (sqlite3:database? dbh) "FATAL: bad db handle in dbmod:with-db") 
+    (if use-mutex (mutex-lock! *db-with-db-mutex*))
+    (let* ((res (apply proc dbdat dbh params)))
+      (if use-mutex (mutex-unlock! *db-with-db-mutex*))
+      res)))
+
+(define (db:with-db dbstruct run-id w/r proc . params)
+  (dbmod:with-db dbstruct run-id w/r proc params))
+
+;; 
+(define (dbmod:open-cachedb-db init-proc dbfullname)
+  (let* ((db      (if dbfullname
+		      (dbmod:safely-open-db dbfullname init-proc #t)
+		      (sqlite3:open-database ":memory:")))
+	 (handler (sqlite3:make-busy-timeout 3600)))
+    (sqlite3:set-busy-handler! db handler)
+    (init-proc db)
+    db))
+
+(define (dbmod:open-db dbstruct run-id dbinit)
+  (or (dbr:dbstruct-dbdat dbstruct)
+      (let* ((dbdat (make-dbr:dbdat
+		     dbfile: (dbr:dbstruct-dbfile dbstruct)
+		     dbh:    (dbr:dbstruct-cachedb  dbstruct)
+		     )))
+	(dbr:dbstruct-dbdat-set! dbstruct dbdat)
+	dbdat)))
+
+;; NOT USED?
+(define (dbmod:need-on-disk-db-handle)
+    (case (dbfile:cache-method)
+      ((none tmp) #t)
+      ((cachedb)
+       (case (dbfile:sync-method)
+	 ((original) #t)
+	 ((attach)   #t) ;; we need it to force creation of the on-disk file - FIXME
+	 (else
+	  (debug:print 0 *default-log-port* "Unknown dbfile:sync-method setting: "
+		       (dbfile:sync-method)))))
+      (else
+       (debug:print 0 *default-log-port* "Unknown dbfile:cache-method setting: "
+		    (dbfile:cache-method))
+       #f)))
+
+(define (dbmod:safely-open-db dbfullname init-proc write-access)
+  (dbfile:with-simple-file-lock
+   (conc dbfullname".lock")
+   (lambda ()
+     (let* ((dbexists (file-exists? dbfullname))
+	    (db       (sqlite3:open-database dbfullname))
+	    (handler  (sqlite3:make-busy-timeout 136000)))
+       (sqlite3:set-busy-handler! db handler)
+       (if (and dbexists
+		write-access)
+	   (init-proc db))
+       db))))
+
+(define *sync-in-progress* #f)
+
+;; Open the cachedb db and the on-disk db
+;; populate the cachedb db with data
+;;
+;; Updates fields in dbstruct
+;; Returns dbstruct
+;;
+;; * This routine creates the db if not found
+;; * Probably can get rid of the dbstruct-in
+;; 
+(define (dbmod:open-dbmoddb areapath run-id dbfname-in init-proc keys
+			    #!key (dbstruct-in #f)
+			    ;; (dbcontext 'megatest) ;; use dashboard to do the dashboard 
+			    (syncdir 'todisk)) ;; todisk is used when caching in /tmp and writing data back to MTRAH
+  (let* ((dbstruct     (or dbstruct-in (make-dbr:dbstruct areapath: areapath)))
+	 (dbfname      (or dbfname-in (dbmod:run-id->dbfname run-id)))
+	 (dbpath       (dbmod:get-dbdir dbstruct))             ;; directory where all the .db files are kept
+	 (dbfullname   (conc dbpath"/"dbfname)) ;; (dbmod:run-id->full-dbfname dbstruct run-id))
+	 (dbexists     (file-exists? dbfullname))
+	 (tmpdir       (dbfile:make-tmpdir-name areapath))
+	 (tmpdb        (let* ((fname (conc tmpdir"/"dbfname)))
+			 fname))
+	 (cachedb        (dbmod:open-cachedb-db init-proc
+					    ;; (if (eq? (dbfile:cache-method) 'cachedb)
+					    ;; 	#f
+					    tmpdb
+					    ;; )
+					    ))
+	 (write-access (file-write-access? dbpath))
+	 (db           (dbmod:safely-open-db dbfullname init-proc write-access))
+	 (tables       (db:sync-all-tables-list keys)))
+    (if (not (and (sqlite3:database? cachedb)
+		  (sqlite3:database? db)))
+	(begin
+	  (debug:print 0 *default-log-port* "ERROR: Failed to properly open "dbfname-in", exiting immediately.")
+	  (exit)))    ;; (assert (sqlite3:database? cachedb) "FATAL: open-dbmoddb: cachedb is not a db")
+    ;; (assert (sqlite3:database? db) "FATAL:  open-dbmoddb: db is not a db")
+    (dbr:dbstruct-cachedb-set!     dbstruct cachedb)
+    (dbr:dbstruct-ondiskdb-set!  dbstruct db)
+    (dbr:dbstruct-dbfile-set!    dbstruct dbfullname)
+    (dbr:dbstruct-dbtmpname-set! dbstruct tmpdb)
+    (dbr:dbstruct-dbfname-set!   dbstruct dbfname)
+    (dbr:dbstruct-sync-proc-set! dbstruct
+				 (lambda (last-update)
+				   (if *sync-in-progress*
+				       (debug:print 3 *default-log-port* "WARNING: overlapping calls to sync to disk")
+				       (let* ((sync-cmd (conc "megatest -db2db -from "tmpdb" -to "dbfullname"&"))
+					      (synclock-file     (conc dbfullname".lock"))
+					      (synclock-mod-time (if (file-exists? synclock-file)
+								     (handle-exceptions
+									 exn
+								       #f
+								       (file-modification-time synclock-file))
+								     #f))
+					      (thethread         (lambda ()
+								   (thread-start!
+								    (make-thread
+								     (lambda ()
+								       (set! *sync-in-progress* #t)
+								       (debug:print-info "Running "sync-cmd)
+								       (system sync-cmd)
+								       (set! *sync-in-progress* #f)))))))
+					 (if (< (file-modification-time tmpdb)
+						(file-modification-time dbfullname))
+					     (debug:print 0 *default-log-port* "Skipping sync, "tmpdb" older than "dbfullname)
+					     (if synclock-mod-time
+						 (if (< (- (current-seconds) synclock-mod-time) 20) ;; something wrong with sync, remove file
+						     (begin
+						       (handle-exceptions
+							   exn
+							 #f
+							 (delete-file synclock-file))
+						       (thethread))
+						     (debug:print 0 *default-log-port* "Skipping sync, lockfile "synclock-file" found."))
+						 (thethread)))))))
+    ;; (dbmod:sync-tables tables #f db cachedb)
+    ;; (if db
+    (dbmod:sync-gasket tables #f cachedb db dbfullname 'fromdest keys) ;; ) ;; load into cachedb
+    (dbr:dbstruct-last-update-set! dbstruct (current-seconds)) ;; should this be offset back in time by one second?
+    dbstruct))
+
+;;    (if (eq? syncdir 'todisk) ;; sync to disk normally, sync from in dashboard
+;;        (dbmod:sync-tables tables last-update cachedb db)
+;;        (dbmod:sync-tables tables last-update db cachedb))))
+;;
+;; direction: 'fromdest 'todest
+;;
+(define (dbmod:sync-gasket tables last-update cachedb dbh dbfname direction keys)
+  (assert (sqlite3:database? cachedb) "FATAL: sync-gasket: cachedb is not a db")
+  (assert (sqlite3:database? cachedb) "FATAL: sync-gasket: dbh is not a db")
+  (debug:print-info 0 *default-log-port* "dbmod:sync-gasket called with sync-method="(dbfile:sync-method))
+  (case (dbfile:sync-method)
+    ((none) #f)
+    ((attach)
+     (dbmod:attach-sync tables cachedb dbfname direction))
+    ((newsync)
+     (dbmod:new-sync tables cachedb dbh dbfname direction))
+    (else
+     (case direction
+       ((todisk) ;; i.e. from the cache db to the mtrah db
+	(dbmod:sync-tables tables last-update keys cachedb dbh))
+       (else
+	(dbmod:sync-tables tables last-update keys dbh cachedb))))))
+
+(define (dbmod:close-db dbstruct)
+  ;; do final sync to disk file
+  ;; (do-sync ...)
+  (sqlite3:finalize! (dbr:dbstruct-ondiskdb dbstruct)))
+
+;;======================================================================
+;; Sync db
+;;======================================================================
+
+(define (dbmod:calc-use-last-update has-last-update fields last-update)
+  (cond
+   ((and has-last-update
+	 (member "last_update" fields))
+    #t) ;; if given a number, just use it for all fields
+   ((number? last-update) #f) ;; if not matched first entry then ignore last-update for this table
+   ((and (pair? last-update)
+	 (member (car last-update)    ;; last-update field name
+		 (map car fields)))
+    #t)
+   ((and last-update (not (pair? last-update)) (not (number? last-update)))
+    (debug:print 0 *default-log-port* "ERROR: parameter last-update for db:sync-tables must be a pair or a number, received: " last-update);; found in fields
+    #f)
+   (else
+    #f)))
+
+;; tbls is ( ("tablename" ( "field1" [#f|proc1] ) ( "field2" [#f|proc2] ) .... ) )
+;; dbs are sqlite3 db handles
+;;
+;; if last-update specified ("field-name" . time-in-seconds)
+;;    then sync only records where field-name >= time-in-seconds
+;;    IFF field-name exists
+;;
+;; Use (db:sync-all-tables-list keys) to get the tbls input
+;;
+(define (dbmod:sync-tables tbls last-update keys fromdb todb)
+  (debug:print-info 2 *default-log-port* "dbmod:sync-tables called, from: "fromdb", to: "todb)
+  (assert (sqlite3:database? fromdb) "FATAL: dbmod:sync-tables called with fromdb not a database" fromdb)
+  (assert (sqlite3:database? todb) "FATAL: dbmod:sync-tables called with fromdb not a database" todb)
+  (let ((stmts       (make-hash-table)) ;; table-field => stmt
+	(all-stmts   '())              ;; ( ( stmt1 value1 ) ( stml2 value2 ))
+	(numrecs     (make-hash-table))
+	(start-time  (current-milliseconds))
+	(tot-count   0))
+    (for-each ;; table
+     (lambda (tabledat)
+       (let* ((tablename        (car tabledat))
+	      (fields           (cdr tabledat))
+	      (has-last-update  (member "last_update" fields))
+	      (use-last-update  (dbmod:calc-use-last-update has-last-update fields last-update))
+	      (last-update-value (if use-last-update ;; no need to check for has-last-update - it is already accounted for
+				     (if (number? last-update)
+					 last-update
+					 (cdr last-update))
+				     #f))
+	      (last-update-field (if use-last-update
+				     (if (number? last-update)
+					 "last_update"
+					 (car last-update))
+				     #f))
+	      (num-fields (length fields))
+	      (field->num (make-hash-table))
+	      (num->field (apply vector (map car fields))) ;; BBHERE
+	      (full-sel   (conc "SELECT " (string-intersperse (map car fields) ",") 
+				" FROM " tablename (if use-last-update ;; apply last-update criteria
+						       (conc " WHERE " last-update-field " >= " last-update-value)
+						       "")
+				";"))
+	      (full-ins   (conc "INSERT OR REPLACE INTO " tablename " ( " (string-intersperse (map car fields) ",") " ) "
+				" VALUES ( " (string-intersperse (make-list num-fields "?") ",") " );"))
+	      (fromdat    '())
+	      (fromdats   '())
+	      (totrecords 0)
+	      (batch-len  100) ;; (string->number (or (configf:lookup *configdat* "sync" "batchsize") "100")))
+	      (todat      (make-hash-table))
+	      (count      0)
+              (field-names (map car fields)))
+	 
+	 ;; set up the field->num table
+	 (for-each
+	  (lambda (field)
+	    (hash-table-set! field->num field count)
+	    (set! count (+ count 1)))
+	  fields)
+	 
+	 ;; read the source table
+         ;; store a list of all rows in the table in fromdat, up to batch-len.
+         ;; Then add fromdat to the fromdats list, clear fromdat and repeat.
+	 (sqlite3:for-each-row
+	  (lambda (a . b)
+	       (set! fromdat (cons (apply vector a b) fromdat))
+	       (if (> (length fromdat) batch-len)
+		   (begin
+		     (set! fromdats (cons fromdat fromdats))
+		     (set! fromdat  '())
+		     (set! totrecords (+ totrecords 1)))))
+	  fromdb
+	  full-sel)
+	 
+         ;; Count less than batch-len as a record
+         (if (> (length fromdat) 0)
+             (set! totrecords (+ totrecords 1)))
+	 
+	 ;; tack on remaining records in fromdat
+	 (if (not (null? fromdat))
+	     (set! fromdats (cons fromdat fromdats)))
+	 
+	 (sqlite3:for-each-row
+	  (lambda (a . b)
+	    (hash-table-set! todat a (apply vector a b)))
+	  todb
+	  full-sel)
+	 
+	 ;; first pass implementation, just insert all changed rows
+	 
+	 (let* ((db                 todb)
+                (drp-trigger        (if (member "last_update" field-names)
+					(db:drop-trigger db tablename) 
+					#f))
+		(has-last-update    (member "last_update" field-names))
+                (is-trigger-dropped (if has-last-update
+                                        (db:is-trigger-dropped db tablename)
+					#f)) 
+		(stmth  (sqlite3:prepare db full-ins))
+                (changed-rows 0))
+	   (for-each
+	    (lambda (fromdat-lst)
+	      (mutex-lock! *db-transaction-mutex*)
+	      (sqlite3:with-transaction
+	       db
+	       (lambda ()
+		 (for-each ;; 
+		  (lambda (fromrow)
+		    (let* ((a    (vector-ref fromrow 0))
+			   (curr (hash-table-ref/default todat a #f))
+			   (same #t))
+		      (let loop ((i 0))
+			(if (or (not curr)
+				(not (equal? (vector-ref fromrow i)(vector-ref curr i))))
+			    (set! same #f))
+			(if (and same
+				 (< i (- num-fields 1)))
+			    (loop (+ i 1))))
+		      (if (not same)
+			  (begin
+			    (apply sqlite3:execute stmth (vector->list fromrow))
+			    (hash-table-set! numrecs tablename (+ 1 (hash-table-ref/default numrecs tablename 0)))
+			    (set! changed-rows (+ changed-rows 1))))))
+		  fromdat-lst)))
+	      (mutex-unlock! *db-transaction-mutex*))
+	    fromdats)
+		       ;; (debug:print 0 *default-log-port* "row="row)
+	   
+	   (sqlite3:finalize! stmth)
+           (if (member "last_update" field-names)
+               (db:create-trigger db tablename)))
+	 ))
+     tbls)
+    (let* ((runtime      (- (current-milliseconds) start-time))
+	   (should-print (or ;; (debug:debug-mode 12)
+			  (common:low-noise-print 120 "db sync")
+			  (> runtime 500)))) ;; low and high sync times treated as separate.
+      (for-each 
+       (lambda (dat)
+	 (let ((tblname (car dat))
+	       (count   (cdr dat)))
+	   (set! tot-count (+ tot-count count)))) 
+       (sort (hash-table->alist numrecs)(lambda (a b)(> (cdr a)(cdr b))))))
+    tot-count))
+
+(define (has-last-update dbh tablename)
+  (let* ((has-last #f))
+    (sqlite3:for-each-row
+     (lambda (name)
+       (if (equal? name "last_update")
+	   (set! has-last #t)))
+     dbh
+     (conc "SELECT name FROM pragma_table_info('"tablename"') as tblInfo;"))
+    has-last))
+
+;; tbls is ( ("tablename" ( "field1" [#f|proc1] ) ( "field2" [#f|proc2] ) .... ) )
+;;
+;; direction = fromdest, todest
+;; mode = 'full, 'incr
+;;
+;; Idea: youngest in dest is last_update time
+;;
+(define (dbmod:attach-sync tables dbh destdbfile direction #!key
+			   (mode 'full)
+			   (no-update '("keys")) ;; do
+			   )
+  (debug:print 0 *default-log-port* "Doing sync "direction" "destdbfile)
+  (if (not (sqlite3:auto-committing? dbh))
+      (debug:print 0 *default-log-port* "Skipping sync due to transaction in flight.")
+      (let* ((table-names  (map car tables))
+	     (dest-exists  (file-exists? destdbfile)))
+	(assert dest-exists "FATAL: sync called with non-existant file, "destdbfile)
+	;; attach the destdbfile
+	;; for each table
+	;;    insert into dest.<table> select * from src.<table> where last_update>last_update
+	;; done
+	(debug:print 0 *default-log-port* "Attaching "destdbfile" as auxdb")
+	(sqlite3:execute dbh (conc "ATTACH '"destdbfile"' AS auxdb;"))
+	(for-each
+	 (lambda (table)
+	   (let* ((tbldat (alist-ref table tables equal?))
+		  (fields (map car tbldat))
+		  (no-id-fields (filter (lambda (x)(not (equal? x "id"))) fields))
+		  (fields-str (string-intersperse fields ","))
+		  (no-id-fields-str (string-intersperse no-id-fields ","))
+		  (dir    (eq? direction 'todest))
+		  (fromdb (if dir "" "auxdb."))
+		  (todb   (if dir "auxdb." ""))
+		  (set-str (string-intersperse
+			    (map (lambda (field)
+				   (conc fromdb field"="todb field))
+				 fields)
+			    ","))
+		  (stmt1 (conc "INSERT OR IGNORE INTO "todb table
+			       " SELECT * FROM "fromdb table";"))
+		  (stmt8 (conc "UPDATE "todb table" SET ("no-id-fields-str") = (SELECT "no-id-fields-str" FROM "fromdb table" WHERE "todb table".id="fromdb table".id"
+			       (if (member "last_update" fields)
+				   (conc " AND "fromdb table".last_update > "todb table".last_update);")
+				   ");")))
+		  (start-ms (current-milliseconds)))
+	     ;; (debug:print 0 *default-log-port* "stmt8="stmt8)
+	     ;; (if (sqlite3:auto-committing? dbh)
+	     ;; (begin
+	     (mutex-lock! *db-transaction-mutex*)
+	     (sqlite3:with-transaction
+	      dbh
+	      (lambda ()
+		(debug:print-info 0 *default-log-port* "Sync from "fromdb table" to "todb table" using "stmt1)
+		(sqlite3:execute dbh stmt1)    ;; get all new rows
+		
+		#;(if (member "last_update" fields)
+		(sqlite3:execute dbh stmt8))    ;; get all updated rows
+		;; (sqlite3:execute dbh stmt5)
+		;; (sqlite3:execute dbh stmt4) ;; if it worked this would be better for incremental up
+		;; (sqlite3:execute dbh stmt6)
+		))
+	     (debug:print 0 *default-log-port* "Synced table "table
+			  " in "(- (current-milliseconds) start-ms)"ms") ;; )
+	     (mutex-unlock! *db-transaction-mutex*)))
+	     
+	 ;; (debug:print 0 *default-log-port* "Skipping sync of table "table" due to transaction in flight."))))
+	 table-names)
+	(sqlite3:execute dbh "DETACH auxdb;"))))
+
+;; FAILED ATTEMPTS
+
+	     ;; (if (not (has-last-update dbh table))
+	     ;;     (sqlite3:execute dbh (conc "ALTER TABLE "table" ADD COLUMN last_update INTEGER;")))
+	     ;; (if (not (has-last-update dbh (conc "auxdb."table)))
+	     ;;     (sqlite3:execute dbh (conc "ALTER TABLE auxdb."table" ADD COLUMN last_update INTEGER;")))
+
+                ;; (stmt2 (conc "INSERT OR REPLACE INTO "todb table
+		;; 	       " SELECT * FROM "fromdb table" WHERE "
+		;; 	       fromdb table".last_update > "
+		;; 	       todb table".last_update;"))
+		;; (stmt3 (conc "INSERT OR REPLACE INTO "todb"."table
+		;; 	       " SELECT * FROM "fromdb table";"))
+		;; (stmt4 (conc "DELETE FROM "todb table" WHERE "fromdb
+		;; 	       table ".last_update > "todb table".last_update;"))
+		;; (stmt5 (conc "DELETE FROM "todb table";"))
+		;; (stmt6 (conc "INSERT OR REPLACE INTO "todb table" ("fields-str") SELECT "fields-str" FROM "fromdb table";"))
+		;; (stmt7 (conc "UPDATE "todb table" SET "set-str (if (member "last_update" fields)
+		;; 						     (conc " WHERE "fromdb table".last_update > "todb table".last_update;")
+		;; 						     ";")))
+
+;; prefix is "" or "auxdb."
+;;
+;; (define (dbmod:last-update-patch dbh prefix)
+;;   (let ((
+  
+;; tbls is ( ("tablename" ( "field1" [#f|proc1] ) ( "field2" [#f|proc2] ) .... ) )
+;;
+;; direction = fromdest, todest
+;; mode = 'full, 'incr
+;;
+;; Idea: youngest in dest is last_update time
+;;
+(define (dbmod:new-sync tables dbh1 dbh2 destdbfile direction #!key
+			   (mode 'full))
+  (debug:print 0 *default-log-port* "Doing new-sync "direction" "destdbfile)
+  (if (not (sqlite3:auto-committing? dbh1))
+      (debug:print 0 *default-log-port* "Skipping sync due to transaction in flight.")
+      (let* ((table-names  (map car tables))
+	     (dest-exists  (file-exists? destdbfile)))
+	(assert dest-exists "FATAL: sync called with non-existant file, "destdbfile)
+	(for-each
+	 (lambda (table)
+	   (let* ((tbldat (alist-ref table tables equal?))
+		  (fields (map car tbldat))
+		  (no-id-fields (filter (lambda (x)(not (equal? x "id"))) fields))
+		  (questionmarks    (string-intersperse (make-list (length no-id-fields) "?") ","))
+		  (fields-str       (string-intersperse fields ","))
+		  (no-id-fields-str (string-intersperse no-id-fields ","))
+		  (dir    (eq? direction 'todest))
+		  (fromdb (if dir dbh1 dbh2))
+		  (todb   (if dir dbh2 dbh1))
+		  (set-str (string-intersperse
+			    (map (lambda (field)
+				   (conc fromdb field"="todb field))
+				 fields)
+			    ","))
+		  ;; (stmt1 (conc "INSERT OR IGNORE INTO "todb table
+		  ;; 	       " SELECT * FROM "fromdb table";"))
+		  ;; (stmt8 (conc "UPDATE "todb table" SET ("no-id-fields-str") = (SELECT "no-id-fields-str" FROM "fromdb table " WHERE "todb table".id="fromdb table".id"
+		  ;; 	       (if (member "last_update" fields)
+		  ;; 		   (conc " AND "fromdb table".last_update > "todb table".last_update);")
+		  ;; 		   ");")))
+		  (stmt1    (conc "SELECT MAX(last_update) FROM "table";")) ;; use the highest last_update as your time reference
+		  (stmt2    (conc "SELECT no-id-fields-str FROM "table" WHERE last_update>?;"))
+		  (stmt3    (conc "UPDATE "table" SET ("no-id-fields-str") = ("questionmarks") WHERE id=?;"))
+		  (start-ms (current-milliseconds)))
+	     (debug:print 0 *default-log-port* "stmt3="stmt3)
+	     (if (sqlite3:auto-committing? dbh1)
+		 (begin
+		   (sqlite3:with-transaction
+		    dbh1
+		    (lambda ()
+		      (sqlite3:execute dbh1 stmt1)    ;; get all new rows
+
+		      #;(if (member "last_update" fields)
+			  (sqlite3:execute dbh1 stmt8))    ;; get all updated rows
+		      ;; (sqlite3:execute dbh stmt5)
+		      ;; (sqlite3:execute dbh stmt4) ;; if it worked this would be better for incremental up
+		      ;; (sqlite3:execute dbh stmt6)
+		      ))
+		   (debug:print 0 *default-log-port* "Synced table "table
+				" in "(- (current-milliseconds) start-ms)"ms"))
+		 (debug:print 0 *default-log-port* "Skipping sync of table "table" due to transaction in flight."))))
+	 table-names)
+	(sqlite3:execute dbh1 "DETACH auxdb;"))))
+
+
+
+
+;;======================================================================
+;; Moved from dbfile
+;;======================================================================
+
+;; wait up to aprox n seconds for a journal to go away
+;;
+(define (tasks:wait-on-journal path n #!key (remove #f)(waiting-msg #f))
+  (if (not (string? path))
+      (debug:print-error 0 *default-log-port* "Called tasks:wait-on-journal with path=" path " (not a string)")
+      (let ((fullpath (conc path "-journal")))
+	(handle-exceptions
+	 exn
+	 (begin
+	   (print-call-chain (current-error-port))
+	   (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
+	   (debug:print 5 *default-log-port* " exn=" (condition->list exn))
+	   (debug:print 0 *default-log-port* "tasks:wait-on-journal failed. Continuing on, you can ignore this call-chain")
+	   #t) ;; if stuff goes wrong just allow it to move on
+	 (let loop ((journal-exists (file-exists? fullpath))
+		    (count          n)) ;; wait ten times ...
+	   (if journal-exists
+	       (begin
+		 (if (and waiting-msg
+			  (eq? (modulo n 30) 0))
+		     (debug:print 0 *default-log-port* waiting-msg))
+		 (if (> count 0)
+		     (begin
+		       (thread-sleep! 1)
+		       (loop (file-exists? fullpath)
+			     (- count 1)))
+		     (begin
+		       (debug:print 0 *default-log-port* "ERROR: removing the journal file " fullpath ", this is not good. Look for disk full, write access and other issues.")
+		       (if remove (system (conc "rm -rf " fullpath)))
+		       #f)))
+	       #t))))))
+
+
+;;======================================================================
+;; M E T A   G E T   A N D   S E T   V A R S
+;;======================================================================
+
+;; returns number if string->number is successful, string otherwise
+;; also updates *global-delta*
+;;
+(define (db:get-var dbstruct var)
+  (let* ((res      #f))
+    (db:with-db
+     dbstruct #f #f  ;; for the moment vars are only stored in main.db
+     (lambda (dbdat db)
+       (sqlite3:for-each-row
+        (lambda (val)
+          (set! res val))
+        db
+        "SELECT val FROM metadat WHERE var=?;" var)
+       ;; convert to number if can
+       (if (string? res)
+           (let ((valnum (string->number res)))
+             (if valnum (set! res valnum))))
+       res))))
+
+(define (db:inc-var dbstruct var)
+  (db:with-db dbstruct #f #t 
+	      (lambda (dbdat db)
+		(sqlite3:execute db "UPDATE metadat SET val=val+1 WHERE var=?;" var))))
+
+(define (db:dec-var dbstruct var)
+  (db:with-db dbstruct #f #t 
+	      (lambda (dbdat db)
+		(sqlite3:execute db "UPDATE metadat SET val=val-1 WHERE var=?;" var))))
+
+;; This was part of db:get-var. It was used to estimate the load on
+;; the database files.
+;;
+;; scale by 10, average with current value.
+;;     (set! *global-delta* (/ (+ *global-delta* (* (- (current-milliseconds) start-ms)
+;; 						 (if throttle throttle 0.01)))
+;; 			    2))
+;;     (if (> (abs (- *last-global-delta-printed* *global-delta*)) 0.08) ;; don't print all the time, only if it changes a bit
+;; 	(begin
+;; 	  (debug:print-info 4 *default-log-port* "launch throttle factor=" *global-delta*)
+;; 	  (set! *last-global-delta-printed* *global-delta*)))
+
+(define (db:set-var dbstruct var val)
+  (db:with-db dbstruct #f #t 
+	      (lambda (dbdat db)
+		(sqlite3:execute  (db:get-cache-stmth dbdat db "INSERT OR REPLACE INTO metadat (var,val) VALUES (?,?);")
+				  var val))))
+
+(define (db:add-var dbstruct var val)
+  (db:with-db dbstruct #f #t 
+	      (lambda (dbdat db)
+		(sqlite3:execute  (db:get-cache-stmth dbdat db "UPDATE metadat SET val=val+? WHERE var=?;") val var))))
+
+(define (db:del-var dbstruct var)
+  (db:with-db dbstruct #f #t 
+	      (lambda (dbdat db)
+		(sqlite3:execute  (db:get-cache-stmth dbdat db "DELETE FROM metadat WHERE var=?;") var))))
+
+(define (db:get-toplevels-and-incompletes dbstruct run-id running-deadtime remotehoststart-deadtime)
+  (let* ((toplevels   '())
+	 (oldlaunched '())
+	 (incompleted '()))
+    (db:with-db 
+     dbstruct run-id #t ;; not a write but problemtic
+     (lambda (dbdat db)
+       (let* ((stmth1 (db:get-cache-stmth
+		       dbdat db
+		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
+                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
+                                          AND state IN ('RUNNING');"))
+	      (stmth2 (db:get-cache-stmth
+		       dbdat db
+		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
+                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
+                                          AND state IN ('REMOTEHOSTSTART');"))
+	      (stmth3 (db:get-cache-stmth
+		       dbdat db
+		       "SELECT id,rundir,uname,testname,item_path FROM tests
+                           WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400
+                                          AND state IN ('LAUNCHED');")))
+	 ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
+	 ;;
+	 ;; HOWEVER: this code in run:test seems to work fine
+	 ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
+	 ;;                     (db:test-get-run_duration testdat)))
+	 ;;                    600) 
+	 (sqlite3:for-each-row 
+	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
+	    (if (and (equal? uname "n/a")
+		     (equal? item-path "")) ;; this is a toplevel test
+		;; what to do with toplevel? call rollup?
+		(begin
+		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+		(begin
+		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
+		  (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
+				    test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
+				    " event-time="event-time" run-duration="run-duration))))
+	  stmth1
+	  run-id running-deadtime) ;; default time 720 seconds
+	 
+	 (sqlite3:for-each-row 
+	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
+	    (if (and (equal? uname "n/a")
+		     (equal? item-path "")) ;; this is a toplevel test
+		;; what to do with toplevel? call rollup?
+		(begin
+		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+		(begin
+		  (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
+				    " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
+				    " run-duration="run-duration)
+		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
+	  stmth2
+	  run-id remotehoststart-deadtime) ;; default time 230 seconds
+	 
+	 ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
+	 (sqlite3:for-each-row
+	  (lambda (test-id run-dir uname testname item-path)
+	    (if (and (equal? uname "n/a")
+		     (equal? item-path "")) ;; this is a toplevel test
+		;; what to do with toplevel? call rollup?
+		(set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+		(begin
+		  (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id
+				    " 1 day since event_time marked")
+                  (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))))
+	  stmth3
+	  run-id))))
+    (list incompleted oldlaunched toplevels)))
+
+;;======================================================================
+;; db to db sync
+;;======================================================================
+
+(define (dbmod:db-to-db-sync src-db dest-db last-update init-proc keys)
+  (if (and (file-exists? src-db) ;; can't proceed without a source
+	   (file-read-access? src-db))
+      (let* ((have-dest     (file-exists? dest-db))
+	     (dest-file-wr  (and have-dest
+				 (file-write-access? dest-db))) ;; exists and writable
+	     (dest-dir      (or (pathname-directory dest-db)
+				"."))
+	     (dest-dir-wr   (and (file-exists? dest-dir)
+				 (file-write-access? dest-dir)))
+	     (d-wr          (or (and have-dest
+				     dest-file-wr)
+				dest-dir-wr))
+	     (copied        (if (and (not have-dest)
+				     dest-dir-wr)
+				(begin
+				  (file-copy src-db dest-db)
+				  #t)
+				#f)))
+	(if copied
+	    (begin
+	      (debug:print-info 0 *default-log-port* "db-to-db-sync done with file-copy")
+	      #t)
+	    (let* ((tables (db:sync-all-tables-list keys))
+		   (sdb    (dbmod:safely-open-db src-db init-proc #t))
+		   (ddb    (dbmod:safely-open-db dest-db init-proc d-wr))
+		   (res    (dbmod:sync-gasket tables last-update sdb ddb dest-db 'todisk keys)))
+	      (sqlite3:finalize! sdb)
+	      (sqlite3:finalize! ddb)
+	      res)))
+      #f))
 )

Index: dcommon.scm
==================================================================
--- dcommon.scm
+++ dcommon.scm
@@ -16,22 +16,27 @@
 ;;     You should have received a copy of the GNU General Public License
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 ;;
 ;;======================================================================
 
+(declare (unit dcommon))
+
+(declare (uses gutils))
+(declare (uses db))
+(declare (uses commonmod))
+(declare (uses rmtmod))
+
 (use format)
 (require-library iup)
 (import (prefix iup iup:))
 (use canvas-draw)
 (import canvas-draw-iup)
 (use regex typed-records matchable)
 
-(declare (unit dcommon))
-
-(declare (uses gutils))
-(declare (uses db))
-;; (declare (uses synchash))
+(import commonmod
+	rmtmod
+	debugprint)
 
 (include "megatest-version.scm")
 (include "common_records.scm")
 (include "db_records.scm")
 (include "key_records.scm")
@@ -635,11 +640,12 @@
 			       (common:max (map cadr col-indices))))
 	     (max-visible  (max (- (dboard:tabdat-num-tests tabdat) 15) 3))
 	     (max-col-vis  (if (> max-col 10) 10 max-col))
 	     (numrows      1)
 	     (numcols      1))
-	(iup:attribute-set! stats-matrix "CLEARVALUE" "CONTENTS")
+	(if (common:low-noise-print 60 "runs-stats-update-clear")
+	    (iup:attribute-set! stats-matrix "CLEARVALUE" "CONTENTS"))
 	(iup:attribute-set! stats-matrix "NUMCOL" max-col )
 	(iup:attribute-set! stats-matrix "NUMLIN" (if (< max-row max-visible) max-visible max-row)) ;; min of 20
 	(iup:attribute-set! stats-matrix "NUMCOL_VISIBLE" max-col-vis)
 	(iup:attribute-set! stats-matrix "NUMLIN_VISIBLE" (if (> max-row max-visible) max-visible max-row))
 	;;(print "row-indices: " row-indices " col-indices: " col-indices)
@@ -704,11 +710,13 @@
 				     #:numlin-visible 5
 				     ))
 	 (colnames       (list "Id" "MTver" "Pid" "Host" "Interface:OutPort" "RunTime" "State" "RunId"))
 	 (updater        (lambda ()
 			   (if (dashboard:monitor-changed? commondat tabdat)
-			       (let ((servers  (server:choose-server *toppath* 'all-valid))) ;; (server:get-list *toppath* limit: 10)))
+			       (let ((servers  (case (rmt:transport-mode)
+						 ((http)(server:choose-server *toppath* 'all-valid)) ;; (server:get-list *toppath* limit: 10)))
+						 (else '()))))
 				 (iup:attribute-set! servers-matrix "NUMLIN" (length servers))
 				 ;; (set! colnum 0)
 				 ;; (for-each (lambda (colname)
 				 ;;    	 ;; (print "colnum: " colnum " colname: " colname)
 				 ;;    	 (iup:attribute-set! servers-matrix (conc "0:" colnum) colname)
@@ -1418,36 +1426,10 @@
 	(begin
 	  (set! *last-monitor-update-time* run-update-time) ;; monitor-modtime)
 	  #t)
 	#f)))
 
-;; DOES NOT WORK RELIABLY WITH /tmp WAL mode files. Timestamps only change when the db
-;; is closed (I think). If db dir starts with /tmp always return true
-;;
-(define (dashboard:database-changed? commondat tabdat #!key (context-key 'default))
-  (let* ((run-update-time (current-seconds))
-	 (dbdir           (dboard:tabdat-dbdir tabdat))
-	 (modtime         (dashboard:get-youngest-run-db-mod-time dbdir))
-	 (recalc          (dashboard:recalc modtime 
-					    (dboard:commondat-please-update commondat) 
-					    (dboard:get-last-db-update tabdat context-key))))
-    ;; (dboard:tabdat-last-db-update tabdat))))
-    (if recalc 
-	(dboard:set-last-db-update! tabdat context-key run-update-time))
-    (dboard:commondat-please-update-set! commondat #f)
-    recalc))
-
-(define (dashboard:get-youngest-run-db-mod-time dbdir)
-  (handle-exceptions
-   exn
-   (begin
-     (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn)
-		  " db-dir="dbdir ", exn=" exn)
-     (current-seconds)) ;; something went wrong - just print an error and return current-seconds
-   (common:max (map (lambda (filen)
-		      (file-modification-time filen))
-		    (glob (conc dbdir "/*.db*"))))))
 
 (define (dboard:get-last-db-update tabdat context)
   (hash-table-ref/default (dboard:tabdat-last-db-update tabdat) context 0))
 
 (define (dboard:set-last-db-update! tabdat context newtime)

Index: debugprint.scm
==================================================================
--- debugprint.scm
+++ debugprint.scm
@@ -3,41 +3,49 @@
 (declare (uses mtargs))
 
 (module debugprint
 	*
 	
-;;(import scheme chicken data-structures extras files ports)
+(import scheme)
+(cond-expand
+ (chicken-4
   (import
     scheme
     chicken
     data-structures
     posix
     ports
     extras
-    
-    ;; scheme
-    ;; chicken.base
-    ;; chicken.string
-    ;; chicken.time
-    ;; chicken.time.posix
-    ;; chicken.port
-    ;; chicken.process-context
-    ;; chicken.process-context.posix
-    
     (prefix mtargs args:)
     srfi-1
     ;; system-information
-    )
+    ))
+ (chicken-5
+  (import
+    scheme
+    chicken.base
+    chicken.string
+    chicken.time
+    chicken.time.posix
+    chicken.port
+    chicken.process-context
+    chicken.process-context.posix
+
+    srfi-1
+    (prefix mtargs args:))
+
+  (define setenv set-environment-variable!)
+  ))
   
 ;;======================================================================
 ;; debug stuff
 ;;======================================================================
 
 (define verbosity (make-parameter '()))
 (define *default-log-port*  (current-error-port))
 (define debug:print-logger (make-parameter #f)) ;; set to a proc to call on every logging print
-	 
+
 (define (debug:setup)
   (let ((debugstr (or (args:get-arg "-debug")
       		      (args:get-arg "-debug-noprop")
       		      (get-environment-variable "MT_DEBUG_MODE"))))
     (verbosity (debug:calc-verbosity debugstr 'q))
@@ -45,11 +53,11 @@
     ;; if we were handed a bad verbosity rule then we will override it with 1 and continue
     (if (not (verbosity))(verbosity 1))
     (if (and (not (args:get-arg "-debug-noprop"))
       	     (or (args:get-arg "-debug")
       		 (not (get-environment-variable "MT_DEBUG_MODE"))))
-      	(setenv #;set-environment-variable! "MT_DEBUG_MODE" (if (list? (verbosity))
+      	(setenv "MT_DEBUG_MODE" (if (list? (verbosity))
       				    (string-intersperse (map conc (verbosity)) ",")
       				    (conc (verbosity)))))))
 
 ;; check verbosity, #t is ok
 (define (debug:check-verbosity verbosity vstr)
@@ -114,15 +122,15 @@
      ((and (number? vb)
 	   (list? n))
       (member vb n))
      (else #f))))
 
-(define (debug:handle-remote-logging params)
-  (if (debug:print-logger) ;; NOTE: turn params into string to keep things simple for now
-      ((debug:print-logger)(conc "REMOTE ("(get-host-name)", pid="(current-process-id)") "
-				 (string-intersperse (map conc params) " ") "; "
-				 (string-intersperse (command-line-arguments) " ")))))
+;; (define (debug:handle-remote-logging params)
+;;   (if (debug:print-logger) ;; NOTE: turn params into string to keep things simple for now
+;;       ((debug:print-logger)(conc "REMOTE ("(get-host-name)", pid="(current-process-id)") "
+;; 				 (string-intersperse (map conc params) " ") "; "
+;; 				 (string-intersperse (command-line-arguments) " ")))))
 
 (define debug:enable-timestamp (make-parameter #t))
 
 (define (debug:timestamp)
   (if (debug:enable-timestamp)

Index: diff-report.scm
==================================================================
--- diff-report.scm
+++ diff-report.scm
@@ -16,11 +16,17 @@
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 ;;
 
 (declare (unit diff-report))
 (declare (uses common))
+(declare (uses debugprint))
 (declare (uses rmt))
+(declare (uses rmtmod))
+(declare (uses commonmod))
+(import commonmod
+	rmtmod
+	debugprint)
          
 (include "common_records.scm")
 (use matchable)
 (use fmt)
 (use ducttape-lib)

ADDED   docs/reference/coping-with-the-tcp-time-wait-state-on-busy-linux-servers.pdf
Index: docs/reference/coping-with-the-tcp-time-wait-state-on-busy-linux-servers.pdf
==================================================================
--- /dev/null
+++ docs/reference/coping-with-the-tcp-time-wait-state-on-busy-linux-servers.pdf
cannot compute difference between binary files

ADDED   docs/reference/networking-increasing-the-max-number-of-tcp-ip-connections.pdf
Index: docs/reference/networking-increasing-the-max-number-of-tcp-ip-connections.pdf
==================================================================
--- /dev/null
+++ docs/reference/networking-increasing-the-max-number-of-tcp-ip-connections.pdf
cannot compute difference between binary files

ADDED   docs/reference/queues-dont-fix-overload.pdf
Index: docs/reference/queues-dont-fix-overload.pdf
==================================================================
--- /dev/null
+++ docs/reference/queues-dont-fix-overload.pdf
cannot compute difference between binary files

Index: env.scm
==================================================================
--- env.scm
+++ env.scm
@@ -17,10 +17,16 @@
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;======================================================================
 
 (declare (unit env))
+
+(declare (uses debugprint))
+(declare (uses mtargs))
+
+(import (prefix mtargs args:)
+	debugprint)
 
 (use sql-de-lite) ;; srfi-1 posix regex regex-case srfi-69 srfi-18 call-with-environment-variables)
 
 (define (env:open-db fname)
   (let* ((db-exists (common:file-exists? fname))

Index: ezsteps.scm
==================================================================
--- ezsteps.scm
+++ ezsteps.scm
@@ -17,20 +17,27 @@
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 ;;
 
 ;;  strftime('%m/%d/%Y %H:%M:%S','now','localtime')
 
-(use srfi-1 posix regex srfi-69 directory-utils call-with-environment-variables posix-extras
-     z3 csv typed-records pathname-expand matchable)
-
 (declare (unit ezsteps))
 (declare (uses db))
 (declare (uses common))
+(declare (uses debugprint))
 (declare (uses items))
 (declare (uses runconfig))
-;; (declare (uses sdb))
-;; (declare (uses filedb))
+(declare (uses commonmod))
+(declare (uses rmtmod))
+(declare (uses mtargs))
+
+(use srfi-1 posix regex srfi-69 directory-utils call-with-environment-variables posix-extras
+     z3 csv typed-records pathname-expand matchable)
+
+(import commonmod
+	debugprint
+	rmtmod
+	(prefix mtargs args:))
 
 (include "common_records.scm")
 (include "key_records.scm")
 (include "db_records.scm")
 (include "run_records.scm")
@@ -38,11 +45,11 @@
 
 ;;(rmt:get-test-info-by-id run-id test-id) -> testdat
 
 ;; TODO: deprecate me in favor of ezsteps.scm
 ;;
-(define (launch:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat)
+(define (ezsteps:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat)
   (let* ((stepname       (car ezstep))  ;; do stuff to run the step
 	 (stepinfo       (cadr ezstep))
 	;; (let ((info (cadr ezstep)))
 	;; 		   (if (proc? info) "" info)))
 	;; (stepproc       (let ((info (cadr ezstep)))
@@ -63,11 +70,12 @@
 	 (script         "") ; "#!/bin/bash\n") ;; yep, we depend on bin/bash FIXME!!!\
 	 (logpro-file    (conc stepname ".logpro"))
 	 (html-file      (conc stepname ".html"))
 	 (dat-file       (conc stepname ".dat"))
 	 (tconfig-logpro (configf:lookup testconfig "logpro" stepname))
-	 (logpro-used    (common:file-exists? logpro-file)))
+	 (logpro-used    (common:file-exists? logpro-file))
+	 (mtexepath      (common:get-megatest-exe-path)))
     (setenv "MT_STEP_NAME" stepname)
     (hash-table-set! all-steps-dat stepname `((params . ,paramparts)))
     (debug:print 0 *default-log-port* "stepparts: " stepparts ", stepparams: " stepparams
                  ", paramparts: " paramparts ", subrun: " subrun ", stepcmd: " stepcmd)
     
@@ -96,11 +104,11 @@
     
     (debug:print 4 *default-log-port* "script: " script)
     (rmt:teststep-set-status! run-id test-id stepname "start" "-" #f #f)
     ;; now launch the actual process
     (call-with-environment-variables 
-     (list (cons "PATH" (conc (get-environment-variable "PATH") ":.")))
+     (list (cons "PATH" mtexepath))
      (lambda () ;; (process-run "/bin/bash" "-c" "exec ls -l /tmp/foobar > /tmp/delme-more.log 2>&1")
        (let* ((cmd (conc stepcmd " > " stepname ".log 2>&1")) ;; >outfile 2>&1 
 	      (pid #f))
 	 (let ((proc (lambda ()
 		       (set! pid (process-run "/bin/bash" (list "-c" cmd))))))

Index: genexample.scm
==================================================================
--- genexample.scm
+++ genexample.scm
@@ -17,11 +17,18 @@
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;======================================================================
 
 (declare (unit genexample))
+(declare (uses mtargs))
+(declare (uses debugprint))
+(declare (uses rmtmod))
+
 (use posix regex matchable)
+(import (prefix mtargs args:)
+	rmtmod
+	debugprint)
 
 (include "db_records.scm")
 
 (define genexample:example-logpro
 #<<EOF

DELETED http-transport.scm
Index: http-transport.scm
==================================================================
--- http-transport.scm
+++ /dev/null
@@ -1,785 +0,0 @@
-
-;; Copyright 2006-2012, Matthew Welland.
-;; 
-;; This file is part of Megatest.
-;; 
-;;     Megatest is free software: you can redistribute it and/or modify
-;;     it under the terms of the GNU General Public License as published by
-;;     the Free Software Foundation, either version 3 of the License, or
-;;     (at your option) any later version.
-;; 
-;;     Megatest is distributed in the hope that it will be useful,
-;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
-;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;;     GNU General Public License for more details.
-;; 
-;;     You should have received a copy of the GNU General Public License
-;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
-
-(require-extension (srfi 18) extras tcp s11n)
-
-
-(use  srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest posix-extras)
-
-(use spiffy uri-common intarweb http-client spiffy-request-vars intarweb spiffy-directory-listing)
-
-;; Configurations for server
-(tcp-buffer-size 2048)
-(max-connections 2048) 
-
-(declare (unit http-transport))
-
-(declare (uses common))
-(declare (uses db))
-(declare (uses tests))
-(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running.
-(declare (uses server))
-;; (declare (uses daemon))
-(declare (uses portlogger))
-(declare (uses rmt))
-(declare (uses dbfile))
-(declare (uses commonmod))
-
-(include "common_records.scm")
-(include "db_records.scm")
-(include "js-path.scm")
-
-(import dbfile commonmod)
-
-(require-library stml)
-(define (http-transport:make-server-url hostport)
-  (if (not hostport)
-      #f
-      (conc "http://" (car hostport) ":" (cadr hostport))))
-
-(define *server-loop-heart-beat* (current-seconds))
-
-;;======================================================================
-;; S E R V E R
-;; ======================================================================
-
-;; Call this to start the actual server
-;;
-
-(define *db:process-queue-mutex* (make-mutex))
-
-(define (http-transport:run hostn)
-  ;; Configurations for server
-  (tcp-buffer-size 2048)
-  (max-connections 2048) 
-  (debug:print 2 *default-log-port* "Attempting to start the server ...")
-  (let* ((db              #f) ;;        (open-db)) ;; we don't want the server to be opening and closing the db unnecesarily
-	 (hostname        (get-host-name))
-	 (ipaddrstr       (let ((ipstr (if (string=? "-" hostn)
-					   ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
-					   (server:get-best-guess-address hostname)
-					   #f)))
-			    (if ipstr ipstr hostn))) ;; hostname))) 
-	 (start-port      (portlogger:open-run-close portlogger:find-port))
-	 (link-tree-path  (common:get-linktree))
-	 (tmp-area        (common:get-db-tmp-area))
-	 (start-file      (conc tmp-area "/.server-start")))
-    (debug:print-info 0 *default-log-port* "portlogger recommended port: " start-port)
-    ;; set some parameters for the server
-    (root-path     (if link-tree-path 
-		       link-tree-path
-		       (current-directory))) ;; WARNING: SECURITY HOLE. FIX ASAP!
-    (handle-directory spiffy-directory-listing)
-    (handle-exception (lambda (exn chain)
-			(signal (make-composite-condition
-				 (make-property-condition 
-				  'server
-				  'message "server error")))))
-
-    ;; http-transport:handle-directory) ;; simple-directory-handler)
-    ;; Setup the web server and a /ctrl interface
-    ;;
-    (vhost-map `(((* any) . ,(lambda (continue)
-			       ;; open the db on the first call 
-				 ;; This is were we set up the database connections
-			       (let* (($   (request-vars source: 'both))
-				      (dat ($ 'dat))
-				      (res #f))
-				 (cond
-				  ((equal? (uri-path (request-uri (current-request)))
-					   '(/ "api"))
-				   (send-response body:    (api:process-request *dbstruct-dbs* $) ;; the $ is the request vars proc
-						  headers: '((content-type text/plain)))
-				   (mutex-lock! *heartbeat-mutex*)
-				   (set! *db-last-access* (current-seconds))
-				   (mutex-unlock! *heartbeat-mutex*))
-				  ((equal? (uri-path (request-uri (current-request))) 
-					   '(/ ""))
-				   (send-response body: (http-transport:main-page)))
-				  ((equal? (uri-path (request-uri (current-request))) 
-					   '(/ "json_api"))
-				   (send-response body: (http-transport:main-page)))
-				  ((equal? (uri-path (request-uri (current-request))) 
-					   '(/ "runs"))
-				   (send-response body: (http-transport:main-page)))
-				  ((equal? (uri-path (request-uri (current-request))) 
-					   '(/ any))
-				   (send-response body: "hey there!\n"
-						  headers: '((content-type text/plain))))
-				  ((equal? (uri-path (request-uri (current-request))) 
-					   '(/ "hey"))
-				   (send-response body: "hey there!\n" 
-						  headers: '((content-type text/plain))))
-                                  ((equal? (uri-path (request-uri (current-request))) 
-					   '(/ "jquery3.1.0.js"))
-				   (send-response body: (http-transport:show-jquery) 
-						  headers: '((content-type application/javascript))))
-                                  ((equal? (uri-path (request-uri (current-request))) 
-					   '(/ "test_log"))
-				   (send-response body: (http-transport:html-test-log $) 
-						  headers: '((content-type text/HTML))))    
-                                  ((equal? (uri-path (request-uri (current-request))) 
-					   '(/ "dashboard"))
-				   (send-response body: (http-transport:html-dboard $) 
-						  headers: '((content-type text/HTML)))) 
-				  (else (continue))))))))
-    (handle-exceptions
-	exn
-      (debug:print 0 *default-log-port* "Failed to create file " start-file ", exn=" exn)
-      (with-output-to-file start-file (lambda ()(print (current-process-id)))))
-    (http-transport:try-start-server ipaddrstr start-port)))
-
-;; This is recursively run by http-transport:run until sucessful
-;;
-(define (http-transport:try-start-server ipaddrstr portnum)
-  (let ((config-hostname (configf:lookup *configdat* "server" "hostname"))
-	(config-use-proxy (equal? (configf:lookup *configdat* "client" "use-http_proxy") "yes")))
-    (if (not config-use-proxy)
-	(determine-proxy (constantly #f)))
-    (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " ipaddrsstr=" ipaddrstr " portnum=" portnum " config-hostname=" config-hostname)
-    (handle-exceptions
-	exn
-	(begin
-	  (print-error-message exn)
-	  (if (< portnum 64000)
-	      (begin 
-		(debug:print 0 *default-log-port* "WARNING: attempt to start server failed. Trying again ...")
-		(debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
-		(debug:print 5 *default-log-port* "exn=" (condition->list exn))
-		(portlogger:open-run-close portlogger:set-failed portnum)
-		(debug:print 0 *default-log-port* "WARNING: failed to start on portnum: " portnum ", trying next port")
-		(thread-sleep! 0.1)
-		
-		;; get_next_port goes here
-		(http-transport:try-start-server ipaddrstr
-						 (portlogger:open-run-close portlogger:find-port)))
-	      (begin
-		(debug:print 0 *default-log-port* "ERROR: Tried and tried but could not start the server"))))
-      ;; any error in following steps will result in a retry
-      (set! *server-info* (list ipaddrstr portnum))
-      (debug:print 0 *default-log-port* "INFO: Trying to start server on " ipaddrstr ":" portnum)
-      ;; This starts the spiffy server
-      ;; NEED WAY TO SET IP TO #f TO BIND ALL
-      ;; (start-server bind-address: ipaddrstr port: portnum)
-      (if config-hostname ;; this is a hint to bind directly
-	  (start-server port: portnum) ;; bind-address: (if (equal? config-hostname "-")
-					;;		ipaddrstr
-					;;		config-hostname))
-	  (start-server port: portnum))
-      (portlogger:open-run-close portlogger:set-port portnum "released")
-      (debug:print 1 *default-log-port* "INFO: server has been stopped"))))
-
-;;======================================================================
-;; S E R V E R   U T I L I T I E S 
-;;======================================================================
-
-;;======================================================================
-;; C L I E N T S
-;;======================================================================
-
-(define *http-mutex* (make-mutex))
-
-;; NOTE: Large block of code from 32436b426188080f72fceb6894af541fbad9921e removed here
-;;       I'm pretty sure it is defunct.
-
-;; This next block all imported en-mass from the api branch
-(define *http-requests-in-progress* 0)
-(define *http-connections-next-cleanup* (current-seconds))
-
-(define (http-transport:get-time-to-cleanup)
-  (let ((res #f))
-    (mutex-lock! *http-mutex*)
-    (set! res (> (current-seconds) *http-connections-next-cleanup*))
-    (mutex-unlock! *http-mutex*)
-    res))
-
-(define (http-transport:inc-requests-count)
-  (mutex-lock! *http-mutex*)
-  (set! *http-requests-in-progress* (+ 1 *http-requests-in-progress*))
-  ;; Use this opportunity to slow things down iff there are too many requests in flight
-  (if (> *http-requests-in-progress* 5)
-      (begin
-	(debug:print-info 0 *default-log-port* "Whoa there buddy, ease up...")
-	(thread-sleep! 1)))
-  (mutex-unlock! *http-mutex*))
-
-(define (http-transport:dec-requests-count proc) 
-  (mutex-lock! *http-mutex*)
-  (proc)
-  (set! *http-requests-in-progress* (- *http-requests-in-progress* 1))
-  (mutex-unlock! *http-mutex*))
-
-(define (http-transport:dec-requests-count-and-close-all-connections)
-  (set! *http-requests-in-progress* (- *http-requests-in-progress* 1))
-  (let loop ((etime (+ (current-seconds) 5))) ;; give up in five seconds
-    (if (> *http-requests-in-progress* 0)
-	(if (> etime (current-seconds))
-	    (begin
-	      (thread-sleep! 0.05)
-	      (loop etime))
-	    (debug:print-error 0 *default-log-port* "requests still in progress after 5 seconds of waiting. I'm going to pass on cleaning up http connections"))
-	(close-all-connections!)))
-  (set! *http-connections-next-cleanup* (+ (current-seconds) 10))
-  (mutex-unlock! *http-mutex*))
-
-(define (http-transport:inc-requests-and-prep-to-close-all-connections)
-  (mutex-lock! *http-mutex*)
-  (set! *http-requests-in-progress* (+ 1 *http-requests-in-progress*)))
-
-;; Send "cmd" with json payload "params" to serverdat and receive result
-;;
-(define (http-transport:client-api-send-receive run-id serverdat cmd params #!key (numretries 3)(area-dat #f))
-  (let* ((fullurl    (if (vector? serverdat)
-			 (http-transport:server-dat-get-api-req serverdat)
-			 (begin
-			   (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info")
-			   (exit 1))))
-	 (res        (vector #f "uninitialized"))
-	 (success    #t)
-	 (sparams    (db:obj->string params transport: 'http))
-	 (runremote  (or area-dat *runremote*))
-         (server-id   (if (vector? serverdat) 
-                           (http-transport:server-dat-get-server-id serverdat)
-                           (begin
-			     (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info")
-			     (exit 1)))))
-       (debug:print-info 11 *default-log-port* "cmd=" cmd " fullurl=" fullurl " server-id=" server-id " current time:" (current-seconds)) 
-
-       ;; set up the http-client here
-       (max-retry-attempts 1)
-       ;; consider all requests indempotent
-       (retry-request? (lambda (request)
-			 #f))
-       ;; send the data and get the response
-       ;; extract the needed info from the http data and 
-       ;; process and return it.
-       (let* ((send-recieve (lambda ()
-			      (mutex-lock! *http-mutex*)
-			      ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines)
-			      ;;					       ((exn http client-error) e (print e)))
-			      (set! res (vector                ;;; DON'T FORGET - THIS IS THE CLIENT SIDE! NOTE: consider moving this to client.scm since we are only supporting http transport at this time.
-					 success
-					 (db:string->obj 
-					  (handle-exceptions
-					      exn
-					      (let ((call-chain (get-call-chain))
-						    (msg        ((condition-property-accessor 'exn 'message) exn)))
-						(set! success #f)
-                                                (if (debug:debug-mode 1)
-                                                    (debug:print-info 0 *default-log-port* "couldn't talk to server, trying again ...")
-                                                    (begin
-                                                      (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".")
-                                                      (debug:print 0 *default-log-port* " message: " msg ", exn=" exn)
-                                                      (debug:print 0 *default-log-port* " cmd: " cmd " params: " params " key:" (or server-id "thekey"))
-                                                      (debug:print 0 *default-log-port* " call-chain: " call-chain)))
-						;; what if another thread is communicating ok? Can't happen due to mutex
-						(set! *runremote* #f)
-						(set! runremote #f)
-						;; (if runremote
-						;;    (remote-conndat-set! runremote #f))
-						;; Killing associated server to allow clean retry.")
-						;; (tasks:kill-server-run-id run-id)  ;; better to kill the server in the logic that called this routine?
-						(mutex-unlock! *http-mutex*)
-						;; (signal (make-composite-condition
-						;;          (make-property-condition 'commfail 'message "failed to connect to server")))
-						;; "communications failed"
-						(close-all-connections!)
-						(db:obj->string #f))
-					      (with-input-from-request ;; was dat
-					       fullurl 
-					       (list (cons 'key (or server-id   "thekey"))
-						     (cons 'cmd cmd)
-						     (cons 'params sparams))
-					       read-string))
-					  transport: 'http)
-					 0)) ;; added this speculatively
-			      ;; Shouldn't this be a call to the managed call-all-connections stuff above?
-			      ;; (close-all-connections!) ;; BUG? WHY IS THIS HERE? Are we failing to reuse connections?
-			      (mutex-unlock! *http-mutex*)
-			      ))
-	      (time-out     (lambda ()
-			      (thread-sleep! 45)
-			      (debug:print 0 *default-log-port* "WARNING: send-receive took more than 45 seconds!!")
-			      #f))
-	      (th1 (make-thread send-recieve "with-input-from-request"))
-	      (th2 (make-thread time-out     "time out")))
-	 (thread-start! th1)
-	 (thread-start! th2)
-	 (thread-join! th1)
-          (vector-set! res 0 success)
-	 (thread-terminate! th2)
-	 (if (vector? res)
-	     (if (vector-ref res 0) ;; this is the first flag or the second flag? 
-                 (let* ((res-dat (vector-ref res 1)))
-                    (if (and (string? res-dat) (string-contains res-dat "server-id mismatch"))
-                     (signal (make-composite-condition
-		          (make-property-condition 
-		       'servermismatch
-		       'message  (vector-ref res 1))))       
-		      res)) ;; this is the *inner* vector? seriously? why?
-                 (if (debug:debug-mode 11)
-                     (let ((call-chain (get-call-chain))) ;; note: this code also called in nmsg-transport - consider consolidating it
-                       (print-call-chain (current-error-port))
-                       (debug:print-error 11 *default-log-port* "error above occured at server, res=" res) ;; " message: " ((condition-property-accessor 'exn 'message) exn))
-                       (debug:print 11 *default-log-port* " server call chain:")
-                       (pp (vector-ref res 1) (current-error-port))
-                       (signal (vector-ref res 0)))
-                     res))
-	     (signal (make-composite-condition
-		      (make-property-condition 
-		       'timeout
-		       'message "nmsg-transport:client-api-send-receive-raw timed out talking to server")))))))
-
-;; careful closing of connections stored in *runremote*
-;;
-(define (http-transport:close-connections #!key (area-dat #f))
-  (let* ((runremote  (or area-dat *runremote*))
-	 (server-dat (if runremote
-                         (remote-conndat runremote)
-                         #f))) ;; (hash-table-ref/default *runremote* run-id #f)))
-    (if (vector? server-dat)
-	(let ((api-dat (http-transport:server-dat-get-api-uri server-dat)))
-	  (handle-exceptions
-	    exn
-	    (begin
-	      (print-call-chain *default-log-port*)
-	      (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))
-	    (close-connection! api-dat)
-            (close-idle-connections!)
-	    #t))
-	#f)))
-
-
-(define (make-http-transport:server-dat)(make-vector 6))
-(define (http-transport:server-dat-get-iface         vec)    (vector-ref  vec 0))
-(define (http-transport:server-dat-get-port          vec)    (vector-ref  vec 1))
-(define (http-transport:server-dat-get-api-uri       vec)    (vector-ref  vec 2))
-(define (http-transport:server-dat-get-api-url       vec)    (vector-ref  vec 3))
-(define (http-transport:server-dat-get-api-req       vec)    (vector-ref  vec 4))
-(define (http-transport:server-dat-get-last-access   vec)    (vector-ref  vec 5))
-;(define (http-transport:server-dat-get-socket        vec)    (vector-ref  vec 6))
-(define (http-transport:server-dat-get-server-id     vec)    (vector-ref  vec 6))
-
-(define (http-transport:server-dat-make-url vec)
-  (if (and (http-transport:server-dat-get-iface vec)
-	   (http-transport:server-dat-get-port  vec))
-      (conc "http://" 
-	    (http-transport:server-dat-get-iface vec)
-	    ":"
-	    (http-transport:server-dat-get-port  vec))
-      #f))
-
-(define (http-transport:server-dat-update-last-access vec)
-  (if (vector? vec)
-      (vector-set! vec 5 (current-seconds))
-      (begin
-	(print-call-chain (current-error-port))
-	(debug:print-error 0 *default-log-port* "call to http-transport:server-dat-update-last-access with non-vector!!"))))
-
-;;
-;; connect
-;;
-(define (http-transport:client-connect iface port server-id)
-  (debug:print-info 0 *default-log-port* "Connecting to client at "iface":"port", with server-id "server-id)
-  (let* ((api-url      (conc "http://" iface ":" port "/api"))
-	 (api-uri      (uri-reference (conc "http://" iface ":" port "/api")))
-	 (api-req      (make-request method: 'POST uri: api-uri))
-	 (server-dat   (vector iface port api-uri api-url api-req (current-seconds) server-id)))
-    server-dat))
-
-
-
-
-;; run http-transport:keep-running in a parallel thread to monitor that the db is being 
-;; used and to shutdown after sometime if it is not.
-;;
-(define (http-transport:keep-running) 
-  ;; if none running or if > 20 seconds since 
-  ;; server last used then start shutdown
-  ;; This thread waits for the server to come alive
-  (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server")
-  (let* ((servinfofile      #f)
-	 (sdat              #f)
-	 (no-sync-db        (db:open-no-sync-db))
-	 (tmp-area          (common:get-db-tmp-area))
-	 (started-file      (conc tmp-area "/.server-started"))
-	 (server-start-time (current-seconds))
-	 (server-info (let loop ((start-time (current-seconds))
-				 (changed    #t)
-				 (last-sdat  "not this"))
-                        (begin ;; let ((sdat #f))
-			  (thread-sleep! 0.01)
-			  (debug:print-info 0 *default-log-port* "Waiting for server alive signature")
-                          (mutex-lock! *heartbeat-mutex*)
-                          (set! sdat *server-info*)
-                          (mutex-unlock! *heartbeat-mutex*)
-                          (if (and sdat
-				   (not changed)
-				   (> (- (current-seconds) start-time) 2))
-			      (let* ((servinfodir (conc *toppath*"/.servinfo"))
-				     (ipaddr      (car sdat))
-				     (port        (cadr sdat))
-				     (servinf     (conc servinfodir"/"ipaddr":"port)))
-				(set! servinfofile servinf)
-				(if (not (file-exists? servinfodir))
-				    (create-directory servinfodir #t))
-				(with-output-to-file servinf
-				  (lambda ()
-				    (let* ((serv-id (server:mk-signature)))
-				      (set! *server-id* serv-id)
-				      (print "SERVER STARTED: "ipaddr":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id))
-				      (print "started: "(seconds->year-week/day-time (current-seconds))))))
-				(set! *on-exit-procs* (cons
-						       (lambda ()
-							 (delete-file* servinf))
-						       *on-exit-procs*))
-				;; put data about this server into a simple flat file host.port
-				(debug:print-info 0 *default-log-port* "Received server alive signature")
-                                #;(common:save-pkt `((action . alive)
-                                                   (T      . server)
-                                                   (pid    . ,(current-process-id))
-                                                   (ipaddr . ,(car sdat))
-                                                   (port   . ,(cadr sdat)))
-                                                 *configdat* #t)
-				sdat)
-                              (begin
-				(debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat)
-                                (sleep 4)
-				(if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
-				    (let* ((ipaddr  (car sdat))
-					   (port    (cadr sdat))
-					   (servinf (conc *toppath*"/.servinfo/"ipaddr":"port)))
-				      (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server")
-				      ;; (delete-file* servinf) ;; handled by on-exit, can be removed
-                                      #;(common:save-pkt `((action . died)
-                                                         (T      . server)
-                                                         (pid    . ,(current-process-id))
-                                                         (ipaddr . ,(car sdat))
-                                                         (port   . ,(cadr sdat))
-                                                         (msg    . "Transport died?"))
-						       *configdat* #t)
-				      (exit))
-				    (loop start-time
-					  (equal? sdat last-sdat)
-					  sdat)))))))
-	 (iface       (car server-info))
-         (port        (cadr server-info))
-         (last-access 0)
-	 (server-timeout (server:expiration-timeout))
-	 (server-going  #f)
-	 (server-log-file (args:get-arg "-log"))) ;; always set when we are a server
-
-    (handle-exceptions
-	exn
-      (debug:print 0 *default-log-port* "Failed to create " started-file ", exn=" exn)
-      (with-output-to-file started-file (lambda ()(print (current-process-id)))))
-
-    (let loop ((count         0)
-	       (server-state 'available)
-	       (bad-sync-count 0)
-	       (start-time     (current-milliseconds)))
-
-      ;; Use this opportunity to sync the tmp db to megatest.db
-      (if (not server-going) ;; *dbstruct-dbs* 
-	  (begin
-	    (debug:print 0 *default-log-port* "SERVER: dbprep")
-	    (set! *dbstruct-dbs*  (db:setup #t)) ;;  run-id)) FIXME!!!
-	    (set! server-going #t)
-	    (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version))) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine.
-	  (if (and no-sync-db
-		   (common:low-noise-print 10 "sync-all")) ;; cheesy way to reduce frequency of running sync :)
-              (begin
-		(if (common:low-noise-print 120 "sync-all-print")
-                    (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S")))
-		(db:all-db-sync *dbstruct-dbs*))))
-      
-      ;; when things go wrong we don't want to be doing the various queries too often
-      ;; so we strive to run this stuff only every four seconds or so.
-      (let* ((sync-time (- (current-milliseconds) start-time))
-	    (rem-time  (quotient (- 4000 sync-time) 1000)))
-	(if (and (<= rem-time 4)
-		 (>  rem-time 0))
-	    (thread-sleep! rem-time)))
-      
-      (if (< count 1) ;; 3x3 = 9 secs aprox
-	  (loop (+ count 1) 'running bad-sync-count (current-milliseconds)))
-      
-      ;; Check that iface and port have not changed (can happen if server port collides)
-      (mutex-lock! *heartbeat-mutex*)
-      (set! sdat *server-info*)
-      (mutex-unlock! *heartbeat-mutex*)
-      
-      (if (not (equal? sdat (list iface port)))
-	  (let ((new-iface (car sdat))
-		(new-port  (cadr sdat)))
-	    (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info")
-	    (set! iface new-iface)
-	    (set! port  new-port)
-             (if (not *server-id*)
-		 (set! *server-id* (server:mk-signature)))
-	    (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*)
-	    (flush-output *default-log-port*)))
-      
-      ;; Transfer *db-last-access* to last-access to use in checking that we are still alive
-      (mutex-lock! *heartbeat-mutex*)
-      (set! last-access *db-last-access*)
-      (mutex-unlock! *heartbeat-mutex*)
-      
-      (if (common:low-noise-print 120 (conc "server running on " iface ":" port))
-	  (begin
-             (if (not *server-id*)
-		 (set! *server-id* (server:mk-signature)))
-             (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv))   
-	     (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*)
-	     (flush-output *default-log-port*)))
-      (if (common:low-noise-print 60 "dbstats")
-	  (begin
-	    (debug:print 0 *default-log-port* "Server stats:")
-	    (db:print-current-query-stats)))
-      (let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600)))
-	(cond
-         ((and *server-run*
-	       (> (+ last-access server-timeout)
-		  (current-seconds)))
-          (if (common:low-noise-print 120 "server continuing")
-              (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
-	      (let ((curr-time (current-seconds)))
-		(handle-exceptions
-		    exn
-		    (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on info file " servinfofile ". Are you out of space on that disk? exn=" exn)
-		    (if (and (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter
-			     (not *server-overloaded*)
-			     (file-exists? servinfofile))
-			(change-file-times servinfofile curr-time curr-time)))
-		(if (or (common:low-noise-print 120 "start new server")
-			(> *api-process-request-count* 50)) ;; if this server is kind of busy start up another
-		    (begin
-		      (debug:print-info 0 *default-log-port* "Server is busy, parallel-api-count "*api-process-request-count*", start another if possible...")
-		      (server:kind-run *toppath*)
-		      (if (> *api-process-request-count* 100)
-			  (begin
-			    (debug:print-info 0 *default-log-port* "Server is overloaded at parallel-api-count="*api-process-request-count*", removing "servinfofile) 
-			    (delete-file* servinfofile)))))))
-          (loop 0 server-state bad-sync-count (current-milliseconds)))
-         (else
-          (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access))
-          (http-transport:server-shutdown port)))))))
-
-(define (http-transport:server-shutdown port)
-  (begin
-    ;;(BB> "http-transport:server-shutdown called")
-    (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id))
-    ;;
-    ;; start_shutdown
-    ;;
-    (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up
-    (portlogger:open-run-close portlogger:set-port port "released")
-    (thread-sleep! 1)
-
-    ;; (debug:print-info 0 *default-log-port* "Max cached queries was    " *max-cache-size*)
-    ;; (debug:print-info 0 *default-log-port* "Number of cached writes   " *number-of-writes*)
-    ;; (debug:print-info 0 *default-log-port* "Average cached write time "
-    ;; 		      (if (eq? *number-of-writes* 0)
-    ;; 			  "n/a (no writes)"
-    ;; 			  (/ *writes-total-delay*
-    ;; 			     *number-of-writes*))
-    ;; 		      " ms")
-    ;; (debug:print-info 0 *default-log-port* "Number non-cached queries "  *number-non-write-queries*)
-    ;; (debug:print-info 0 *default-log-port* "Average non-cached time   "
-    ;; 		      (if (eq? *number-non-write-queries* 0)
-    ;; 			  "n/a (no queries)"
-    ;; 			  (/ *total-non-write-delay* 
-    ;; 			     *number-non-write-queries*))
-    ;; 		      " ms")
-    
-    (db:print-current-query-stats)
-    #;(common:save-pkt `((action . exit)
-                       (T      . server)
-                       (pid    . ,(current-process-id)))
-    *configdat* #t)
-
-    ;; remove .servinfo file(s) here
-    
-    (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting")
-    (exit)))
-
-;; all routes though here end in exit ...
-;;
-;; start_server? 
-;;
-(define (http-transport:launch)
-  ;; check that a server start is in progress, pause or exit if so
-  (let* ((tmp-area            (common:get-db-tmp-area))
-	 (server-start        (conc tmp-area "/.server-start"))
-	 (server-started      (conc tmp-area "/.server-started"))
-	 (start-time          (common:lazy-modification-time server-start))
-	 (started-time        (common:lazy-modification-time server-started))
-	 (server-starting     (< start-time started-time)) ;; if start-time is less than started-time then a server is still starting
-	 (start-time-old      (> (- (current-seconds) start-time) 5))
-         (cleanup-proc        (lambda (msg)
-                                (let* ((serv-fname      (conc "server-" (current-process-id) "-" (get-host-name) ".log"))
-                                       (full-serv-fname (conc *toppath* "/logs/" serv-fname))
-                                       (new-serv-fname  (conc *toppath* "/logs/" "defunct-" serv-fname)))
-                                  (debug:print 0 *default-log-port* msg)
-                                  (if (common:file-exists? full-serv-fname)
-                                      (system (conc "sleep 1;mv -f " full-serv-fname " " new-serv-fname))
-                                      (debug:print 0 *default-log-port* "INFO: cannot move " full-serv-fname " to " new-serv-fname))
-                                  (exit)))))
-    #;(if (and (not start-time-old) ;; last server start try was less than five seconds ago
-	     (not server-starting))
-	(begin
-	  (cleanup-proc "NOT starting server, there is either a recently started server or a server in process of starting")
-	  (exit)))
-    ;; lets not even bother to start if there are already three or more server files ready to go
-    #;(let* ((num-alive   (server:get-num-alive (server:get-list *toppath*))))
-      (if (> num-alive 3)
-          (begin
-            (cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up"))
-            (exit))))
-    #;(common:save-pkt `((action . start)
-		       (T      . server)
-		       (pid    . ,(current-process-id)))
-		     *configdat* #t)
-    (let* ((th2 (make-thread (lambda ()
-                               (debug:print-info 0 *default-log-port* "Server run thread started")
-                               (http-transport:run 
-                                (if (args:get-arg "-server")
-                                    (args:get-arg "-server")
-                                    "-")
-                                )) "Server run"))
-           (th3 (make-thread (lambda ()
-                               (debug:print-info 0 *default-log-port* "Server monitor thread started")
-                               (http-transport:keep-running)
-                               "Keep running"))))
-      (thread-start! th2)
-      (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor.
-      (thread-start! th3)
-      (set! *didsomething* #t)
-      (thread-join! th2)
-      (exit))))
-
-;; (define (http-transport:server-signal-handler signum)
-;;   (signal-mask! signum)
-;;   (handle-exceptions
-;;    exn
-;;    (debug:print 0 *default-log-port* " ... exiting ...")
-;;    (let ((th1 (make-thread (lambda ()
-;; 			     (thread-sleep! 1))
-;; 			   "eat response"))
-;; 	 (th2 (make-thread (lambda ()
-;; 			     (debug:print-error 0 *default-log-port* "Received ^C, attempting clean exit. Please be patient and wait a few seconds before hitting ^C again.")
-;; 			     (thread-sleep! 3) ;; give the flush three seconds to do it's stuff
-;; 			     (debug:print 0 *default-log-port* "       Done.")
-;; 			     (exit 4))
-;; 			   "exit on ^C timer")))
-;;      (thread-start! th2)
-;;      (thread-start! th1)
-;;      (thread-join! th2))))
-
-;;===============================================
-;; Java script
-;;===============================================
-(define (http-transport:show-jquery)
-  (let* ((data  (tests:readlines *java-script-lib*)))
-(string-join data "\n")))
-
-
-
-;;======================================================================
-;; web pages
-;;======================================================================
-
-(define (http-transport:html-test-log $)
-   (let* ((run-id ($ 'runid))
-         (test-item ($ 'testname))
-         (parts (string-split test-item ":"))
-         (test-name (car parts))
-             
-         (item-name (if (equal? (length parts) 1)
-             ""
-             (cadr parts))))
-  ;(print $) 
-(tests:get-test-log run-id test-name item-name)))
-
-
-(define (http-transport:html-dboard $)
-  (let* ((page ($ 'page))
-         (oup       (open-output-string)) 
-         (bdy "--------------------------")
-
-         (ret  (tests:dynamic-dboard page)))
-    (s:output-new  oup  ret)
-   (close-output-port oup)
-
-  (set! bdy   (get-output-string oup))
-     (conc "<h1>Dashboard</h1>" bdy "<br/> <br/> "  )))
-
-(define (http-transport:main-page)
-  (let ((linkpath (root-path)))
-    (conc "<head><h1>" (pathname-strip-directory *toppath*) "</h1></head>"
-	  "<body>"
-	  "Run area: " *toppath*
-	  "<h2>Server Stats</h2>"
-	  (http-transport:stats-table) 
-	  "<hr>"
-	  (http-transport:runs linkpath)
-	  "<hr>"
-	  ;; (http-transport:run-stats)
-	  "</body>"
-	  )))
-
-(define (http-transport:stats-table)
-  (mutex-lock! *heartbeat-mutex*)
-  (let ((res 
-	 (conc "<table>"
-	       ;; "<tr><td>Max cached queries</td>        <td>" *max-cache-size* "</td></tr>"
-	       "<tr><td>Number of cached writes</td>   <td>" *number-of-writes* "</td></tr>"
-	       "<tr><td>Average cached write time</td> <td>" (if (eq? *number-of-writes* 0)
-								 "n/a (no writes)"
-								 (/ *writes-total-delay*
-								    *number-of-writes*))
-	       " ms</td></tr>"
-	       "<tr><td>Number non-cached queries</td> <td>"  *number-non-write-queries* "</td></tr>"
-	       ;; "<tr><td>Average non-cached time</td>   <td>" (if (eq? *number-non-write-queries* 0)
-	       ;; 							 "n/a (no queries)"
-	       ;; 							 (/ *total-non-write-delay* 
-	       ;; 							    *number-non-write-queries*))
-	       " ms</td></tr>"
-	       "<tr><td>Last access</td><td>"              (seconds->time-string *db-last-access*) "</td></tr>"
-	       "</table>")))
-    (mutex-unlock! *heartbeat-mutex*)
-    res))
-
-(define (http-transport:runs linkpath)
-  (conc "<h3>Runs</h3>"
-	(string-intersperse
-	 (let ((files (map pathname-strip-directory (glob (conc linkpath "/*")))))
-	   (map (lambda (p)
-		  (conc "<a href=\"" p "\">" p "</a><br>"))
-		files))
-	 " ")))
-
-#;(define (http-transport:run-stats)
-  (let ((stats (open-run-close db:get-running-stats #f)))
-    (conc "<table>"
-	  (string-intersperse
-	   (map (lambda (stat)
-		  (conc "<tr><td>" (car stat) "</td><td>" (cadr stat) "</td></tr>"))
-		stats)
-	   " ")
-	  "</table>")))

DELETED index-tree.scm
Index: index-tree.scm
==================================================================
--- index-tree.scm
+++ /dev/null
@@ -1,59 +0,0 @@
-;;======================================================================
-;; Copyright 2006-2013, Matthew Welland.
-;; 
-;; This file is part of Megatest.
-;; 
-;;     Megatest is free software: you can redistribute it and/or modify
-;;     it under the terms of the GNU General Public License as published by
-;;     the Free Software Foundation, either version 3 of the License, or
-;;     (at your option) any later version.
-;; 
-;;     Megatest is distributed in the hope that it will be useful,
-;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
-;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;;     GNU General Public License for more details.
-;; 
-;;     You should have received a copy of the GNU General Public License
-;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
-;;
-;;======================================================================
-
-;;======================================================================
-;; Tests
-;;======================================================================
-
-(use sqlite3 srfi-1 posix regex regex-case srfi-69 dot-locking tcp directory-utils)
-(import (prefix sqlite3 sqlite3:))
-
-(declare (unit tests))
-(declare (uses lock-queue))
-(declare (uses db))
-(declare (uses common))
-(declare (uses items))
-(declare (uses runconfig))
-
-(include "common_records.scm")
-(include "key_records.scm")
-(include "db_records.scm")
-(include "run_records.scm")
-(include "test_records.scm")
-
-;; Populate the links tree with index.html files
-;;
-;;   - start from most recent tests and work towards oldest -OR-
-;;     start from deepest hierarchy and work way up
-;;   - look up tests in megatest.db
-;;   - cross-reference the tests to stats.db
-;;   - if newer than event_time in stats.db or not registered in stats.db regenerate
-;;   - run du and store in stats.db
-;;   - when all tests at that level done generate next level up index.html
-;; 
-;;     include in rollup html index.html:
-;;          sum of du
-;;          counts of PASS, FAIL, RUNNING, REMOTEHOSTSTART, LAUNCHED, CHECK etc.
-;;          overall status
-;;
-;;     include in test specific index.html:
-;;          host, uname, cpu graph, disk avail graph, steps, data
-;;          meta data, state, status, du
-;;          

Index: items.scm
==================================================================
--- items.scm
+++ items.scm
@@ -21,10 +21,14 @@
 ;; 		     (temperature "cool medium hot")
 ;; 		     (season      "summer winter fall spring")))
 
 (declare (unit items))
 (declare (uses common))
+(declare (uses debugprint))
+(declare (uses commonmod))
+(import commonmod
+	debugprint)
 
 (include "common_records.scm")
 
 ;; Puts out all combinations
 (define (process-itemlist hierdepth curritemkey itemlist)

Index: keys.scm
==================================================================
--- keys.scm
+++ keys.scm
@@ -19,15 +19,22 @@
  
 ;;======================================================================
 ;; Run keys, these are used to hierarchially organise tests and run areas
 ;;======================================================================
 
-(use sqlite3 srfi-1 posix regex regex-case srfi-69)
-(import (prefix sqlite3 sqlite3:))
-
 (declare (unit keys))
 (declare (uses common))
+(declare (uses debugprint))
+(declare (uses commonmod))
+(declare (uses mtargs))
+
+(use sqlite3 srfi-1 posix regex regex-case srfi-69)
+(import (prefix sqlite3 sqlite3:)
+	(prefix mtargs args:))
+
+(import commonmod
+	debugprint)
 
 (include "key_records.scm")
 (include "common_records.scm")
 
 (define (keys->keystr keys) ;; => key1,key2,key3,additiona1, ...

Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -18,29 +18,43 @@
 
 ;;======================================================================
 ;; launch a task - this runs on the originating host, tests themselves
 ;;
 ;;======================================================================
+
+(declare (unit launch))
+(declare (uses subrun))
+(declare (uses common))
+(declare (uses debugprint))
+(declare (uses commonmod))
+(declare (uses configf))
+(declare (uses db))
+(declare (uses rmtmod))
+(declare (uses ezsteps))
+;; (declare (uses dbmod))
+(declare (uses dbfile))
+(declare (uses mtargs))
 
 (use regex regex-case base64 sqlite3 srfi-18 directory-utils posix-extras z3
      call-with-environment-variables csv)
 (use typed-records pathname-expand matchable)
 
-(import (prefix base64 base64:))
-(import (prefix sqlite3 sqlite3:))
-
-(declare (unit launch))
-(declare (uses subrun))
-(declare (uses common))
-(declare (uses configf))
-(declare (uses db))
-(declare (uses ezsteps))
+(import (prefix base64 base64:)
+	(prefix sqlite3 sqlite3:)
+	(prefix mtargs args:)
+)
 
 (include "common_records.scm")
 (include "key_records.scm")
 (include "db_records.scm")
 (include "megatest-fossil-hash.scm")
+
+(import commonmod
+	rmtmod
+	debugprint
+	;; dbmod
+	dbfile)
 
 ;;======================================================================
 ;; ezsteps
 ;;======================================================================
 
@@ -183,11 +197,11 @@
 			     (tal    (cdr ezstepslst))
 			     (prevstep #f))
                     (debug:print-info 0 *default-log-port* "Processing ezstep \"" (string-intersperse ezstep " ") "\"")
 		    ;; check exit-info (vector-ref exit-info 1)
 		    (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1)
-			(let* ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat))
+			(let* ((logpro-used (ezsteps:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat))
 			       (stepname    (car ezstep))
 			       (stepparms   (hash-table-ref all-steps-dat stepname)))
 			  (setenv "MT_STEP_NAME" stepname)
 			  (pp (hash-table->alist all-steps-dat))
 			  ;; if logpro-used read in the stepname.dat file
@@ -205,11 +219,11 @@
                         )
 
                         ))))))
 
 (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags)
-  (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30")))
+  (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "60")))
          (start-seconds (current-seconds))
 	 (calc-minutes  (lambda ()
 			  (inexact->exact 
 			   (round 
 			    (- 
@@ -239,13 +253,13 @@
                                        (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg
                                   df
                                   #f)))
              (do-sync       (or new-cpu-load new-disk-free over-time))
 
-             (test-info   (rmt:get-test-info-by-id run-id test-id))
-             (state       (db:test-get-state test-info))
-             (status      (db:test-get-status test-info))
+             (test-info   (rmt:get-test-state-status-by-id run-id test-id))
+             (state       (car test-info));; (db:test-get-state test-info))
+             (status      (cdr test-info));; (db:test-get-status test-info))
              (kill-reason  "no kill reason specified")
              (kill-job?    #f))
         ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
         (cond
          ((test-get-kill-request run-id test-id)
@@ -259,11 +273,12 @@
           (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
           ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
           (set! kill-job? #f)))
 
         (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
-        (launch:handle-zombie-tests run-id)
+        (if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty
+	    (launch:handle-zombie-tests run-id))
         (when do-sync
           ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
           ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
           ;; (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
           (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
@@ -314,11 +329,11 @@
 		      (debug:print-error 0 *default-log-port* "Nothing to kill, pid1=" pid1 ", pid2=" pid2)
 		      (tests:test-set-status! run-id test-id "KILLED"  "FAILED TO KILL" (conc (args:get-arg "-m")" "kill-reason) #f) ;; BB ADDED kill-reason -- confirm OK with Matt
 		      )))
 	      (mutex-unlock! m)
 	      ;; no point in sticking around. Exit now. But run end of run before exiting?
-        (launch:end-of-run-check run-id)
+              (launch:end-of-run-check run-id)
 	      (exit)))
 	(if (hash-table-ref/default misc-flags 'keep-going #f)
 	    (begin
 	      (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
 	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
@@ -727,10 +742,11 @@
 
 
 	      ;; for automated creation of the rollup html file this is a good place...
 	      (if (not (equal? item-path ""))
 		      (tests:summarize-items run-id test-id test-name #f))
+	      ;; BUG was this meant to be the antecnt of the if above?
 	      (tests:summarize-test run-id test-id)  ;; don't force - just update if no
               ;; Leave a .final-status file for the top level test
               (tests:save-final-status run-id test-id)
 	      (rmt:update-run-stats run-id (rmt:get-raw-run-stats run-id))) ;; end of let*
 
@@ -765,20 +781,28 @@
 ;; new
 ;; 100% COMPLETED/ (PASS,FAIL,ABORT etc.) ==> COMPLETED / X where X is same as itemized rollup
 ;; > 3 RUNNING with not test_dead do nothing (run should already be RUNNING/ na
 ;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
 ;; 0 RUNNING ==> this is actually the first condition, should not get here
-
+(define *last-rollup* 0)
 (define (launch:end-of-run-check run-id )
     (let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))  
-           (running-cnt (rmt:get-count-tests-running-for-run-id run-id))
+           (running-cnt       (rmt:get-count-tests-running-for-run-id run-id))
            (all-test-launched (rmt:get-var (conc "lunch-complete-" run-id)))
-           (current-state (rmt:get-run-state run-id))
-           (current-status (rmt:get-run-status run-id)))
-     ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing 
-     (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)                      
-     (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)
+	   (current-state-status (rmt:get-run-state-status run-id))
+           (current-state        (car current-state-status))  ;; (rmt:get-run-state run-id))
+           (current-status       (cdr current-state-status))) ;; (rmt:get-run-status run-id)))
+      ;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing 
+      (debug:print 0 *default-log-port* "Running test cnt :" running-cnt)
+      ;;
+      ;; TODO: add a final rollup when run is done (if there isn't one already)
+      ;;
+      (if (or (< running-cnt 3)                              ;; have only few running
+	      (> (- (current-seconds) *last-rollup*) 10))    ;; or haven't rolled up in past ten seconds
+	  (begin
+	    (rmt:set-state-status-and-roll-up-run  run-id current-state current-status)
+	    (set! *last-rollup* (current-seconds))))
      (runs:update-junit-test-reporter-xml run-id) 
      (cond 
        ((and all-test-launched (eq? not-completed-cnt 0) (equal? all-test-launched "yes" ))
                 (if (and (equal? (rmt:get-var (conc "end-of-run-" run-id)) "no") (common:simple-lock (conc "endOfRun" run-id)))
                 (begin
@@ -806,42 +830,10 @@
                  (item-path (vector-ref running-test 11)))
 			       	(debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed")
               (if (not (null? tal))
 		  (loop (car tal) (cdr tal)))))))))))
 
-;; replaced below with version that does not ssh if checking on localhost
-#;(define (launch:is-test-alive host pid)
-  (if (and host pid (not (equal? host "n/a")))
-      (let* ((cmd (conc "ssh " host " pstree -A " pid))
-	     (output (with-input-from-pipe cmd read-lines)))
-	(debug:print 2 *default-log-port* "Running " cmd " received " output)
-	(if (eq? (length output) 0)
-	   #f
-	   #t))
-      #t))
-
-;; this is a close duplicate of:
-;;    process:alist-on-host?
-;;    process:alive
-;;
-(define (launch:is-test-alive host pid)
-  (let* ((same-host (equal? host (get-host-name)))
-	 (cmd (conc 
-	       (if same-host "" (conc "ssh "host" "))
-	       "pstree -A "pid)))
-    (if (and host pid
-	     (not (equal? host "n/a")))
-	
-	(let* ((output (if same-host
-			   (with-input-from-pipe cmd read-lines)
-			   (common:generic-ssh cmd read-lines '())))) ;; (with-input-from-pipe cmd read-lines)))
-	  (debug:print 2 *default-log-port* "Running " cmd " received " output)
-	  (if (eq? (length output) 0)
-	      #f
-	      #t))
-	#t))) ;; assuming bad query is about a live test is likely not the right thing to do?
-
 (define (launch:kill-tests-if-dead run-id)
   (let* ((running-tests (rmt:get-tests-for-run run-id "%" `("RUNNING" "LAUNCHED" "REMOTEHOSTSTART") `() #f #f #f #f #f #f #f #f)))
        (let loop ((running-test (car running-tests))
 			     (tal    (cdr running-tests))
 			     (kill-cnt 0))
@@ -852,11 +844,11 @@
                  (pid  (rmt:test-get-top-process-pid run-id test-id))   
                  (event-time (vector-ref running-test 5))
                  (duration (vector-ref running-test 12))
                  (flag 0)   
                  (curr-time (current-seconds)))
-       (if (and (< (+ event-time duration 600) curr-time) (not (launch:is-test-alive host pid))) ;;test has not updated duration in last 10 min then likely its not running but confirm before marking it as killed
+       (if (and (< (+ event-time duration 600) curr-time) (not (commonmod:is-test-alive host pid))) ;;test has not updated duration in last 10 min then likely its not running but confirm before marking it as killed
            (begin    
 			       	(debug:print 0 *default-log-port* "test " test-name "/" item-path " needs to be killed")
               (set! flag 1) 
               (rmt:set-state-status-and-roll-up-items run-id test-name item-path "KILLREQ" "n/a" #f)))
                (if (not (null? tal))
@@ -1131,11 +1123,14 @@
 	      (setenv "MT_TESTSUITENAME" (common:get-testsuite-name)))
 	    (begin
 	      (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area.")
 	      (set! *toppath* #f) ;; force it to be false so we return #f
 	      #f))
-	
+
+	;; needed by various transport and db modules
+	(dbfile:testsuite-name (common:get-testsuite-name)) ;; (get-testsuite-name *toppath* *configdat*))
+
         ;; one more attempt to cache the configs for future reading
         (let* ((cachefiles   (launch:get-cache-file-paths areapath toppath target mtconfig))
                (mtcachef     (car cachefiles))
                (rccachef     (cdr cachefiles)))
 
@@ -1160,10 +1155,22 @@
 	;; if have -append-config then read and append here
 	(let ((cfname (args:get-arg "-append-config")))
 	  (if (and cfname
 		   (file-read-access? cfname))
 	      (read-config cfname *configdat* #t))) ;; values are added to the hash, no need to do anything special.
+	;; have config at this time, this is a good place to set params based on config file settings
+	(let*  ((dbmode   (configf:lookup *configdat* "setup" "dbcache-mode"))
+		(syncmode (configf:lookup *configdat* "setup" "sync-mode")))
+	  (if dbmode
+	      (begin
+		(debug:print-info 0 *default-log-port* "Overriding dbmode to "dbmode)
+		(dbcache-mode (string->symbol dbmode))))
+	  (if syncmode
+	      (begin
+		(debug:print-info 0 *default-log-port* "Overriding syncmode to "syncmode)
+		(dbfile:sync-method (string->symbol syncmode)))))
+	
 	*toppath*)))
 
 
 (define (get-best-disk confdat testconfig)
   (let* ((disks   (or (and testconfig (hash-table-ref/default testconfig "disks" #f))
@@ -1366,13 +1373,15 @@
 		  (not (directory-exists? toptest-path)))
 	      (begin
 		(debug:print-info 2 *default-log-port* "Creating " toptest-path " and link " lnkpath)
 		(handle-exceptions
 		    exn
-		  (begin
-		    (debug:print 0 *default-log-port* "failed to create directory " toptest-path ", exn=" exn)
-		    #f)
+		  (if (directory-exists? toptest-path) ;; it was likely created in parallel
+		      #t
+		      (begin
+			(debug:print-info 0 *default-log-port* "failed to create directory " toptest-path ", exn=" exn)
+			#f))
 		 (create-directory toptest-path #t))
 		(hash-table-set! *toptest-paths* testname toptest-path)))))
 
     ;; The toptest path has been created, the link to the test in the linktree has
     ;; been created. Now, if this is an iterated test the real test dir must be created
@@ -1380,14 +1389,16 @@
 	(begin ;; (let ((lnktarget (conc lnkpath "/" item-path)))
 	  (debug:print 2 *default-log-port* "Setting up sub test run area")
 	  (debug:print 2 *default-log-port* " - creating run area in " test-path)
 	  (handle-exceptions
 	   exn
-	   (begin
-	     (debug:print-error 0 *default-log-port* " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn)
-				", exiting, exn=" exn)
-	     (exit 1))
+	   (if (directory-exists? test-path)
+	       #t
+	       (begin
+		 (debug:print-error 0 *default-log-port* " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn)
+				    ", exiting, exn=" exn)
+		 (exit 1)))
 	   (create-directory test-path #t))
 	  (debug:print 2 *default-log-port* 
 		       " - creating link from: " test-path "\n"
 		       "                   to: " lnktarget)
 
@@ -1442,10 +1453,11 @@
 ;; 4. remotely run the test on allocated host
 ;;    - could be ssh to host from hosts table (update regularly with load)
 ;;    - could be netbatch
 ;;      (launch-test db (cadr status) test-conf))
 (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params)
+  (assert runname "FATAL: launch-test called with no runname")
   (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex
   (let* ( ;; (lock-key        (conc "test-" test-id))
 	;; (got-lock        (let loop ((lock        (rmt:no-sync-get-lock lock-key))
 	;; 			     (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds
 	;; 		    (if (car lock)
@@ -1548,11 +1560,11 @@
       (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "LAUNCHED" #f)
       ;; (pp (hash-table->alist tconfig))
       (set! diskpath (get-best-disk *configdat* tconfig))
       (debug:print 2 *default-log-port* "best disk path = " diskpath)
       (if diskpath
-	  (let ((dat  (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat)))
+	  (let ((dat (create-work-area run-id run-info keyvals test-id test-path diskpath test-name itemdat)))
 	    (set! work-area (car dat))
 	    (set! toptest-work-area (cadr dat))
 	    (debug:print-info 2 *default-log-port* "Using work area " work-area))
 	  (begin
 	    (set! work-area (conc test-path "/tmp_run"))
@@ -1567,11 +1579,11 @@
 					;; (list 'serverinf *server-info*)
 					#;(list 'homehost  (let* ((hhdat (server:get-homehost)))
 							   (if hhdat
 							       (car hhdat)
 							       #f)))
-					(list 'serverurl (if *runremote*
+					#;(list 'serverurl (if *runremote* ;; would like to add this back ... WORK NEEDED
 							     (remote-server-url *runremote*)
 							     #f)) ;;
 					(list 'areaname  (common:get-testsuite-name))
 					(list 'toppath   *toppath*)
 					(list 'work-area work-area)

DELETED lock-queue.scm
Index: lock-queue.scm
==================================================================
--- lock-queue.scm
+++ /dev/null
@@ -1,253 +0,0 @@
-;; Copyright 2006-2013, Matthew Welland.
-;; 
-;; This file is part of Megatest.
-;; 
-;;     Megatest is free software: you can redistribute it and/or modify
-;;     it under the terms of the GNU General Public License as published by
-;;     the Free Software Foundation, either version 3 of the License, or
-;;     (at your option) any later version.
-;; 
-;;     Megatest is distributed in the hope that it will be useful,
-;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
-;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;;     GNU General Public License for more details.
-;; 
-;;     You should have received a copy of the GNU General Public License
-;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
-;;
-
-(use (prefix sqlite3 sqlite3:) srfi-18)
-
-(declare (unit lock-queue))
-(declare (uses common))
-(declare (uses tasks))
-
-;;======================================================================
-;; attempt to prevent overlapping updates of rollup files by queueing
-;; update requests in an sqlite db
-;;======================================================================
-
-;;======================================================================
-;; db record, <vector db path-to-db>
-;;======================================================================
-
-(define (make-lock-queue:db-dat)(make-vector 3))
-(define-inline (lock-queue:db-dat-get-db        vec)    (vector-ref  vec 0))
-(define-inline (lock-queue:db-dat-get-path      vec)    (vector-ref  vec 1))
-(define-inline (lock-queue:db-dat-set-db!       vec val)(vector-set! vec 0 val))
-(define-inline (lock-queue:db-dat-set-path!     vec val)(vector-set! vec 1 val))
-
-(define (lock-queue:delete-lock-db dbdat)
-  (let ((fname (lock-queue:db-dat-get-path dbdat)))
-    (system (conc "rm -f " fname "*"))))
-
-(define (lock-queue:open-db fname #!key (count 10))
-  (let* ((actualfname (conc fname ".lockdb"))
-	 (dbexists (common:file-exists? actualfname))
-	 (db       (sqlite3:open-database actualfname))
-	 (handler  (make-busy-timeout 136000)))
-    (if dbexists
-	(vector db actualfname)
-	(begin
-	  (handle-exceptions
-	   exn
-	   (begin
-	     (thread-sleep! 10)
-	     (if (> count 0)
-		 (lock-queue:open-db fname count: (- count 1))
-		 (vector db actualfname)))
-	   (sqlite3:with-transaction
-	    db
-	    (lambda ()
-	      (sqlite3:execute 
-	       db
-	       "CREATE TABLE IF NOT EXISTS queue (
-     	         id         INTEGER PRIMARY KEY,
-                 test_id    INTEGER,
-                 start_time INTEGER,
-                 state      TEXT,
-                 CONSTRAINT queue_constraint UNIQUE (test_id));")
-	      (sqlite3:execute
-	       db
-	       "CREATE TABLE IF NOT EXISTS runlocks (
-                 id         INTEGER PRIMARY KEY,
-                 test_id    INTEGER,
-                 run_lock   TEXT,
-                 CONSTRAINT runlock_constraint UNIQUE (run_lock));"))))))
-    (sqlite3:set-busy-handler! db handler)
-    (vector db actualfname)))
-
-(define (lock-queue:set-state dbdat test-id newstate #!key (remtries 10))
-  (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200)
-  (handle-exceptions
-   exn
-   (if (> remtries 0)
-       (begin
-	 (debug:print 0 *default-log-port* "WARNING: exception on lock-queue:set-state. Trying again in 30 seconds.")
-	 (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
-	 (thread-sleep! 30)
-	 (lock-queue:set-state dbdat test-id newstate remtries: (- remtries 1)))
-       (begin
-	 (debug:print-error 0 *default-log-port* " Failed to set lock state for test with id " test-id ", error: " ((condition-property-accessor 'exn 'message) exn) ", giving up.")
-	 #f))
-   (sqlite3:execute (lock-queue:db-dat-get-db dbdat) "UPDATE queue SET state=? WHERE test_id=?;"
-		    newstate
-		    test-id)))
-
-(define (lock-queue:any-younger? dbdat mystart test-id #!key (remtries 10))
-  ;; no need to wait on journal on read only queries
-  ;; (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200)
-  (handle-exceptions
-   exn
-   (if (> remtries 0)
-       (begin
-	 (debug:print 0 *default-log-port* "WARNING: exception on lock-queue:any-younger. Removing lockdb and trying again in 5 seconds.")
-	 (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
-	 (thread-sleep! 5)
-         (lock-queue:delete-lock-db dbdat)
-	 (lock-queue:any-younger? dbdat mystart test-id remtries: (- remtries 1)))
-       (begin
-	 (debug:print-error 0 *default-log-port* " Failed to find younger locks for test with id " test-id ", error: " ((condition-property-accessor 'exn 'message) exn) ", giving up.")
-	 #f))
-   (let ((res #f))
-     (sqlite3:for-each-row
-      (lambda (tid)
-	;; Actually this should not be needed as mystart cannot be simultaneously less than and test-id same as 
-	(if (not (equal? tid test-id)) 
-	    (set! res tid)))
-      (lock-queue:db-dat-get-db dbdat)
-      "SELECT test_id FROM queue WHERE start_time > ?;" mystart)
-     res)))
-
-(define (lock-queue:get-lock dbdat test-id #!key (count 10)(waiting-msg #f))
-  (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 remove: #t waiting-msg: "lock-queue:get-lock, waiting on journal")
-  (let* ((res       #f)
-	 (db        (lock-queue:db-dat-get-db dbdat))
-	 (lckqry    (sqlite3:prepare db "SELECT test_id,run_lock FROM runlocks WHERE run_lock='locked';"))
-	 (mklckqry  (sqlite3:prepare db "INSERT INTO runlocks (test_id,run_lock) VALUES (?,'locked');")))
-    (let ((result 
-	   (handle-exceptions
-	    exn
-	    (begin
-	      (debug:print 0 *default-log-port* "WARNING: failed to get queue lock. Removing lock db and returning fail") ;; Will try again in a few seconds")
-	      (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
-	      (thread-sleep! 10)
-	      ;; (if (> count 0)	
-	      ;;  #f ;; (lock-queue:get-lock dbdat test-id count: (- count 1)) - give up on retries 
-	      ;; (begin ;; never recovered, remote the lock file and return #f, no lock obtained
-	      (lock-queue:delete-lock-db dbdat)
-	      #f)
-	    (sqlite3:with-transaction
-	     db
-	     (lambda ()
-	       (sqlite3:for-each-row (lambda (tid lockstate)
-				       (set! res (list tid lockstate)))
-				     lckqry)
-	       (if res
-		   (if (equal? (car res) test-id)
-		       #t ;; already have the lock
-		       #f)
-		   (begin
-		     (sqlite3:execute mklckqry test-id)
-		     ;; if no error handled then return #t for got the lock
-		     #t)))))))
-      (sqlite3:finalize! lckqry)
-      (sqlite3:finalize! mklckqry)
-      result)))
-
-(define (lock-queue:release-lock fname test-id #!key (count 10))
-  (let* ((dbdat (lock-queue:open-db fname)))
-    (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 "lock-queue:release-lock; waiting on journal")
-    (handle-exceptions
-     exn
-     (begin
-       (debug:print 0 *default-log-port* "WARNING: Failed to release queue lock. Will try again in few seconds")
-       (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
-       (thread-sleep! (/ count 10))
-       (if (> count 0)
-	   (begin
-	     (sqlite3:finalize! (lock-queue:db-dat-get-db dbdat))
-	     (lock-queue:release-lock fname test-id count: (- count 1)))
-	   (let ((journal (conc fname "-journal")))
-	     ;; If we've tried ten times and failed there is a serious problem
-	     ;; try to remove the lock db and allow it to be recreated
-	     (handle-exceptions
-	      exn
-	      #f
-	      (if (common:file-exists? journal)(delete-file journal))
-	      (if (common:file-exists? fname)  (delete-file fname))
-	      #f))))
-     (sqlite3:execute (lock-queue:db-dat-get-db dbdat) "DELETE FROM runlocks WHERE test_id=?;" test-id)
-     (sqlite3:finalize! (lock-queue:db-dat-get-db dbdat)))))
-
-(define (lock-queue:steal-lock dbdat test-id #!key (count 10))
-  (debug:print-info 0 *default-log-port* "Attempting to steal lock at " (lock-queue:db-dat-get-path dbdat))
-  (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 "lock-queue:steal-lock; waiting on journal")
-  (handle-exceptions
-   exn
-   (begin
-     (debug:print 0 *default-log-port* "WARNING: Failed to steal queue lock. Will try again in few seconds")
-     (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
-     (thread-sleep! 10)
-     (if (> count 0)
-	 (lock-queue:steal-lock dbdat test-id count: (- count 1))
-	 #f))
-   (sqlite3:execute (lock-queue:db-dat-get-db dbdat) "DELETE FROM runlocks WHERE run_lock='locked';"))
-  (lock-queue:get-lock dbdat test-it))
-
-;; returns #f if ok to skip the task
-;; returns #t if ok to proceed with task
-;; otherwise waits
-;;
-(define (lock-queue:wait-turn fname test-id #!key (count 10)(waiting-msg #f))
-  (let* ((dbdat   (lock-queue:open-db fname))
-	 (mystart (current-seconds))
-	 (db      (lock-queue:db-dat-get-db dbdat)))
-    ;; (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 waiting-msg: "lock-queue:wait-turn; waiting on journal file")
-    (handle-exceptions
-     exn
-     (begin
-       (debug:print 0 *default-log-port* "WARNING: Failed to find out if it is ok to skip the wait queue. Will try again in few seconds")
-       (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
-       (print-call-chain (current-error-port))
-       (thread-sleep! 10)
-       (if (> count 0)
-	   (begin
-	     (sqlite3:finalize! db)
-	     (lock-queue:wait-turn fname test-id count: (- count 1)))
-	   (begin
-	     (debug:print 0 *default-log-port* "Giving up calls to lock-queue:wait-turn for test-id " test-id " at path " fname ", printing call chain")
-	     (print-call-chain (current-error-port))
-	     #f)))
-     ;; wait 10 seconds and then check to see if someone is already updating the html
-     (thread-sleep! 10)
-     (if (not (lock-queue:any-younger? dbdat mystart test-id)) ;; no processing in flight, must try to start processing
-	 (begin
-	   (tasks:wait-on-journal (lock-queue:db-dat-get-path dbdat) 1200 waiting-msg: "lock-queue:wait-turn; waiting on journal file")
-	   (sqlite3:execute
-	    db
-	    "INSERT OR REPLACE INTO queue (test_id,start_time,state) VALUES (?,?,'waiting');"
-	    test-id mystart)
-	   ;; (thread-sleep! 1) ;; give other tests a chance to register
-	   (let ((result 
-		  (let loop ((younger-waiting (lock-queue:any-younger? dbdat mystart test-id)))
-		    (if younger-waiting
-			(begin
-			  ;; no need for us to wait. mark in the lock queue db as skipping
-			  ;; no point in marking anything in the queue - simply never register this
-			  ;; test as it is *covered* by a previously started update to the html file
-			  ;; (lock-queue:set-state dbdat test-id "skipping")
-			  #f) ;; let the calling process know that nothing needs to be done
-			(if (lock-queue:get-lock dbdat test-id)
-			    #t
-			    (if (> (- (current-seconds) mystart) 36000) ;; waited too long, steal the lock
-				(lock-queue:steal-lock dbdat test-id)
-				(begin
-				  (thread-sleep! 1)
-				  (loop (lock-queue:any-younger? dbdat mystart test-id)))))))))
-	     (sqlite3:finalize! db)
-	     result))))))
-	  
-            
-;; (use trace)
-;; (trace lock-queue:get-lock lock-queue:release-lock lock-queue:wait-turn lock-queue:any-younger? lock-queue:set-state)

Index: margs.scm
==================================================================
--- margs.scm
+++ margs.scm
@@ -24,13 +24,19 @@
 (define (args:get-arg arg . default)
   (if (null? default)
       (hash-table-ref/default args:arg-hash arg #f)
       (hash-table-ref/default args:arg-hash arg (car default))))
 
-(define (args:any? . args)
-  (not (null? (filter (lambda (x) x)
-		      (map args:get-arg args)))))
+;; get an arg as a number
+(define (args:get-arg-number arg . default)
+  (let* ((val-str (args:get-arg arg))
+	 (val     (if val-str (string->number val-str) #f)))
+    (if val
+	val
+	(if (null? default)
+	    #f
+	    default))))
 
 (define (args:get-arg-from ht arg . default)
   (if (null? default)
       (hash-table-ref/default ht arg #f)
       (hash-table-ref/default ht arg (car default))))
@@ -46,18 +52,22 @@
   (if (string? help)
       (print help)
       (print "Usage: " (car (argv)) " ... "))
   (exit 0))
 
- ;; one-of args defined
-(define (args:any-defined? . param)
-  (let ((res #f))
-    (for-each 
-     (lambda (arg)
-       (if (args:get-arg arg)(set! res #t)))
-     param)
-    res))
+(define (args:any-defined? . args)
+  (not (null? (filter (lambda (x) x)
+		      (map args:get-arg args)))))
+
+;;  ;; one-of args defined
+;; (define (args:any-defined? . param)
+;;   (let ((res #f))
+;;     (for-each 
+;;      (lambda (arg)
+;;        (if (args:get-arg arg)(set! res #t)))
+;;      param)
+;;     res))
 
 ;; args: 
 (define (args:get-args args params switches arg-hash num-needed)
   (let* ((numargs (length args))
 	 (adj-num-needed (if num-needed (+ num-needed 2) #f)))

Index: megatest-version.scm
==================================================================
--- megatest-version.scm
+++ megatest-version.scm
@@ -18,6 +18,6 @@
 ;; Always use two or four digit decimal
 ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..
 
 ;; (declare (unit megatest-version))
 
-(define megatest-version 1.7009)
+(define megatest-version 1.8013)

Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -22,15 +22,23 @@
 ;; fake out readline usage of toplevel-command
 (define (toplevel-command . a) #f)
 
 (declare (uses common))
 ;; (declare (uses megatest-version))
-(declare (uses margs))
+;; (declare (uses margs))
+;; (declare (uses mtargs))
+;; (declare (uses mtargs.import))
+(declare (uses mtargs))
+(declare (uses mtargs.import))
+(declare (uses debugprint))
+(declare (uses debugprint.import))
+(declare (uses commonmod))
+(declare (uses commonmod.import))
+
 (declare (uses runs))
 (declare (uses launch))
 (declare (uses server))
-(declare (uses client))
 (declare (uses tests))
 (declare (uses genexample))
 ;; (declare (uses daemon))
 
 (declare (uses db))
@@ -41,39 +49,48 @@
 (declare (uses api))
 (declare (uses tasks)) ;; only used for debugging.
 (declare (uses env))
 (declare (uses diff-report))
 (declare (uses db))
-(declare (uses dbmod))
-(declare (uses dbmod.import))
-(declare (uses commonmod))
-(declare (uses commonmod.import))
 (declare (uses dbfile))
 (declare (uses dbfile.import))
+(declare (uses dbmod))
+(declare (uses dbmod.import))
+(declare (uses portlogger))
+(declare (uses portlogger.import))
+(declare (uses tcp-transportmod))
+(declare (uses tcp-transportmod.import))
+(declare (uses rmtmod))
+(declare (uses rmtmod.import))
+
 ;; (declare (uses debugprint))
 ;; (declare (uses debugprint.import))
-;; (declare (uses mtargs))
-;; (declare (uses mtargs.import))
 
 ;; (declare (uses ftail))
 ;; (import ftail)
 
-(import dbmod
+(import (prefix mtargs args:)
+        debugprint
+	dbmod
 	commonmod
-	dbfile)
+	dbfile
+	portlogger
+	tcp-transportmod
+	rmtmod
+        )
 
 (define *db* #f) ;; this is only for the repl, do not use in general!!!!
 
 (include "common_records.scm")
 (include "key_records.scm")
 (include "db_records.scm")
 (include "run_records.scm")
 (include "megatest-fossil-hash.scm")
 
-(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (prefix base64 base64:)
-     readline apropos json http-client directory-utils typed-records
-     http-client srfi-18 extras format)
+(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (prefix base64 base64:))
+(use readline apropos json http-client directory-utils typed-records)
+(use http-client srfi-18 extras format tcp-server tcp)
 
 ;; Added for csv stuff - will be removed
 ;;
 (use sparse-vectors)
 
@@ -80,11 +97,21 @@
 (require-library mutils)
 
 (define *usage-log-file* #f)    ;; put path to file for logging usage in this var in the ~/.megatestrc file
 (define *usage-use-seconds* #t) ;; for Epoc seconds in usage logging change this to #t in ~/.megatestrc file
 
+;; set some parameters here - these need to be put in something that can be loaded from other
+;; executables such as dashboard and mtutil
+;;
+(include "transport-mode.scm")
 (dbfile:db-init-proc db:initialize-main-db)
+(debug:enable-timestamp #t) 
+
+
+(set! rmtmod:send-receive rmt:send-receive)
+ ;;(lambda params (apply rmt:send-receive params))) ;; make send-receive available to rmtmod via parameter
+
 
 ;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file
 ;;
 (let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc")))
   (if (common:file-exists? debugcontrolf)
@@ -231,10 +258,11 @@
   -ping run-id|host:port  : ping server, exit with 0 if found
   -debug N|N,M,O...       : enable debug 0-N or N and M and O ...
   -debug-noprop N|M,M,O...: enable debug but do not propagate to subprocesses via MT_DEBUG
   -config fname           : override the megatest.config file with fname
   -append-config fname    : append fname to the megatest.config file
+  -import-sexpr fname     : import a sexpr file (use -list-runs % -dumpmode sexpr to create)
 
 Utilities
   -env2file fname         : write the environment to fname.csh and fname.sh
   -envcap a               : save current variables labeled as context 'a' in file envdat.db
   -envdelta a-b           : output enviroment delta from context a to context b to -o fname
@@ -256,10 +284,11 @@
   -list-test-time	  : list time requered to complete each test in a run. It following following arguments
                             -runname <patt> -target <patt> -dumpmode <csv,json,plain-text>
   -syscheck               : do some very basic checks; write access and space in tmp, home, runs, links and 
                             is $DISPLAY valid 
   -list-waivers           : dump waivers for specified target, runname, testpatt to stdout
+  -db2db                  : sync db to db, use -from, -to for dbs, -period and -timeout for continuous sync
 
 Diff report
   -diff-rep               : generate diff report (must include -src-target, -src-runname, -target, -runname
                                                   and either -diff-email or -diff-html)
   -src-target <target>
@@ -305,11 +334,11 @@
 			":status"
 			"-status"
 			"-list-runs"
                         "-testdata-csv"
 			"-testpatt"
-                        "--modepatt"
+                        ;; "--modepatt"
                         "-modepatt"
                         "-tagexpr"
 			"-itempatt"
 			"-setlog"
 			"-set-toplog"
@@ -318,10 +347,11 @@
 			"-m"
 			"-rerun"
 
 			"-days"
 			"-rename-run"
+			"-from"
 			"-to"
 			"-dest"
                         "-source" 
                         "-time-stamp" 
 			;; values and messages
@@ -349,10 +379,13 @@
 			"-env2file"
 			"-envcap"
 			"-envdelta"
 			"-setvars"
 			"-set-state-status"
+			"-import-sexpr"
+			"-period"  ;; sync period in seconds
+			"-timeout" ;; exit sync if timeout in seconds exceeded since last change
 
                         ;; move runs stuff here
                         "-remove-keep"           
 			"-set-run-status"
 			"-age"
@@ -373,10 +406,11 @@
 			"-load"        ;; load and exectute a scheme file
 			"-section"
 			"-var"
 			"-dumpmode"
 			"-run-id"
+			"-db"
 			"-ping"
 			"-refdb2dat"
 			"-o"
 			"-log"
                         "-sync-log"
@@ -465,10 +499,11 @@
 
 			"-convert-to-norm"
 			"-convert-to-old"
 			"-import-megatest.db"
 			"-sync-to-megatest.db"
+			"-db2db"
                         "-sync-brute-force"
 			"-logging"
 			"-v" ;; verbose 2, more than normal (normal is 1)
 			"-q" ;; quiet 0, errors/warnings only
 
@@ -582,19 +617,19 @@
 
 ;; this segment will run launch:setup only if -log is not set. This is fairly safe as servers are not
 ;; manually started and thus should never be started in a non-megatest area. Thus no need to handle situation
 ;; where (launch:setup) returns #f?
 ;;
-(if (or (args:get-arg "-log")(args:get-arg "-server")) ;; redirect the log always when a server
+(if (or (args:get-arg "-log")#;(args:get-arg "-server")) ;; redirect the log always when a server
     (handle-exceptions
 	exn
-	(begin
-	  (print "ERROR: Failed to switch to log output. " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)
-	  )
+      (begin
+	(print "ERROR: Failed to switch to log output. " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))
       (let* ((tl   (or (args:get-arg "-log")(launch:setup)))   ;; run launch:setup if -server, ensure we do NOT run launch:setup if -log specified
+	     (dbname (args:get-arg "-db"))   ;; for the server logfile name
 	     (logf (or (args:get-arg "-log") ;; use -log unless we are a server, then craft a logfile name
-		       (conc tl "/logs/server-" (current-process-id) "-" (get-host-name) ".log")))
+		       (conc tl "/logs/server-"(or dbname "unk")"-"(current-process-id) "-" (get-host-name) ".log")))
 	     (oup  (open-logfile logf)))
 	(if (not (args:get-arg "-log"))
 	    (hash-table-set! args:arg-hash "-log" logf)) ;; fake out future queries of -log
 	(debug:print-info 0 *default-log-port* "Sending log output to " logf)
 	(set! *default-log-port* oup))))
@@ -651,26 +686,13 @@
 		       (process:children #f))
 		      (original-exit exit-code)))))
 
 ;; for some switches always print the command to stderr
 ;;
-(if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun")
+(if (args:any-defined? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun")
     (debug:print 0 *default-log-port* (string-intersperse (argv) " ")))
 
-;; some switches imply homehost. Exit here if not on homehost
-;;
-(let ((homehost-required  (list "-cleanup-db")))
-  (if (apply args:any? homehost-required)
-      (if (not (server:choose-server *toppath* 'home?))
-	  (for-each
-	   (lambda (switch)
-	     (if (args:get-arg switch)
-		 (begin
-		   (debug:print 0 *default-log-port* "ERROR: you must be on the homehost to run with " switch
-				", you can move homehost by removing the .homehost file but this will disrupt any runs in progress.")
-		   (exit 1))))
-	   homehost-required))))
 
 ;;======================================================================
 ;; Misc setup stuff
 ;;======================================================================
 
@@ -883,11 +905,13 @@
       ))
 
 (if (args:get-arg "-ping")
     (let* ((server-id     (string->number (args:get-arg "-ping"))) ;; extract run-id (i.e. no ":"
 	   (host:port     (args:get-arg "-ping")))
-      (server:ping (or server-id host:port) #f do-exit: #t)))
+      (debug:print 0 *default-log-port* "NOT YET REIMPLEMENTED FOR TCP/INMEM") ;; bug
+      (exit)))
+      ;; (server:ping (or server-id host:port) #f do-exit: #t)))
 
 ;;======================================================================
 ;; Capture, save and manipulate environments
 ;;======================================================================
 
@@ -934,13 +958,25 @@
 ;;======================================================================
 
 ;; Server? Start up here.
 ;;
 (if (args:get-arg "-server")
-    (let ((tl        (launch:setup))
-          (transport-type (string->symbol (or (args:get-arg "-transport") "http"))))
-      (server:launch 0 transport-type)
+    (let* (;; (run-id     (args:get-arg "-run-id"))
+	   (dbfname    (args:get-arg "-db"))
+	   (tl         (launch:setup))
+	   (keys       (keys:config-get-fields *configdat*)))
+      (case (rmt:transport-mode)
+	((tcp)
+	 (let* ((timeout    (server:expiration-timeout)))
+	   (debug:print 0 *default-log-port* "INFO: Running using tcp method with server timeout of "timeout)
+	   (tt-server-timeout-param timeout)
+	   (if dbfname
+	       (tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys)
+	       (begin
+		 (debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -db is required.")
+		 (exit 1)))))
+	(else (debug:print 0 *default-log-port* "ERROR: rmt:transport-mode value not recognised "(rmt:transport-mode))))
       (set! *didsomething* #t)))
 
 ;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to
 ;; a specific Megatest area. Detail are being hashed out and this may change.
 ;;
@@ -950,23 +986,31 @@
       (set! *didsomething* #t)))
 
 (if (or (args:get-arg "-list-servers")
         (args:get-arg "-kill-servers"))
     (let ((tl (launch:setup)))
+      (debug:print 0 *default-log-port* "NOT YET REIMPLEMENTED FOR TCP/INMEM") ;; BUG
+      (exit)
       (if tl ;; all roads from here exit
 	  (let* ((servers (server:choose-server *toppath* 'all-valid)) ;; (server:get-list *toppath*))
 		 (fmtstr  "~33a~22a~20a~20a~8a\n"))
-	    (format #t fmtstr "ID" "host:port" "age (hms)" "Last mod" "State")
-	    (format #t fmtstr "==" "=========" "=========" "========" "=====")
-	    (for-each ;;  ( mod-time host port start-time pid )
+            (if (not servers)
+              (begin
+                (debug:print-info 1 *default-log-port* "No servers found")
+                (exit)
+              )
+            )
+       	    (format #t fmtstr "PID" "host:port" "age (hms)" "Last mod" "State")
+	    (format #t fmtstr "===" "=========" "=========" "========" "=====")
+	    (for-each ;;  (ip-addr port? mod-time host port start-time pid )
 	     (lambda (server)
-	       (let* ((mtm (any->number (car server)))
+	       (let* ((mtm (any->number (caddr server)))
 		      (mod (if mtm (- (current-seconds) mtm) "unk"))
-		      (age (- (current-seconds)(or (any->number (list-ref server 3)) (current-seconds))))
-		      (url (conc (cadr server) ":" (caddr server)))
+		      (age (- (current-seconds)(or (any->number mtm) (current-seconds))))
 		      (pid (list-ref server 4))
-		      (alv (if (number? mod)(< mod 10) #f)))
+		      (url (conc (car server) ":" (cadr server)))
+		      (alv (if (number? mod)(< mod 360) #f)))
 		 (format #t
 			 fmtstr
 			 pid
 			 url
 			 (seconds->hr-min-sec age)
@@ -979,11 +1023,10 @@
 		       (server:kill server)))))
 	     (sort servers (lambda (a b)
 			     (let ((ma (or (any->number (car a)) 9e9))
 				   (mb (or (any->number (car b)) 9e9)))
 			       (> ma mb)))))
-	    ;; (debug:print-info 1 *default-log-port* "Done with listservers")
 	    (set! *didsomething* #t)
 	    (exit))
 	  (exit))))
       ;; must do, would have to add checks to many/all calls below
 
@@ -1383,12 +1426,11 @@
 ;; IDEA: megatest list -runname blah% ...
 ;;
 (if (or (args:get-arg "-list-runs")
 	(args:get-arg "-list-db-targets"))
     (if (launch:setup)
-	(let* (;; (dbstruct    (make-dbr:dbstruct path: *toppath* local: (args:get-arg "-local")))
-	       (runpatt     (args:get-arg "-list-runs"))
+	(let* ((runpatt     (args:get-arg "-list-runs"))
                (access-mode (db:get-access-mode))
 	       (testpatt    (common:args-get-testpatt #f))
 	       ;; (if (args:get-arg "-testpatt") 
 	       ;;  	        (args:get-arg "-testpatt") 
 	       ;;  	        "%"))
@@ -1433,10 +1475,15 @@
 				  db:test-record-fields
 				  t)))
 	       (adj-tests-spec (delete-duplicates (if tests-spec (cons "id" tests-spec) db:test-record-fields))) ;; '("id"))))
 	       (steps-spec  (alist-ref "steps" fields-spec equal?))
 	       (test-field-index (make-hash-table)))
+	  (if (and (args:get-arg "-dumpmode")
+		   (not (member (args:get-arg "-dumpmode") '("sexpr" "json" "ods" "list"))))
+	      (begin
+		(debug:print 0 *default-log-port* "ERROR: dumpmode "(args:get-arg "-dumpmode")" not recognised. Use sexpr, json, ods or list")
+		(exit)))
 	  (if (and tests-spec (not (null? tests-spec))) ;; do some validation and processing of the test-spec
 	      (let ((invalid-tests-spec (filter (lambda (x)(not (member x db:test-record-fields))) tests-spec)))
 		(if (null? invalid-tests-spec)
 		    ;; generate the lookup map test-field-name => index-number
 		    (let loop ((hed (car adj-tests-spec))
@@ -1488,11 +1535,11 @@
 			;; (mutils:hierhash-set! data (conc (db:get-value-by-header run header "id"))  targetstr runname "meta" "id"         )
 			;; (mutils:hierhash-set! data (db:get-value-by-header run header "event_time") targetstr runname "meta" "event_time" )
 			;; (mutils:hierhash-set! data (db:get-value-by-header run header "comment")    targetstr runname "meta" "comment"    )
 			;; ;; add last entry twice - seems to be a bug in hierhash?
 			;; (mutils:hierhash-set! data (db:get-value-by-header run header "comment")    targetstr runname "meta" "comment"    )
-		       (else
+		       ((#f list)
 			(if (null? runs-spec)
 			    (print "Run: " targetstr "/" runname 
 				   " status: " (db:get-value-by-header run header "state")
 				   " run-id: " run-id ", number tests: " (length tests)
 				   " event_time: " (db:get-value-by-header run header "event_time"))
@@ -1504,11 +1551,14 @@
 			       (lambda (field-name)
 				 (if (equal? field-name "target")
 				     (display (conc "target: " targetstr " "))
 				     (display (conc field-name ": " (db:get-value-by-header run header (conc field-name)) " "))))
 			       runs-spec)
-			      (newline)))))
+			      (newline))))
+		       (else
+			(debug:print 0 *default-log-port* "ERROR: dumpmode "(args:get-arg "-dumpmode")" not recognised. Use sexpr, json, ods or list")
+			))
 		       
 		     (for-each 
 		      (lambda (test)
 		      	(common:debug-handle-exceptions #f
 			 exn
@@ -1856,11 +1906,11 @@
        (lambda (target runname keys keyvals)
 	 (if (or (string-search "%" target)
 		 (string-search "%" runname)) ;; we are being asked to re-run multiple runs
 	     (let* ((run-specs (rmt:simple-get-runs runname #f #f target #f))) ;; list of simple-run records
 	       (debug:print-info 0 *default-log-port* "Pattern supplied for target or runname with "
-				 (length run-specs) " matches round. Running each in turn.")
+				 (length run-specs) " matches found. Running each in turn.")
 	       (if (null? run-specs)
 		   (debug:print 0 *default-log-port* "WARNING: No runs match target " target " and runname " runname))
 	       (for-each (lambda (spec) 
 			   (let* ((precmd     (if (args:get-arg "-precmd")(conc (args:get-arg "-precmd") " ") ""))
 				  (newcmdline (conc
@@ -2054,11 +2104,11 @@
 (if (args:get-arg "-extract-ods")
     (general-run-call
      "-extract-ods"
      "Make ods spreadsheet"
      (lambda (target runname keys keyvals)
-       (let ((dbstruct   (make-dbr:dbstruct path: *toppath* local: #t))
+       (let ((dbstruct   (make-dbr:dbstruct areapath: *toppath* local: #t))
 	     (outputfile (args:get-arg "-extract-ods"))
 	     (runspatt   (or (args:get-arg "-runname")(args:get-arg ":runname")))
 	     (pathmod    (args:get-arg "-pathmod")))
 	     ;; (keyvalalist (keys->alist keys "%")))
 	 (debug:print 2 *default-log-port* "Extract ods, outputfile: " outputfile " runspatt: " runspatt " keyvals: " keyvals)
@@ -2177,11 +2227,10 @@
 		(exit 1)))
 
 	  (if (args:get-arg "-runstep")(debug:print-info 1 *default-log-port* "Running -runstep, first change to directory " work-area))
 	  (change-directory work-area)
 	  ;; can setup as client for server mode now
-	  ;; (client:setup)
 
 	  (if (args:get-arg "-load-test-data")
 	      ;; has sub commands that are rdb:
 	      ;; DO NOT put this one into either rmt: or open-run-close
 	      (tdb:load-test-data run-id test-id))
@@ -2321,10 +2370,16 @@
     (begin
       (if (not (launch:setup))
 	  (begin
 	    (debug:print 0 *default-log-port* "Failed to setup, exiting") 
 	    (exit 1)))
+
+;;      (if (not (server:choose-server *toppath* 'home?))
+;;	  (begin
+;;	    (debug:print 0 *default-log-port* "Servers are not running on this host or no servers alive. Cannot run cleanup-db")
+;;	    (exit 1)))
+
       (let ((dbstructs (db:setup #f)))
         (common:cleanup-db dbstructs))
       (set! *didsomething* #t)))
 
 (if (args:get-arg "-mark-incompletes")
@@ -2379,11 +2434,13 @@
 (if (or (getenv "MT_RUNSCRIPT")
 	(args:get-arg "-repl")
 	(args:get-arg "-load"))
     (let* ((toppath (launch:setup))
 	   (dbstructs (if (and toppath
-                               (server:choose-server toppath 'home?))
+			       ;; NOTE: server:choose-server is starting a server
+			       ;;   either add equivalent for tcp mode or ????
+                               #;(server:choose-server toppath 'home?))
                           (db:setup #t)
                           #f))) ;; make-dbr:dbstruct path: toppath local: (args:get-arg "-local")) #f)))
       (if *toppath*
 	  (cond
 	   ((getenv "MT_RUNSCRIPT")
@@ -2479,14 +2536,15 @@
        'adj-testids
        'old2new
        )
       (set! *didsomething* #t)))
 
-(when (args:get-arg "-sync-brute-force")
-  (launch:setup)
-  ((server:get-bruteforce-syncer (db:setup #t) persist-until-sync: #t))
-  (set! *didsomething* #t))
+(if (args:get-arg "-import-sexpr")
+    (begin
+      (launch:setup)
+      (rmt:import-sexpr (args:get-arg "-import-sexpr"))
+      (set! *didsomething* #t)))
 
 (if (args:get-arg "-sync-to-megatest.db")
     (let* ((duh      (launch:setup))
 	   (dbstruct (db:setup #t))
 	   (tmpdbpth (dbr:dbstruct-tmppath dbstruct))
@@ -2507,10 +2565,50 @@
 (if (args:get-arg "-sync-to")
     (let ((toppath (launch:setup)))
       (tasks:sync-to-postgres *configdat* (args:get-arg "-sync-to"))
       (set! *didsomething* #t)))
 
+;; use with -from and -to
+;;
+(if (args:get-arg "-db2db")
+    (let* ((duh         (launch:setup))
+	   (src-db      (args:get-arg "-from"))
+	   (dest-db     (args:get-arg "-to"))
+	   (sync-period (args:get-arg "-period"))    ;; NOT IMPLEMENTED YET
+	   (sync-timeout (args:get-arg "-timeout"))  ;; NOT IMPLEMENTED YET
+	   (lockfile    (conc dest-db".sync-lock"))
+	   (keys        (db:get-keys #f))
+	   )
+      
+      (if (and src-db dest-db)
+	  (if (file-exists? src-db)
+	      (if (file-exists? lockfile)
+		  (debug:print 0 *default-log-port* "Lock "lockfile" exists, skipping sync...")
+		  (dbfile:with-simple-file-lock
+		   lockfile
+		   (lambda ()
+		     ;;(with-output-to-file lockfile
+		     ;;  (lambda ()
+		     ;;	(print (current-process-id))))
+		     (debug:print-info 0 *default-log-port* "Attempting to sync data from "src-db" to "dest-db"...")
+		     (if #f ;; (not (file-exists? dest-db))
+			 (begin
+			   (dbfile:with-simple-file-lock
+			    (conc dest-db ".lock") ;; is the db being opened right now?
+			    (lambda ()
+			      (debug:print 0 *default-log-port* "Using copy to create "dest-db" from "src-db)
+			      (file-copy src-db dest-db))))
+			 (let ((res (dbmod:db-to-db-sync src-db dest-db 0 (dbfile:db-init-proc) keys)))
+			   (if res
+			       (debug:print-info 0 *default-log-port* "Synced " res " records from "src-db" to "dest-db)
+			       (debug:print-info 0 *default-log-port* "No sync due to permissions or other issue."))))
+		     ;; (delete-file* lockfile)
+		     )))
+	      (debug:print 0 *default-log-port* "No sync due to unreadble or non-existant source file"src-db))
+	  (debug:print 0 *default-log-port* "Usage for -db2db; -to and -from must be specified"))
+      (set! *didsomething* #t)))
+
 (if (args:get-arg "-list-test-time")
      (let* ((toppath (launch:setup))) 
      (task:get-test-times)  
      (set! *didsomething* #t)))
 

DELETED mlaunch.scm
Index: mlaunch.scm
==================================================================
--- mlaunch.scm
+++ /dev/null
@@ -1,33 +0,0 @@
-;; Copyright 2006-2014, Matthew Welland.
-;; 
-;; This file is part of Megatest.
-;; 
-;;     Megatest is free software: you can redistribute it and/or modify
-;;     it under the terms of the GNU General Public License as published by
-;;     the Free Software Foundation, either version 3 of the License, or
-;;     (at your option) any later version.
-;; 
-;;     Megatest is distributed in the hope that it will be useful,
-;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
-;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;;     GNU General Public License for more details.
-;; 
-;;     You should have received a copy of the GNU General Public License
-;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
-
-;;  strftime('%m/%d/%Y %H:%M:%S','now','localtime')
-
-;;======================================================================
-;; MLAUNCH
-;;
-;;   take jobs from the given queue and keep launching them keeping
-;;   the cpu load at the targeted level
-;;
-;;======================================================================
-
-(use sqlite3 srfi-1 posix regex regex-case srfi-69 format)
-
-(declare (unit mlaunch))
-(declare (uses db))
-(declare (uses common))
-

Index: monitor.scm
==================================================================
--- monitor.scm
+++ monitor.scm
@@ -23,10 +23,12 @@
 (declare (unit runs))
 (declare (uses db))
 (declare (uses common))
 (declare (uses items))
 (declare (uses runconfig))
+(declare (uses commonmod))
+(import commonmod)
 
 (include "common_records.scm")
 (include "key_records.scm")
 (include "db_records.scm")
 (include "run_records.scm")

Index: mt.scm
==================================================================
--- mt.scm
+++ mt.scm
@@ -19,19 +19,23 @@
 
 (use sqlite3 srfi-1 posix regex regex-case srfi-69 dot-locking (srfi 18) posix-extras directory-utils call-with-environment-variables)
 (import (prefix sqlite3 sqlite3:))
 
 (declare (unit mt))
+(declare (uses debugprint))
 (declare (uses db))
 (declare (uses common))
 (declare (uses items))
 (declare (uses runconfig))
 (declare (uses tests))
 (declare (uses server))
 (declare (uses runs))
 (declare (uses rmt))
-;; (declare (uses filedb))
+(declare (uses rmtmod))
+
+(import debugprint
+	rmtmod)
 
 (include "common_records.scm")
 (include "key_records.scm")
 (include "db_records.scm")
 (include "run_records.scm")

Index: mtargs.scm
==================================================================
--- mtargs.scm
+++ mtargs.scm
@@ -17,7 +17,9 @@
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;======================================================================
 
 (declare (unit mtargs))
+
+(use srfi-69)
 
 (include "mtargs/mtargs.scm")

ADDED   mtargs/mtargs.egg
Index: mtargs/mtargs.egg
==================================================================
--- /dev/null
+++ mtargs/mtargs.egg
@@ -0,0 +1,7 @@
+((license "LGPL")
+ (version 0.1)
+ (category misc)
+ (dependencies srfi-69 srfi-1)
+ (author "Matt Welland")
+ (synopsis "Primitive argument processor.")
+ (components (extension mtargs)))

Index: mtargs/mtargs.scm
==================================================================
--- mtargs/mtargs.scm
+++ mtargs/mtargs.scm
@@ -18,28 +18,45 @@
 
 (module mtargs
     (
      arg-hash
      get-arg
+     get-arg-number
      get-arg-from
-     usage
      get-args
+     usage
      print-args
      any-defined?
-     help
-     )
-
-(import scheme chicken data-structures extras posix ports files)
-(use srfi-69 srfi-1)
-
-(define arg-hash (make-hash-table))
-(define help "")
+     ) 
+
+(import scheme) ;; gives us cond-expand in chicken-4
+
+(cond-expand
+ (chicken-5
+  (import scheme (chicken base) (chicken port) (chicken file) (chicken process-context))
+  (import srfi-69 srfi-1))
+ (chicken-4
+  (import chicken posix srfi-69 srfi-1))
+ (else))
+
+(define usage (make-parameter print))
+(define arg-hash (make-hash-table))
 
 (define (get-arg arg . default)
   (if (null? default)
       (hash-table-ref/default arg-hash arg #f)
       (hash-table-ref/default arg-hash arg (car default))))
+
+;; get an arg as a number
+(define (get-arg-number arg . default)
+  (let* ((val-str (get-arg arg))
+	 (val     (if val-str (string->number val-str) #f)))
+    (if val
+	val
+	(if (null? default)
+	    #f
+	    default))))
 
 (define (any-defined? . args)
   (not (null? (filter (lambda (x) x)
 		      (map get-arg args)))))
 
@@ -48,28 +65,10 @@
 (define (get-arg-from ht arg . default)
   (if (null? default)
       (hash-table-ref/default ht arg #f)
       (hash-table-ref/default ht arg (car default))))
 
-(define (usage . args)
-  (if (> (length args) 0)
-      (apply print "ERROR: " args))
-  (if (string? help)
-      (print help)
-      (print "Usage: " (car (argv)) " ... "))
-  (exit 0))
-
- ;; one-of args defined
-(define (any-defined? . param)
-  (let ((res #f))
-    (for-each 
-     (lambda (arg)
-       (if (get-arg arg)(set! res #t)))
-     param)
-    res))
-
-;; args: 
 (define (get-args args params switches arg-hash num-needed)
   (let* ((numtargs (length args))
 	 (adj-num-needed (if num-needed (+ num-needed 2) #f)))
     (if (< numtargs (if adj-num-needed adj-num-needed 2))
 	(if (>= num-needed 1)
@@ -94,13 +93,16 @@
 	   (else
 	    (if (null? tail)(append remtargs (list arg)) ;; return the non-used args
 		(loop (car tail)(cdr tail)(append remtargs (list arg))))))))
     ))
 
-(define (print-args remtargs arg-hash)
-  (print "ARGS: " remtargs)
+(define (print-args arg-hash)
   (for-each (lambda (arg)
 	      (print "   " arg "   " (hash-table-ref/default arg-hash arg #f)))
 	    (hash-table-keys arg-hash)))
 
+(define (any-defined? . args)
+  (not (null? (filter (lambda (x) x)
+		      (map get-arg args)))))
+
 
 )

Index: mtexec.scm
==================================================================
--- mtexec.scm
+++ mtexec.scm
@@ -26,13 +26,16 @@
    srfi-19  srfi-18 extras format pkts regex regex-case
      (prefix dbi dbi:)
      )
 
 ;; (declare (uses common))
-(declare (uses margs))
+(declare (uses mtargs))
 (declare (uses configf))
 ;; (declare (uses rmt))
+(declare (uses commonmod))
+(import commonmod
+	(prefix mtargs args:))
 
 ;; (use ducttape-lib)
 (include "megatest-version.scm")
 (include "megatest-fossil-hash.scm")
 

Index: mtut.scm
==================================================================
--- mtut.scm
+++ mtut.scm
@@ -13,12 +13,21 @@
 ;;     GNU General Public License for more details.
 ;; 
 ;;     You should have received a copy of the GNU General Public License
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 ;;
+;
+(declare (uses common))
+(declare (uses mtargs))
+(declare (uses debugprint))
+(declare (uses configf))
+;; (declare (uses rmt))
+(declare (uses commonmod))
+(declare (uses commonmod.import))
 
-;; (include "common.scm")
+(import debugprint)
+					; (include "common.scm")
 (include "megatest-version.scm")
 
 ;; fake out readline usage of toplevel-command
 (define (toplevel-command . a) #f)
 
@@ -26,14 +35,12 @@
    srfi-19  srfi-18 extras format pkts regex regex-case
      (prefix dbi dbi:)
      (prefix sqlite3 sqlite3:)
      nanomsg)
 
-(declare (uses common))
-(declare (uses margs))
-(declare (uses configf))
-;; (declare (uses rmt))
+(import commonmod
+	(prefix mtargs args:))
 
 (use ducttape-lib)
 
 (include "megatest-fossil-hash.scm")
 

Index: newdashboard.scm
==================================================================
--- newdashboard.scm
+++ newdashboard.scm
@@ -16,10 +16,16 @@
 ;;     You should have received a copy of the GNU General Public License
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;======================================================================
 
+(declare (uses common))
+(declare (uses debugprint))
+(declare (uses megatest-version))
+(declare (uses mtargs))
+(declare (uses commonmod))
+
 (use format)
 
 (use (prefix iup iup:))
 
 (use canvas-draw)
@@ -26,19 +32,18 @@
 (import canvas-draw-iup)
 
 (use sql-de-lite srfi-1 posix regex regex-case srfi-69 typed-records sparse-vectors ;; defstruct
      (prefix dbi dbi:))
 
-(declare (uses common))
-(declare (uses megatest-version))
-(declare (uses margs))
+(import commonmod
+	debugprint
+	(prefix mtargs args:))
 
 ;; (declare (uses launch))
 ;; (declare (uses gutils))
 ;; (declare (uses db))
 ;; (declare (uses server))
-;; (declare (uses synchash))
 (declare (uses dcommon))
 ;; (declare (uses tree))
 ;; 
 ;; (include "common_records.scm")
 ;; (include "db_records.scm")

Index: ods.scm
==================================================================
--- ods.scm
+++ ods.scm
@@ -17,10 +17,12 @@
 ;;
 
 (use csv-xml regex)
 (declare (unit ods))
 (declare (uses common))
+(declare (uses commonmod))
+(import commonmod)
 
 (define ods:dirs
   '("Configurations2"
     "Configurations2/toolpanel"
     "Configurations2/menubar"

DELETED portlogger-example.scm
Index: portlogger-example.scm
==================================================================
--- portlogger-example.scm
+++ /dev/null
@@ -1,21 +0,0 @@
-;;  Copyright 2006-2017, Matthew Welland.
-;; 
-;; This file is part of Megatest.
-;; 
-;;     Megatest is free software: you can redistribute it and/or modify
-;;     it under the terms of the GNU General Public License as published by
-;;     the Free Software Foundation, either version 3 of the License, or
-;;     (at your option) any later version.
-;; 
-;;     Megatest is distributed in the hope that it will be useful,
-;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
-;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;;     GNU General Public License for more details.
-;; 
-;;     You should have received a copy of the GNU General Public License
-;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
-
-
-(declare (uses portlogger))
-
-(print (apply portlogger:main (cdr (argv))))

Index: portlogger.scm
==================================================================
--- portlogger.scm
+++ portlogger.scm
@@ -15,23 +15,28 @@
 ;; 
 ;;     You should have received a copy of the GNU General Public License
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 ;;
 
-(require-extension (srfi 18) extras tcp s11n)
-
-(use sqlite3 srfi-1 posix srfi-69 hostinfo dot-locking z3)
-(import (prefix sqlite3 sqlite3:))
 
 (declare (unit portlogger))
-(declare (uses db))
+(declare (uses debugprint))
+(declare (uses dbmod))
+
+(module portlogger
+*
 
+(import scheme chicken data-structures)
+(import srfi-1 posix srfi-69 hostinfo dot-locking z3
+	(srfi 18) extras s11n)
+(import (prefix sqlite3 sqlite3:))
+(import debugprint dbmod)
 ;; lsof -i
 
 (define (portlogger:open-db fname)
   (let* ((avail    (tasks:wait-on-journal fname 5 remove: #t)) ;; wait up to about 10 seconds for the journal to go away
-	 (exists   (common:file-exists? fname))
+	 (exists   (file-exists? fname))
 	 (db       (if avail 
 		       (sqlite3:open-database fname)
 		       (begin
 			 (system (conc "rm -f " fname))
 			 (sqlite3:open-database fname))))
@@ -44,12 +49,11 @@
          ;;                 port INTEGER PRIMARY KEY,
          ;;                 state TEXT DEFAULT 'not-used',
          ;;                 fail_count INTEGER DEFAULT 0,
          ;;                 update_time TIMESTAMP DEFAULT (strftime('%s','now')) );"))))
     (sqlite3:set-busy-handler! db handler)
-    (db:set-sync db) ;; (sqlite3:execute db "PRAGMA synchronous = 0;")
-    ;; (if (not exists) ;; needed with IF NOT EXISTS?
+    (sqlite3:execute db "PRAGMA synchronous = 0;")
     (sqlite3:execute 
      db
      "CREATE TABLE IF NOT EXISTS ports (
             port INTEGER PRIMARY KEY,
             state TEXT DEFAULT 'not-used',
@@ -64,12 +68,12 @@
      exn
      (begin
        ;; (release-dot-lock fname)
        (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params)
        (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
-       (debug:print 5 *default-log-port* "exn=" (condition->list exn))
-       (if (common:file-exists? fname)(delete-file fname)) ;; brutally get rid of it
+       ;; (debug:print 5 *default-log-port* "exn=" (condition->list exn))
+       (if (file-exists? fname)(delete-file fname)) ;; brutally get rid of it
        (print-call-chain (current-error-port)))
      (let* (;; (lock   (obtain-dot-lock fname 2 9 10))
 	    (db     (portlogger:open-db fname))
 	    (res    (apply proc db params)))
        (sqlite3:finalize! db)
@@ -122,17 +126,12 @@
        (or curr var curr))
      #f
      db
      "SELECT (port) FROM ports WHERE state='released' LIMIT 1;")))
 
-(define (portlogger:find-port db)
-  (let* ((lowport (let ((val (configf:lookup *configdat* "server" "lowport")))
-		    (if (and val 
-			     (string->number val))
-			(string->number val)
-			32768)))
-	 (portnum (or (portlogger:get-prev-used-port db)
+(define (portlogger:find-port db #!optional (lowport 32768))
+  (let* ((portnum (or (portlogger:get-prev-used-port db)
 		      (+ lowport ;; top of registered ports is 49152 but lets use ports in the registered range
 			 (random (- 64000 lowport))))))
     (handle-exceptions
      exn
      (begin
@@ -184,5 +183,6 @@
 	     ((failed)(portlogger:set-failed db (string->number (cadr args))) 'failed)))))
     (sqlite3:finalize! db)
     result))
      
 ;; (print (apply portlogger:main (cdr (argv))))
+)

Index: process.scm
==================================================================
--- process.scm
+++ process.scm
@@ -22,10 +22,13 @@
 ;; Process convience utils
 ;;======================================================================
 
 (use regex directory-utils)
 (declare (unit process))
+(declare (uses debugprint))
+
+(import debugprint)
 
 (define (process:conservative-read port)
   (let loop ((res ""))
     (if (not (eof-object? (peek-char port)))
 	(loop (conc res (read-char port)))

Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -19,17 +19,28 @@
 ;;======================================================================
 
 (use format typed-records) ;; RADT => purpose of json format??
 
 (declare (unit rmt))
+(declare (uses debugprint))
 (declare (uses api))
-(declare (uses http-transport))
+(declare (uses commonmod))
 (declare (uses dbfile))
+(declare (uses dbmod))
+(declare (uses tcp-transportmod))
 (include "common_records.scm")
-;; (declare (uses rmtmod))
+(declare (uses rmtmod))
 
-(import dbfile) ;; rmtmod)
+;; used by http-transport
+(import dbfile
+	rmtmod
+	commonmod
+	debugprint
+;; 	dbmemmod
+	dbfile
+	dbmod
+	tcp-transportmod)
 
 ;;
 ;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!!
 ;;
 
@@ -39,64 +50,55 @@
 
 ;;======================================================================
 ;;  S U P P O R T   F U N C T I O N S
 ;;======================================================================
 
-;; if a server is either running or in the process of starting call client:setup
-;; else return #f to let the calling proc know that there is no server available
-;;
-(define (rmt:get-connection-info areapath #!key (area-dat #f)) ;; TODO: push areapath down.
-  (let* ((runremote (or area-dat *runremote*))
-	 (cinfo     (if (remote? runremote)
-			(remote-conndat runremote)
-			#f)))
-	  (if cinfo
-	      cinfo
-	      (if (server:check-if-running areapath)
-		  (client:setup areapath)
-		  #f))))
-
 (define (rmt:on-homehost? runremote)
   (let* ((hh-dat (remote-hh-dat runremote)))
     (if (pair? hh-dat)
 	(cdr hh-dat)
 	(begin
 	  (debug:print-info 0 *default-log-port* "hh-dat="hh-dat)
 	  #f))))
 
+(define (make-and-init-remote areapath)
+   (case (rmt:transport-mode)
+     ((http)(make-remote))
+     ((tcp) (tt:make-remote areapath))
+     (else #f)))
 
 ;;======================================================================
 
 (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id
 
 ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
 ;;
 (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected
-
-  #;(common:telemetry-log (conc "rmt:"(->string cmd))
-                        payload: `((rid . ,rid)
-                                   (params . ,params)))
-
-  (if (> attemptnum 2)
-      (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum))
-    
-  (cond
-   ((> attemptnum 2) (thread-sleep! 0.05))
-   ((> attemptnum 10) (thread-sleep! 0.5))
-   ((> attemptnum 20) (thread-sleep! 1)))
-  (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15)))  
-    (begin (server:run *toppath*) (thread-sleep! 3))) 
-  
-  
-  ;;DOT digraph megatest_state_status {
-  ;;DOT   ranksep=0;
-  ;;DOT   // rankdir=LR;
-  ;;DOT   node [shape="box"];
-  ;;DOT "rmt:send-receive" -> MUTEXLOCK;
-  ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; }
-  ;; do all the prep locked under the rmt-mutex
-  (mutex-lock! *rmt-mutex*)
+  (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.")
+
+  (if (not (eq? (rmt:transport-mode) 'nfs))
+      (begin
+	(if (> attemptnum 2)
+	    (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum))
+	
+	(cond
+	 ((> attemptnum 2) (thread-sleep! 0.05))
+	 ((> attemptnum 10) (thread-sleep! 0.5))
+	 ((> attemptnum 20) (thread-sleep! 1)))
+	
+	;; I'm turning this off, it may make sense to move it
+	;; into http-transport-handler
+	(if (and (> attemptnum 5) (= 0 (modulo attemptnum 15)))  
+	    (begin
+	      (debug:print 0 *default-log-port* "ERROR: can't connect to server, trying to start a server.")
+	      (case (rmt:transport-mode)
+		((http)
+		 (server:run *toppath*)
+		 (thread-sleep! 3))
+		(else
+		 (thread-sleep! 1) ;; for tcp the server is started by routines in tcp-transportmod. For nfs there is no server
+		 ))))))
   
   ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
   ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
   ;; 3. do the query, if on homehost use local access
   ;;
@@ -103,259 +105,33 @@
   (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
          (areapath      *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
 	 (runremote     (or area-dat
 			    *runremote*))
          (attemptnum    (+ 1 attemptnum))
-	 (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*)))
-
-    ;; DOT INIT_RUNREMOTE; // leaving off - doesn't really add to the clarity
-    ;; DOT MUTEXLOCK -> INIT_RUNREMOTE [label="no remote?"];
-    ;; DOT INIT_RUNREMOTE -> MUTEXLOCK;
-    ;; ensure we have a record for our connection for given area
-    (if (not runremote)                   ;; can remove this one. should never get here.         
-	(begin
-	  (set! *runremote* (make-remote))
-          (let* ((server-info (remote-server-info *runremote*))) 
-            (if server-info
-		(begin
-			(remote-server-url-set! *runremote* (server:record->url server-info))
-			(remote-server-id-set! *runremote* (server:record->id server-info)))))  
-	  (set! runremote   *runremote*))) ;; new runremote will come from this on next iteration
-    
-    ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity
-    ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"];
-    ;; DOT SET_HOMEHOST -> MUTEXLOCK;
-    ;; ensure we have a homehost record
-    (if (not (pair? (remote-hh-dat runremote)))  ;; not on homehost
-	(thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
-	(let ((hh-data (server:choose-server areapath 'homehost)))
-	  (remote-hh-dat-set! runremote (or hh-data (cons #f #f)))))
-    
-    ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
-    (cond
-     #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds
-      (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.")
-      (set! *runremote* #f)
-      ;; BUG: close-connections should go here?
-      (mutex-unlock! *rmt-mutex*)
-      (rmt:send-receive cmd rid params attemptnum: 1 area-dat: area-dat))
-     
-     ;;DOT EXIT;
-     ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" }
-     ;; give up if more than 150 attempts
-     ((> attemptnum 150)
-      (debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. Giving up.")
-      (exit 1))
-
-     ;;DOT CASE2 [label="local\nreadonly\nquery"];
-     ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2}
-     ;;DOT CASE2 -> "rmt:open-qry-close-locally";
-     ;; readonly mode, read request-  handle it - case 2
-     ((and readonly-mode
-           (member cmd api:read-only-queries)) 
-      (mutex-unlock! *rmt-mutex*)
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2")
-      (rmt:open-qry-close-locally cmd 0 params)
-      )
-
-     ;;DOT CASE3 [label="write in\nread-only mode"];
-     ;;DOT MUTEXLOCK -> CASE3 [label="readonly\nmode?"]; {rank=same "case 3" CASE3}
-     ;;DOT CASE3 -> "#f";
-     ;; readonly mode, write request.  Do nothing, return #f
-     (readonly-mode (extras-readonly-mode *rmt-mutex* *default-log-port* cmd params))
-
-     ;; This block was for pre-emptively resetting the connection if there had been no communication for some time.
-     ;; I don't think it adds any value. If the server is not there, just fail and start a new connection.
-     ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout)
-     ;;
-     ;;DOT CASE4 [label="reset\nconnection"];
-     ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4}
-     ;;DOT CASE4 -> "rmt:send-receive";
-     ;; reset the connection if it has been unused too long
-     ((and runremote
-           (remote-conndat runremote)
-	   (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on
-	      (+ (http-transport:server-dat-get-last-access (remote-conndat runremote))
-		 (remote-server-timeout runremote))))
-      (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.")
-      (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection.
-      (http-transport:close-connections area-dat: runremote)
-      (mutex-unlock! *rmt-mutex*)
-      (rmt:send-receive cmd rid params attemptnum: attemptnum))
-     
-     ;;DOT CASE5 [label="local\nread"];
-     ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5};
-     ;;DOT CASE5 -> "rmt:open-qry-close-locally";
-
-     ;; on homehost and this is a read
-     ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required
-	   (rmt:on-homehost? runremote)
-           (member cmd api:read-only-queries))   ;; this is a read
-      (mutex-unlock! *rmt-mutex*)
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  5")
-      (rmt:open-qry-close-locally cmd 0 params))
-
-     ;;DOT CASE6 [label="init\nremote"];
-     ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6};
-     ;;DOT CASE6 -> "rmt:send-receive";
-     ;; on homehost and this is a write, we already have a server, but server has died
-
-     ;; reinstate this keep-alive section but inject a time condition into the (add ...
-     
-     #;((and (cdr (remote-hh-dat runremote))           ;; on homehost
-           (not (member cmd api:read-only-queries))  ;; this is a write
-           (remote-server-url runremote)             ;; have a server
-           (not (server:ping (remote-server-url runremote) (remote-server-id runremote))))  ;; server has died. NOTE: this is not a cheap call! Need better approach.
-      (debug:print 0 *default-log-port* "WARNING: server appears to have died, trying to reconnect, case 6")
-      (http-transport:close-connections area-dat: runremote) ;; make sure to clean up
-      (set! *runremote* (make-remote))
-      (let* ((server-info (remote-server-info *runremote*))) 
-            (if server-info
-		(begin
-		  (remote-server-url-set! *runremote* (server:record->url server-info))
-                  (remote-server-id-set! *runremote* (server:record->id server-info)))))
-      (remote-force-server-set! runremote (common:force-server?))
-      (mutex-unlock! *rmt-mutex*)
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  6")
-      (rmt:send-receive cmd rid params attemptnum: attemptnum))
-
-     ;;DOT CASE7 [label="homehost\nwrite"];
-     ;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7};
-     ;;DOT CASE7 -> "rmt:open-qry-close-locally";
-     ;; on homehost and this is a write, we already have a server
-     ((and (not (remote-force-server runremote))     ;; honor forced use of server, i.e. server NOT required
-	   (cdr (remote-hh-dat runremote))           ;; on homehost
-           (not (member cmd api:read-only-queries))  ;; this is a write
-           (remote-server-url runremote))            ;; have a server
-      (mutex-unlock! *rmt-mutex*)
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  4.1")
-      (rmt:open-qry-close-locally cmd 0 params))
-
-     ;;DOT CASE8 [label="force\nserver"];
-     ;;DOT MUTEXLOCK -> CASE8 [label="server not required,\nhave homehost info,\nno connection yet,\nnot a read-only query"]; {rank=same "case 8" CASE8};
-     ;;DOT CASE8 -> "rmt:open-qry-close-locally";
-     ;;  on homehost, no server contact made and this is a write, passively start a server 
-     ((and (not (remote-force-server runremote))     ;; honor forced use of server, i.e. server NOT required
-	   (cdr (remote-hh-dat runremote))           ;; have homehost
-           (not (remote-server-url runremote))       ;; no connection yet
-	   (not (member cmd api:read-only-queries))) ;; not a read-only query
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
-      (let ((server-info  (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
-	(if server-info
-	    (begin
-              (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed
-              (remote-server-id-set! runremote (server:record->id server-info)))  
-	    (if (common:force-server?)
-		(server:start-and-wait *toppath*)
-		(server:kind-run *toppath*)))
-      (remote-force-server-set! runremote (common:force-server?))
-      (mutex-unlock! *rmt-mutex*)
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8.1")
-      (rmt:open-qry-close-locally cmd 0 params)))
-
-     ;;DOT CASE9 [label="force server\nnot on homehost"];
-     ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9};
-     ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive";
-     ((or (and (remote-force-server runremote)              ;; we are forcing a server and don't yet have a connection to one
-	       (not (remote-conndat runremote)))
-	  (and (not (cdr (remote-hh-dat runremote)))        ;; not on a homehost 
-	       (not (remote-conndat runremote))))           ;; and no connection
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote))
-      (mutex-unlock! *rmt-mutex*)
-      (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up?
-	  (server:start-and-wait *toppath*))
-      (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http
-      (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as
-
-     ;;DOT CASE10 [label="on homehost"];
-     ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10};
-     ;;DOT CASE10 -> "rmt:open-qry-close-locally";
-     ;; all set up if get this far, dispatch the query
-     ((and (not (remote-force-server runremote))
-	   (cdr (remote-hh-dat runremote))) ;; we are on homehost
-      (mutex-unlock! *rmt-mutex*)
-      (debug:print-info 12 *default-log-port* "rmt:send-receive, case 10")
-      (rmt:open-qry-close-locally cmd (if rid rid 0) params))
-
-     ;;DOT CASE11 [label="send_receive"];
-     ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11};
-     ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"];
-     ;;DOT CASE11 -> "RESULT" [label="call succeeded"];
-     ;; not on homehost, do server query
-     (else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid)))))
-    ;;DOT }
-
-;; No Title 
-;; Error: (vector-ref) out of range
-;; #(#<condition: (exn type)> (#("db.scm:3740: regex#regexp" #f #f) #("db.scm:3739: regex#string-substitute" #f #f) #("db.scm:3738: base64#base64-decode" #f #f) #("db.scm:3737: z3#z3:decode-buffer" #f #f) #("db.scm:3736: with-input-from-string" #f #f) #("db.scm:3741: s11n#deserialize" #f #f) #("api.scm:374: api:execute-requests" #f #f) #("api.scm:139: call-with-current-continuation" #f #f) #("api.scm:139: with-exception-handler" #f #f) #("api.scm:139: ##sys#call-with-values" #f #f) #("api.scm:158: string->symbol" #f #f) #("api.scm:160: current-milliseconds" #f #f) #("api.scm:161: dbr:dbstruct-read-only" #f #f) #("api.scm:139: k15" #f #f) #("api.scm:139: g19" #f #f) #("api.scm:142: get-call-chain" #f #f)) #("get-test-info-by-id" (1102 507299)))
-;; 6
-;; 
-;; 	Call history:
-;; 
-;; 	http-transport.scm:306: thread-terminate!	  
-;; 	http-transport.scm:307: debug:print-info	  
-;; 	common_records.scm:235: debug:debug-mode	  
-;; 	rmt.scm:259: k587	  
-;; 	rmt.scm:259: g591	  
-;; 	rmt.scm:276: http-transport:server-dat-update-last-access	  
-;; 	http-transport.scm:364: current-seconds	  
-;; 	rmt.scm:282: debug:print-info	  
-;; 	common_records.scm:235: debug:debug-mode	  
-;; 	rmt.scm:283: mutex-unlock!	  
-;; 	rmt.scm:287: extras-transport-succeded	  	<--
-;; +-----------------------------------------------------------------------------+
-;; | Exit Status    : 70  
-;;  
-
-;; bunch of small functions factored out of send-receive to make debug easier
-;;
-
-(define (extras-case-11 *default-log-port* runremote cmd params attemptnum rid)
-  ;; (mutex-unlock! *rmt-mutex*)
-  (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9")
-  ;; (mutex-lock! *rmt-mutex*)
-  (let* ((conninfo (remote-conndat runremote))
-	 (dat-in      (case (remote-transport runremote)
-		     ((http) (condition-case ;; handling here has
-					     ;; caused a lot of
-					     ;; problems. However it
-					     ;; is needed to deal with
-					     ;; attemtped
-					     ;; communication to
-					     ;; servers that have gone
-					     ;; away
-			      (http-transport:client-api-send-receive 0 conninfo cmd params)
-                              ((servermismatch)  (vector #f "Server id mismatch" ))
-			      ((commfail)(vector #f "communications fail"))
-			      ((exn)(vector #f "other fail" (print-call-chain)))))
-		     (else
-		      (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported")
-		      (exit))))
-	 (dat      (if (and (vector? dat-in) ;; ... check it is a correct size
-			    (> (vector-length dat-in) 1))
-		       dat-in
-		       (vector #f (conc "communications fail (type 2), dat-in=" dat-in))))
-	 (success  (if (vector? dat) (vector-ref dat 0) #f))
-	 (res      (if (vector? dat) (vector-ref dat 1) #f)))
-    (if (and (vector? conninfo) (< 5 (vector-length conninfo)))
-	(http-transport:server-dat-update-last-access conninfo) ;; refresh access time
-	(begin
-	  (debug:print 0 *default-log-port* "INFO: Should not get here! conninfo=" conninfo)
-	  (set! conninfo #f)
-	  (remote-conndat-set! *runremote* #f) ;; NOTE: *runremote* is global copy of runremote. Purpose: factor out global.
-	  (http-transport:close-connections  area-dat: runremote)))
-    (debug:print-info 13 *default-log-port* "rmt:send-receive, case  9. conninfo=" conninfo " dat=" dat " runremote = " runremote)
-    (mutex-unlock! *rmt-mutex*)
-    (if success ;; success only tells us that the transport was
-	;; successful, have to examine the data to see if
-	;; there was a detected issue at the other end
-	(extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd)
-	(begin
-           (debug:print-error 0 *default-log-port* " dat=" dat) 
-           (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params))
-	)))
-
+	 (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*))
+	 (testsuite     (common:get-testsuite-name))
+	 (mtexe         (common:find-local-megatest)))
+
+    (case (rmt:transport-mode)
+      ((http)(http-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode))
+      ((tcp) (tcp-transport-handler  runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe))
+      ((nfs) (nfs-transport-handler  runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe))
+      )))
+
+(define (nfs-transport-handler  runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
+  (let* ((keys     (common:get-fields *configdat*))
+	 (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath)))
+    (api:dispatch-request dbstruct cmd run-id params)))
+	 
+(define (tcp-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
+  (if (not runremote)
+      (let* ((newremote  (make-and-init-remote areapath)))
+	(set! *runremote* newremote)
+	(set! runremote newremote)))
+  (let* ((dbfname (conc (dbfile:run-id->dbnum run-id)".db"))) ;;(dbfile:run-id->path areapath run-id)))
+    (tt:handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))
+	
 (define (rmt:print-db-stats)
   (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f"
     (debug:print 18 *default-log-port* "DB Stats\n========")
     (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg"))
     (for-each (lambda (cmd)
@@ -392,11 +168,11 @@
     res))
 
 (define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5))
   (let* ((qry-is-write    (not (member cmd api:read-only-queries)))
 	 (db-file-path    (db:dbfile-path)) ;;  0))
-	 (dbstructs-local (db:setup #t))  ;; make-dbr:dbstruct path:  dbdir local: #t)))
+	 (dbstructs-local (db:setup #t))
 	 (read-only       (not (file-write-access? db-file-path)))
 	 (start           (current-milliseconds))
 	 (resdat          (if (not (and read-only qry-is-write))
 			      (let ((v (api:execute-requests dbstructs-local (vector (symbol->string cmd) params))))
 			;;	(handle-exceptions ;; there has been a long history of receiving strange errors from values returned by the client when things go wrong..
@@ -432,22 +208,10 @@
 		(mutex-lock! *db-multi-sync-mutex*)
 /		(set! *db-last-access* start-time)  ;; THIS IS PROBABLY USELESS? (we are on a client)
                 (mutex-unlock! *db-multi-sync-mutex*)))))
     res))
 
-(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)
-  (let* ((run-id   (if run-id run-id 0))
-	 (res  	   ;; (handle-exceptions
-		   ;;     exn
-		   ;;   (begin
-		   ;;     (print "transport failed. exn=" exn)
-		   ;;     #f)
-		     (http-transport:client-api-send-receive run-id connection-info cmd params))) ;; )
-    (if (and res (vector-ref res 0))
-	(vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!!
-	#f)))
-
 ;;======================================================================
 ;;
 ;; A C T U A L   A P I   C A L L S  
 ;;
 ;;======================================================================
@@ -470,21 +234,12 @@
   (rmt:send-receive 'login run-id (list *toppath* megatest-version (client:get-signature))))
 
 ;; This login does no retries under the hood - it acts a bit like a ping.
 ;; Deprecated for nmsg-transport.
 ;;
-(define (rmt:login-no-auto-client-setup connection-info)
-  (case *transport-type* ;; run-id of 0 is just a placeholder
-    ((http)(rmt:send-receive-no-auto-client-setup connection-info 'login 0 (list *toppath* megatest-version (client:get-signature))))
-    ;;((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info 'login (list *toppath* megatest-version run-id *my-client-signature*)))
-    ))
-
-;; hand off a call to one of the db:queries statements
-;; added run-id to make looking up the correct db possible 
-;;
-(define (rmt:general-call stmtname run-id . params)
-  (rmt:send-receive 'general-call run-id (append (list stmtname run-id) params)))
+;; (define (rmt:login-no-auto-client-setup runremote)
+;;   (rmt:send-receive-no-auto-client-setup runremote 'login 0 (list *toppath* megatest-version (client:get-signature))))
 
 
 ;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host
 (define (rmt:get-latest-host-load hostname)
   (rmt:send-receive 'get-latest-host-load 0 (list hostname)))
@@ -519,13 +274,14 @@
 ;;======================================================================
 ;;  K E Y S 
 ;;======================================================================
 
 ;; These require run-id because the values come from the run!
+;; however the query must still apply to main.db
 ;;
 (define (rmt:get-key-val-pairs run-id)
-  (rmt:send-receive 'get-key-val-pairs run-id (list run-id)))
+  (rmt:send-receive 'get-key-val-pairs #f (list run-id)))
 
 (define (rmt:get-keys)
   (if *db-keys* *db-keys* 
      (let ((res (rmt:send-receive 'get-keys #f '())))
        (set! *db-keys* res)
@@ -548,81 +304,20 @@
 (define (rmt:get-targets)
   (rmt:send-receive 'get-targets #f '()))
 
 (define (rmt:get-target run-id)
   (assert (number? run-id) "FATAL: Run id required.")
-  (rmt:send-receive 'get-target run-id (list run-id)))
+  (rmt:send-receive 'get-target #f (list run-id)))
 
 (define (rmt:get-run-times runpatt targetpatt)
   (rmt:send-receive 'get-run-times #f (list runpatt targetpatt ))) 
 
 
 ;;======================================================================
 ;;  T E S T S
 ;;======================================================================
 
-;; Just some syntatic sugar
-(define (rmt:register-test run-id test-name item-path)
-  (assert (number? run-id) "FATAL: Run id required.")
-  (rmt:general-call 'register-test run-id run-id test-name item-path))
-
-(define (rmt:get-test-id run-id testname item-path)
-  (assert (number? run-id) "FATAL: Run id required.")
-  (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))
-
-;; run-id is NOT used
-;;
-(define (rmt:get-test-info-by-id run-id test-id)
-  (if (number? test-id)
-      (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
-      (begin
-	(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
-	(print-call-chain (current-error-port))
-	#f)))
-
-(define (rmt:test-get-rundir-from-test-id run-id test-id)
-  (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))
-
-(define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f))
-  (assert (number? run-id) "FATAL: Run id required.")
-  (let* ((test-path (if (string? work-area)
-			work-area
-			(rmt:test-get-rundir-from-test-id run-id test-id))))
-    (debug:print 3 *default-log-port* "TEST PATH: " test-path)
-    (open-test-db test-path)))
-
-;; WARNING: This currently bypasses the transaction wrapped writes system
-(define (rmt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment)
-  (assert (number? run-id) "FATAL: Run id required.")
-  (rmt:send-receive 'test-set-state-status-by-id run-id (list run-id test-id newstate newstatus newcomment)))
-
-(define (rmt:set-tests-state-status run-id testnames currstate currstatus newstate newstatus)
-  (assert (number? run-id) "FATAL: Run id required.")
-  (rmt:send-receive 'set-tests-state-status run-id (list run-id testnames currstate currstatus newstate newstatus)))
-
-(define (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)
-  (assert (number? run-id) "FATAL: Run id required.")
-  ;; (if (number? run-id)
-  (rmt:send-receive 'get-tests-for-run run-id (list run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)))
-  ;;    (begin
-  ;;	(debug:print-error 0 *default-log-port* "rmt:get-tests-for-run called with bad run-id=" run-id)
-  ;;	(print-call-chain (current-error-port))
-  ;;	'())))
-
-(define (rmt:get-tests-for-run-state-status run-id testpatt last-update)
-  (assert (number? run-id) "FATAL: Run id required.")
-  (rmt:send-receive 'get-tests-for-run-state-status run-id (list run-id testpatt last-update)))
-
-;; get stuff via synchash 
-(define (rmt:synchash-get run-id proc synckey keynum params)
-  (assert (number? run-id) "FATAL: Run id required.")
-  (rmt:send-receive 'synchash-get run-id (list run-id proc synckey keynum params)))
-
-(define (rmt:get-tests-for-run-mindata run-id testpatt states status not-in)
-  (assert (number? run-id) "FATAL: Run id required.")
-  (rmt:send-receive 'get-tests-for-run-mindata run-id (list run-id testpatt states status not-in)))
-  
 ;; IDEA: Threadify these - they spend a lot of time waiting ...
 ;;
 (define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in)
   (let ((multi-run-mutex (make-mutex))
 	(run-id-list (if run-ids
@@ -744,16 +439,10 @@
 
 (define (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)
   (assert (number? run-id) "FATAL: Run id required.")
   (rmt:send-receive 'get-count-tests-running-in-jobgroup run-id (list run-id jobgroup)))
 
-;; state and status are extra hints not usually used in the calculation
-;;
-(define (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status comment)
-  (assert (number? run-id) "FATAL: Run id required.")
-  (rmt:send-receive 'set-state-status-and-roll-up-items run-id (list run-id test-name item-path state status comment)))
-
 (define (rmt:set-state-status-and-roll-up-run run-id state status)
   (assert (number? run-id) "FATAL: Run id required.")
   (rmt:send-receive 'set-state-status-and-roll-up-run run-id (list run-id state status)))
 
 
@@ -829,10 +518,13 @@
 
 (define (rmt:get-run-state run-id)
   (assert (number? run-id) "FATAL: Run id required.")
   (rmt:send-receive 'get-run-state #f (list run-id)))
 
+(define (rmt:get-run-state-status run-id)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-run-state-status #f (list run-id)))
 
 (define (rmt:set-run-status run-id run-status #!key (msg #f))
   (assert (number? run-id) "FATAL: Run id required.")
   (rmt:send-receive 'set-run-status #f (list run-id run-status msg)))
 
@@ -1044,69 +736,42 @@
   (rmt:send-receive 'test-set-archive-block-id run-id (list run-id test-id archive-block-id)))
 
 (define (rmt:test-get-archive-block-info archive-block-id)
   (rmt:send-receive 'test-get-archive-block-info #f (list archive-block-id)))
 
-
-(define (rmtmod:calc-ro-mode runremote *toppath*)
-  (if (and runremote
-	   (remote-ro-mode-checked runremote))
-      (remote-ro-mode runremote)
-      (let* ((mtcfgfile  (conc *toppath* "/megatest.config"))
-	     (ro-mode (not (file-write-access? mtcfgfile)))) ;; TODO: use dbstruct or runremote to figure this out in future
-	(if runremote
-	    (begin
-	      (remote-ro-mode-set! runremote ro-mode)
-	      (remote-ro-mode-checked-set! runremote #t)
-	      ro-mode)
-	    ro-mode))))
-
-(define (extras-readonly-mode rmt-mutex log-port cmd params)
-  (mutex-unlock! rmt-mutex)
-  (debug:print-info 12 log-port "rmt:send-receive, case 3")
-  (debug:print 0 log-port "WARNING: write transaction requested on a readonly area.  cmd="cmd" params="params)
-  #f)
-
-(define (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params)
-  (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
-  (mutex-lock! *rmt-mutex*)
-  (remote-conndat-set!    runremote #f)
-  (http-transport:close-connections area-dat: runremote)
-  (remote-server-url-set! runremote #f)
-  (mutex-unlock! *rmt-mutex*)
-  (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9.1")
-  (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
-  
-(define (extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd)
-  (if (and (vector? res)
-	   (eq? (vector-length res) 2)
-	   (eq? (vector-ref res 1) 'overloaded)) ;; since we are
-						 ;; looking at the
-						 ;; data to carry the
-						 ;; error we'll use a
-						 ;; fairly obtuse
-						 ;; combo to minimise
-						 ;; the chances of
-						 ;; some sort of
-						 ;; collision.  this
-						 ;; is the case where
-						 ;; the returned data
-						 ;; is bad or the
-						 ;; server is
-						 ;; overloaded and we
-						 ;; want to ease off
-						 ;; the queries
-      (let ((wait-delay (+ attemptnum (* attemptnum 10))))
-	(debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.")
-	(mutex-lock! *rmt-mutex*)
-	(http-transport:close-connections area-dat: runremote)
-	(set! *runremote* #f) ;; force starting over
-	(mutex-unlock! *rmt-mutex*)
-	(thread-sleep! wait-delay)
-	(rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
-      res)) ;; All good, return res
-
-#;(set-functions rmt:send-receive                       remote-server-url-set!
-	       http-transport:close-connections	      remote-conndat-set!
-	       debug:print                            debug:print-info
-	       remote-ro-mode                         remote-ro-mode-set!
-	       remote-ro-mode-checked-set!            remote-ro-mode-checked)
+(define (rmtmod:calc-ro-mode runremote *toppath*)
+  (case (rmt:transport-mode)
+    ((http)
+     (if (and runremote
+	      (remote-ro-mode-checked runremote))
+	 (remote-ro-mode runremote)
+	 (let* ((mtcfgfile  (conc *toppath* "/megatest.config"))
+		(ro-mode (not (file-write-access? mtcfgfile)))) ;; TODO: use dbstruct or runremote to figure this out in future
+	   (if runremote
+	       (begin
+		 (remote-ro-mode-set! runremote ro-mode)
+		 (remote-ro-mode-checked-set! runremote #t)
+		 ro-mode)
+	       ro-mode))))
+    ((tcp)
+     (if (and runremote
+	      (tt-ro-mode-checked runremote))
+	 (tt-ro-mode runremote)
+	 (let* ((mtcfgfile  (conc *toppath* "/megatest.config"))
+		(ro-mode (not (file-write-access? mtcfgfile)))) ;; TODO: use dbstruct or runremote to figure this out in future
+	   (if runremote
+	       (begin
+		 (tt-ro-mode-set! runremote ro-mode)
+		 (tt-ro-mode-checked-set! runremote #t)
+		 ro-mode)
+	       ro-mode))))))
+
+;;======================================================================
+;; Maintenance
+;;======================================================================
+
+(define (rmt:find-and-mark-incomplete run-id ovr-deadtime)
+  (let* ((cfg-deadtime             (configf:lookup-number *configdat* "setup" "deadtime"))
+	 (test-stats-update-period (configf:lookup-number *configdat* "setup" "test-stats-update-period")))
+   (rmt:find-and-mark-incomplete-engine run-id ovr-deadtime cfg-deadtime test-stats-update-period)
+   ;;call end of eud of run detection for posthook
+   (launch:end-of-run-check run-id)))

Index: rmtmod.scm
==================================================================
--- rmtmod.scm
+++ rmtmod.scm
@@ -17,69 +17,269 @@
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;======================================================================
 
 (declare (unit rmtmod))
+(declare (uses debugprint))
 (declare (uses commonmod))
-(declare (uses apimod))
+(declare (uses dbfile))    ;; needed for records
+
+;; (declare (uses apimod))
 ;; (declare (uses apimod.import))
-(declare (uses ulex))
+;; (declare (uses ulex))
 
 ;; (include "ulex/ulex.scm")
 
 (module rmtmod
 	*
 	
-(import scheme chicken data-structures extras)
+(import scheme chicken data-structures extras matchable srfi-69)
 (import (prefix sqlite3 sqlite3:) posix typed-records srfi-18)
-(import (prefix commonmod cmod:))
-(import apimod)
-(import (prefix ulex ulex:))
+(import commonmod dbfile debugprint) ;; (prefix commonmod cmod:))
+;; (import apimod)
+;; (import (prefix ulex ulex:))
+
+(include "db_records.scm")
 
 (defstruct alldat
   (areapath #f)
   (ulexdat  #f)
   )
 
-;;======================================================================
-;; return the handle struct for sending queries to a specific database
-;;  - initializes the connection object if this is the first access
-;;    - finds the "captain" and asks who to talk to for the given dbfname
-;;    - establishes the connection to the current dbowner
-;;
-#;(define (rmt:connect alldat dbfname dbtype)
-  (let* ((ulexdat    (or (alldat-ulexdat alldat)
-			 (rmt:setup-ulex alldat))))
-    (ulex:connect ulexdat dbfname dbtype)))
-
-;; setup the remote calls
-#;(define (rmt:setup-ulex alldat)
-  (let* ((udata (ulex:setup))) ;; establish connection to ulex
-    (alldat-ulexdat-set! alldat udata)
-    ;; register all needed procs
-    (ulex:register-handler udata 'ping cmod:get-full-version)  ;; override ping with get-full-version
-    (ulex:register-handler udata 'login cmod:get-full-version) ;; force setup of the connection
-    (ulex:register-handler udata 'execute api:execute-requests)
-    udata))
-
-;; set up a connection to the current owner of the dbfile associated with rid
-;; then send the query to that dbfile owner and wait for a response.
-;;
-#;(define (rmt:send-receive alldat cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected
-  (let* (;; (alldat   *alldat*)
-	 (areapath (alldat-areapath alldat))
-	 (dbtype   (if (or (not rid)(< rid 1)) ;; this is the criteria for "main.db"
-		       "main" "runs"))
-	 (dbfname  (if (equal? dbtype "main")
-		       "main.db"
-		       (conc rid ".db")))
-	 (dbfile   (conc areapath "/.db/" dbfname))
-	 (ulexconn (rmt:connect alldat dbfname dbtype))  ;; ulexconn is our new *runremote*, it is a dbowner struct < pdat lastrefresh >
-	 (udata    (alldat-ulexdat alldat)))
-    	(ulex:remote-request udata ulexconn 'immediate dbfile 'execute rid params)))
-    ;; need to call this on the other side 
-    ;; (api:execute-requests dbstruct-local (vector (symbol->string cmd) params))))
-    
-    #;(with-input-from-string
-	(ulex:remote-request udata ulexconn 'immediate dbfile 'execute rid (with-output-to-string (lambda ()(serialize params))))
-       (lambda ()(deserialize)))
+;; hold the send-receive proc in this parameter
+(define rmtmod:send-receive #f) ;; (make-parameter #f))
+
+;;======================================================================
+;; M I S C
+;;======================================================================
+
+;; hand off a call to one of the db:queries statements
+;; added run-id to make looking up the correct db possible 
+;;
+(define (rmt:general-call stmtname run-id . params)
+  (rmtmod:send-receive 'general-call run-id (append (list stmtname run-id) params)))
+
+
+
+;;======================================================================
+;; import an sexpr file into the db
+;;======================================================================
+
+(define (rmt:import-sexpr sexpr-file)
+  (if (file-exists? sexpr-file)
+      (let* ((data (with-input-from-file sexpr-file read)))
+	(for-each
+	 (lambda (targ-dat)
+	   (rmt:import-target targ-dat)) ;; ("target" ("run1" ("data" (1 ("field" . "value") ...
+	 data))
+      (let* ((msg (conc "ERROR: file "sexpr-file" not found")))
+	(debug:print 0 *default-log-port* msg)
+	(cons #f msg))))
+
+(define (rmt:import-target targ-dat)
+  (let* ((target (car targ-dat))
+	 (data   (cdr targ-dat)))
+    (for-each
+     (lambda (run-dat)
+       (rmt:import-run target run-dat)) ;; ("runname" ("data" ("testid" ("field" . "value") ...
+     data)))
+
+(define (rmt:import-run target run-dat)
+  (let* ((runname    (car run-dat))
+	 (all-dat    (cdr run-dat))
+	 (tests-data (alist-ref "data" all-dat equal?))
+	 (run-meta   (alist-ref "meta" all-dat equal?))
+	 (run-id     (rmt:insert-run target runname run-meta)))
+    (for-each
+     (lambda (test-dat)
+       (let* ((test-id  (car test-dat))
+	      (test-rec (cdr test-dat)))
+	 (rmt:insert-test run-id test-rec)))
+     tests-data)))
+
+;; insert run if not there, return id either way
+(define (rmt:insert-run target runname run-meta)
+  ;; look for id, return if found
+  (debug:print 0 *default-log-port* "Insert run: "target"/"runname)
+  (let* ((runs (rmtmod:send-receive 'simple-get-runs #f
+				    ;;    runpatt count offset target last-update)
+				    (list runname #f    #f     target #f))))
+    (if (null? runs)
+	(rmtmod:send-receive 'insert-run #f (list target runname run-meta))
+	(simple-run-id (car runs)))))
+
+(define (rmt:insert-test run-id test-rec)
+  (let* ((testname  (alist-ref "testname" test-rec equal?))
+	 (item-path (alist-ref "item_path" test-rec equal?)))
+    (debug:print 0 *default-log-port* "   Insert test in run "run-id": "testname"/"item-path)
+    (rmtmod:send-receive 'insert-test run-id test-rec)))
+
+;;======================================================================
+;;  T E S T S
+;;======================================================================
+
+;; Just some syntatic sugar
+(define (rmt:register-test run-id test-name item-path)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:general-call 'register-test run-id run-id test-name item-path))
+
+(define (rmt:get-test-id run-id testname item-path)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmtmod:send-receive 'get-test-id run-id (list run-id testname item-path)))
+
+(define (rmt:get-test-info-by-id run-id test-id)
+  (if (number? test-id)
+      (rmtmod:send-receive 'get-test-info-by-id run-id (list run-id test-id))
+      (begin
+	(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
+	(print-call-chain (current-error-port))
+	#f)))
+
+(define (rmt:get-test-state-status-by-id run-id test-id)
+  (rmtmod:send-receive 'get-test-state-status-by-id run-id (list run-id test-id)))
+
+(define (rmt:test-get-rundir-from-test-id run-id test-id)
+  (rmtmod:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))
+
+;; (define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f))
+;;   (assert (number? run-id) "FATAL: Run id required.")
+;;   (let* ((test-path (if (string? work-area)
+;; 			work-area
+;; 			(rmt:test-get-rundir-from-test-id run-id test-id))))
+;;     (debug:print 3 *default-log-port* "TEST PATH: " test-path)
+;;     (open-test-db test-path)))
+
+;; WARNING: This currently bypasses the transaction wrapped writes system
+(define (rmt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmtmod:send-receive 'test-set-state-status-by-id run-id (list run-id test-id newstate newstatus newcomment)))
+
+(define (rmt:set-tests-state-status run-id testnames currstate currstatus newstate newstatus)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmtmod:send-receive 'set-tests-state-status run-id (list run-id testnames currstate currstatus newstate newstatus)))
+
+(define (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)
+  (assert (number? run-id) "FATAL: Run id required.")
+  ;; (if (number? run-id)
+  (rmtmod:send-receive 'get-tests-for-run run-id (list run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)))
+  ;;    (begin
+  ;;	(debug:print-error 0 *default-log-port* "rmt:get-tests-for-run called with bad run-id=" run-id)
+  ;;	(print-call-chain (current-error-port))
+  ;;	'())))
+
+(define (rmt:get-tests-for-run-state-status run-id testpatt last-update)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmtmod:send-receive 'get-tests-for-run-state-status run-id (list run-id testpatt last-update)))
+
+;; get stuff via synchash 
+(define (rmt:synchash-get run-id proc synckey keynum params)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmtmod:send-receive 'synchash-get run-id (list run-id proc synckey keynum params)))
+
+(define (rmt:get-tests-for-run-mindata run-id testpatt states status not-in)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmtmod:send-receive 'get-tests-for-run-mindata run-id (list run-id testpatt states status not-in)))
+  
+;; state and status are extra hints not usually used in the calculation
+;;
+(define (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status comment)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmtmod:send-receive 'set-state-status-and-roll-up-items run-id (list run-id test-name item-path state status comment)))
+
+
+;;======================================================================
+;; Maintenance
+;;======================================================================
+
+
+(define (rmt:get-toplevels-and-incompletes run-id running-deadtime remotehoststart-deadtime)
+  (rmtmod:send-receive 'get-toplevels-and-incompletes run-id (list run-id running-deadtime remotehoststart-deadtime)))
+
+(define (rmt:get-status-from-final-status-file run-dir)
+  (let ((infile (conc run-dir "/.final-status")))
+    ;; first verify we are able to write the output file
+    (if (not (file-read-access? infile))
+        (begin 
+	  (debug:print 2 *default-log-port* "ERROR: cannot read " infile)
+          (debug:print 2 *default-log-port* "ERROR: run-dir is " run-dir)
+          #f
+          )
+        (with-input-from-file infile read-lines)
+	)))
+  
+;;  select end_time-now from
+;;      (select testname,item_path,event_time+run_duration as
+;;                          end_time,strftime('%s','now') as now from tests where state in
+;;      ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));
+;;
+;; NOT EASY TO MIGRATE TO db{file,mod}
+;;
+(define (rmt:find-and-mark-incomplete-engine run-id ovr-deadtime cfg-deadtime test-stats-update-period)
+  (let* ((incompleted '())
+	 (oldlaunched '())
+	 (toplevels   '())
+          ;; The default running-deadtime is 720 seconds = 12 minutes.
+          ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30))
+         (deadtime-trim (or ovr-deadtime cfg-deadtime))
+         (server-start-allowance 200)
+         (server-overloaded-budget 200)
+         (launch-monitor-off-time (or test-stats-update-period 30))
+         (launch-monitor-on-time-budget 30)
+         (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget))
+         (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30))
+         (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default))
+         (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))
+         (running-deadtime (or deadtime-trim running-deadtime-default))) ;; two minutes (30 seconds between updates, this leaves 3x grace period)
+
+    (debug:print-info 4  *default-log-port* "running-deadtime = " running-deadtime)
+    (debug:print-info 4  *default-log-port* "deadtime-trim = " deadtime-trim)
+
+    (let* ((dat (rmt:get-toplevels-and-incompletes run-id running-deadtime remotehoststart-deadtime)))
+      (set! oldlaunched (list-ref dat 1))
+      (set! toplevels   (list-ref dat 2))
+      (set! incompleted (list-ref dat 0)))
+
+    (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, "
+		      (length toplevels) " old LAUNCHED toplevel tests and "
+		      (length incompleted) " tests marked RUNNING but apparently dead.")
+  
+    ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
+    ;;
+    ;; (db:delay-if-busy dbdat)
+    (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all
+	   (all-ids             (append min-incompleted-ids (map car oldlaunched))))
+      (if (> (length all-ids) 0)
+	  (begin
+	    ;; (launch:is-test-alive "localhost" 435)
+	    (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ")
+			 " as DEAD")
+	    (for-each
+             (lambda (test-id)
+               (let* ((tinfo   (rmt:get-test-info-by-id run-id test-id))
+		      (run-dir (db:test-get-rundir     tinfo))
+		      (host    (db:test-get-host       tinfo))
+		      (pid     (db:test-get-process_id tinfo))
+		      (result (rmt:get-status-from-final-status-file run-dir)))
+		 (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) 
+		     (begin
+		       (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD")
+		       (rmt:set-state-status-and-roll-up-items
+			run-id test-id 'foo "COMPLETED" "PASS"
+			"Test stopped responding but it has PASSED; marking it PASS in the DB."))
+		     (let ((is-alive (and (not (eq? pid 0))  ;; 0 is default in re-used field "attemptnum" where pid stored.
+					  (commonmod:is-test-alive host pid))))
+		       (if is-alive
+			   (debug:print 0 *default-log-port* "INFO: test " test-id " on host " host
+					" has a process on pid " pid ", NOT setting to DEAD.")
+			   (begin
+			     (debug:print 0 *default-log-port* "INFO: test " test-id
+					  " final state/status is not COMPLETED/PASS. It is " result)
+			     (rmt:set-state-status-and-roll-up-items
+			      run-id test-id 'foo "COMPLETED" "DEAD"
+			      "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
+	     ;; call end of eud of run detection for posthook - from merge, is it needed?
+	     ;; (launch:end-of-run-check run-id)
+	     all-ids)
+	    )))))
+
+
 )

Index: runconfig.scm
==================================================================
--- runconfig.scm
+++ runconfig.scm
@@ -22,10 +22,15 @@
 
 (use format directory-utils)
 
 (declare (unit runconfig))
 (declare (uses common))
+(declare (uses debugprint))
+(declare (uses commonmod))
+
+(import commonmod
+	debugprint)
 
 (include "common_records.scm")
 
 (define (runconfig:read fname target environ-patt)
   (let ((ht (make-hash-table)))

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -15,33 +15,45 @@
 ;;     You should have received a copy of the GNU General Public License
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;  strftime('%m/%d/%Y %H:%M:%S','now','localtime')
 
-(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) 
-     posix-extras directory-utils pathname-expand typed-records format  sxml-serializer
-     sxml-modifications matchable)
-
 (declare (unit runs))
 (declare (uses db))
 (declare (uses common))
+(declare (uses debugprint))
+(declare (uses commonmod))
 (declare (uses items))
 (declare (uses runconfig))
 (declare (uses tests))
 (declare (uses server))
 (declare (uses mt))
 (declare (uses archive))
-;; (declare (uses filedb))
+(declare (uses mtargs))
+(declare (uses rmtmod))
+(declare (uses dbfile))
+
+(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) 
+     posix-extras directory-utils pathname-expand typed-records format  sxml-serializer
+     sxml-modifications matchable)
+
+
 
 (include "common_records.scm")
 (include "key_records.scm")
 (include "db_records.scm")
 (include "run_records.scm")
 (include "test_records.scm")
 
 ;; (include "debugger.scm")
 
+(import commonmod
+	debugprint
+	rmtmod
+	dbfile
+	(prefix mtargs args:))
+
 ;; use this struct to facilitate refactoring
 ;;
 
 (defstruct runs:dat
   reglen regfull
@@ -801,48 +813,29 @@
     (debug:print-info 4 *default-log-port* "test-records=" (hash-table->alist test-records))
     (let ((reglen (configf:lookup *configdat* "setup" "runqueue")))
       (if (> (length (hash-table-keys test-records)) 0)
 	  (let* ((keep-going        #t)
 		 (run-queue-retries 5)
-		;; (th1        (make-thread (lambda ()
-		;; 			    (handle-exceptions
-		;; 				exn
-		;; 				(begin
-		;; 				  (print-call-chain)
-		;; 				  (print " message: " ((condition-property-accessor 'exn 'message) exn)))
-		;; 			      (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
-		;; 						    (any->number reglen) all-tests-registry)))
-		;; 			  "runs:run-tests-queue"))
-		 (th2        (make-thread (lambda ()			 ;; BBQ: why are we visiting ALL runs here?	    
-					    ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ...
-					    (let ((run-ids (rmt:get-all-run-ids)))
-					      (for-each (lambda (run-id)
-							  (if keep-going
-							      (handle-exceptions
-							       exn
-							       (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)
-							       (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
-							run-ids)))
-					  "runs: mark-incompletes")))
-	    ;; (thread-start! th1)
-	    (thread-start! th2)
-	    ;; (thread-join! th1)
-	    ;; just do the main stuff in the main thread
+		 (run-ids (rmt:get-all-run-ids)))
+	    #;(for-each (lambda (run-id)
+			(if keep-going
+			    (handle-exceptions
+				exn
+			      (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)
+			      (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
+		      run-ids)
 	    (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests
-								    (any->number reglen) all-tests-registry)
+				  (any->number reglen) all-tests-registry)
 	    (set! keep-going #f)
-	    (thread-join! th2)
-	    ;; if run-count > 0 call, set -preclean and -rerun STUCK/DEAD
 	    (if (> run-count 0) ;; handle reruns
 		(begin
 		  (if (not (hash-table-ref/default flags "-preclean" #f))
 		      (hash-table-set! flags "-preclean" #t))
 		  (if (not (hash-table-ref/default flags "-rerun" #f))
 		      (hash-table-set! flags "-rerun" "ABORT,STUCK/DEAD,n/a,ZERO_ITEMS"))
-		  ;; recursive call to self
       (runs:run-tests target runname test-patts user flags run-count: (- run-count 1)))
-                  (launch:end-of-run-check run-id)))
+                (launch:end-of-run-check run-id)))
 	  (debug:print-info 0 *default-log-port* "No tests to run")))
     (debug:print-info 4 *default-log-port* "All done by here")
     ;; TODO: try putting post hook call here
       
     ;  (debug:print-info 2 *default-log-port* " run-count " run-count)
@@ -1279,15 +1272,23 @@
       (list hed tal reg reruns))
      
      ;; If no resources are available just kill time and loop again
      ;;
      ((not have-resources) ;; simply try again after waiting a second
-      (if (runs:lownoise "no resources" 60)
+      (if (runs:lownoise "no resources" 600)
 	  (debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ..."))
+
       ;; Have gone back and forth on this but db starvation is an issue.
       ;; wait one second before looking again to run jobs.
-      (thread-sleep! 0.25)
+      ;; (thread-sleep! 0.25)
+      
+      ;; new logic.
+      ;; If it has been more than 10 seconds since we were last here don't wait at all
+      ;; otherwise sleep 2 seconds to give db a rest and let dashboard read data
+      (if (runs:lownoise "frequent-no-resources" 10)
+	  (thread-sleep! 0.25) ;; no significant delay
+	  (thread-sleep! 2))
       ;; could have done hed tal here but doing car/cdr of newtal to rotate tests
       (list (car newtal)(cdr newtal) reg reruns))
      
      ;; This is the final stage, everything is in place so launch the test
      ;;
@@ -1544,10 +1545,11 @@
          (max-retries           (configf:lookup *configdat* "setup" "maxretries"))
          (max-concurrent-jobs   (configf:lookup-number *configdat* "setup" "max_concurrent_jobs" default: 50))
          (reglen                (if (number? reglen-in) reglen-in 1))
          (last-time-incomplete  (- (current-seconds) 900)) ;; force at least one clean up cycle
          (last-time-some-running (current-seconds))
+	 (incoming-tests        '()) ;; queue up incoming tests here to tack on to tal when it gets low
          ;; (tdbdat                (tasks:open-db))
          (runsdat (make-runs:dat
                    ;; hed: hed
                    ;; tal: tal
                    ;; reg: reg
@@ -1775,11 +1777,11 @@
 			(last-jobs-check-time  (runs:dat-last-jobs-check-time runsdat))
 			(should-check-jobs     (match can-run-more-tests
 						 ((can-run-more-flag num-running nr-in-jobgroup max-concurrent-jobs . params)
 						  (if (< (- max-concurrent-jobs num-running) 25)
 						      (begin
-							(debug:print-info 0 *default-log-port*
+							(debug:print-info 2 *default-log-port*
 									  "less than 20 jobs headroom, ("max-concurrent-jobs
 									  "-"num-running")>20. Forcing prelaunch check.")
 							#t)
 						      #f))
 						 (else #f)))) ;; no record yet
@@ -1855,14 +1857,22 @@
                                    (newtestname (db:test-make-full-name hed my-item-path)))    ;; test names are unique on testname/item-path
                               (tests:testqueue-set-items!     new-test-record #f)
                               (tests:testqueue-set-itemdat!   new-test-record my-itemdat)
                               (tests:testqueue-set-item_path! new-test-record my-item-path)
                               (hash-table-set! test-records newtestname new-test-record)
-                              (set! tal (append tal (list newtestname)))))  ;; since these are itemized create new test names testname/itempath
+			      ;; BUG: This next line sucks up a lot of horsepower
+			      ;; (set! tal (append tal (list newtestname)))
+			      ;; (set! tal (cons newtestname tal)) ;; 4/6/2023 - try using cons, does it matter if the test gets added at the beginning?
+			      (set! incoming-tests (cons newtestname incoming-tests))
+			      ))  ;; since these are itemized create new test names testname/itempath
                           items-in-testpatt)))
           
-          
+	  (if (and (< (length tal) 20)
+		   (not (null? incoming-tests)))
+	      (begin
+		(set! tal (append tal (reverse incoming-tests)))
+		(set! incoming-tests '())))
 
 	  ;; At this point we have possibly added items to tal but all must be handed off to 
 	  ;; INNER COND logic. I think loop without rotating the queue 
 	  ;; (loop hed tal reg reruns))
 	  ;; (let ((newtal (append tal (list hed))))  ;; We should discard hed as it has been expanded into it's items? Yes, but only if this *is* an itemized test
@@ -2380,11 +2390,11 @@
 	 (bup-mutex    (make-mutex))
          (keep-records (args:get-arg "-keep-records")) ;; used in conjunction with -remove-runs to keep the records, TODO: consolidate this with "mode".
 	 (test-records '())) ;; for tasks that we wish to operate on all tests in one fell swoop
 
     (let* ((write-access-actions '(remove-runs set-state-status archive run-wait kill-runs))
-           (dbfile             (conc  *toppath* "/.megatest/main.db"))
+           (dbfile             (conc  *toppath* "/.mtdb/main.db"))
            (readonly-mode      (not (file-write-access? dbfile))))
       (when (and readonly-mode
                  (member action write-access-actions))
         (debug:print-error 0 *default-log-port* dbfile " is readonly.  Cannot proceed with action ["action"] in which write-access isrequired .")
         (exit 1)))

Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -14,31 +14,31 @@
 ;; 
 ;;     You should have received a copy of the GNU General Public License
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 ;;
 
-(require-extension (srfi 18) extras tcp s11n)
-
-(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest
-     directory-utils posix-extras matchable utils)
-
-(use spiffy uri-common intarweb http-client spiffy-request-vars)
-
 (declare (unit server))
 
 (declare (uses commonmod))
-
+(declare (uses debugprint))
 (declare (uses common))
 (declare (uses db))
 (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running.
 ;; (declare (uses synchash))
-(declare (uses http-transport))
 ;;(declare (uses rpc-transport))
 (declare (uses launch))
 ;; (declare (uses daemon))
+(declare (uses mtargs))
+
+(use (srfi 18) extras s11n)
+(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest)
+(use directory-utils posix-extras matchable utils)
+(use spiffy uri-common intarweb http-client spiffy-request-vars)
 
-(import commonmod)
+(import commonmod
+	debugprint
+	(prefix mtargs args:))
 
 (include "common_records.scm")
 (include "db_records.scm")
 
 (define (server:make-server-url hostport)
@@ -65,21 +65,10 @@
 ;;======================================================================
 
 ;; Call this to start the actual server
 ;;
 
-;; all routes though here end in exit ...
-;;
-;; start_server
-;;
-(define (server:launch run-id transport-type)
-  (case transport-type
-    ((http)(http-transport:launch))
-    ;;((nmsg)(nmsg-transport:launch run-id))
-    ;;((rpc)  (rpc-transport:launch run-id))
-    (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type))))
-
 ;;======================================================================
 ;; S E R V E R   U T I L I T I E S 
 ;;======================================================================
 
 ;; Get the transport
@@ -112,39 +101,32 @@
   (if *server-id* *server-id*
       (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic
         (set! *server-id* sig)
         *server-id*)))
 
-;; When using zmq this would send the message back (two step process)
-;; with spiffy or rpc this simply returns the return data to be returned
-;; 
-(define (server:reply return-addr query-sig success/fail result)
-  (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result)
-  ;; (send-message pubsock target send-more: #t)
-  ;; (send-message pubsock 
-  (case (server:get-transport)
-    ((rpc)  (db:obj->string (vector success/fail query-sig result)))
-    ((http) (db:obj->string (vector success/fail query-sig result)))
-    ((fs)   result)
-    (else 
-     (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
-     result)))
+;; ;; When using zmq this would send the message back (two step process)
+;; ;; with spiffy or rpc this simply returns the return data to be returned
+;; ;; 
+;; (define (server:reply return-addr query-sig success/fail result)
+;;   (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result)
+;;   ;; (send-message pubsock target send-more: #t)
+;;   ;; (send-message pubsock 
+;;   (case (server:get-transport)
+;;     ((rpc)  (db:obj->string (vector success/fail query-sig result)))
+;;     ((http) (db:obj->string (vector success/fail query-sig result)))
+;;     ((fs)   result)
+;;     (else 
+;;      (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
+;;      result)))
 
 ;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
 ;; if the target-host is set 
 ;; try running on that host
 ;;   incidental: rotate logs in logs/ dir.
 ;;
 (define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
-  (let* (;; (curr-host   (get-host-name))
-         ;; (attempt-in-progress (server:start-attempted? areapath))
-         ;; (dot-server-url (server:check-if-running areapath))
-	 ;; (curr-ip     (server:get-best-guess-address curr-host))
-	 ;; (curr-pid    (current-process-id))
-	 ;; (homehost    (server:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
-	 ;; (target-host (car homehost))
-	 (testsuite   (common:get-testsuite-name))
+  (let* ((testsuite   (common:get-testsuite-name))
 	 (logfile     (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
 	 (profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
 			   ""))
 	 (cmdln (conc (common:get-megatest-exe)
 		      " -server - ";; (or target-host "-")
@@ -190,46 +172,48 @@
   (let ((server-rx    (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id
         (dbprep-rx    (regexp "^SERVER: dbprep"))
         (dbprep-found 0)
 	(bad-dat      (list #f #f #f #f #f)))
     (handle-exceptions
-	exn
-      (begin
-	(debug:print-info 0 *default-log-port* "Unable to get server info from " logf ", exn=" exn)
-	bad-dat) ;; no idea what went wrong, call it a bad server
-      (with-input-from-file
-	  logf
-	(lambda ()
-	  (let loop ((inl  (read-line))
-		     (lnum 0))
-	    (if (not (eof-object? inl))
-		(let ((mlst (string-match server-rx inl))
-                      (dbprep (string-match dbprep-rx inl)))
-                  (if dbprep (set! dbprep-found 1))
-		  (if (not mlst)
-		      (if (< lnum 500) ;; give up if more than 500 lines of server log read
-			  (loop (read-line)(+ lnum 1))
-			  (begin 
+     exn
+     (begin
+       ;; WARNING: this is potentially dangerous to blanket ignore the errors
+       (if (file-exists? logf)
+	   (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn))
+       bad-dat) ;; no idea what went wrong, call it a bad server
+     (with-input-from-file
+	 logf
+       (lambda ()
+	 (let loop ((inl  (read-line))
+		    (lnum 0))
+	   (if (not (eof-object? inl))
+	       (let ((mlst (string-match server-rx inl))
+		     (dbprep (string-match dbprep-rx inl)))
+		 (if dbprep (set! dbprep-found 1))
+		 (if (not mlst)
+		     (if (< lnum 500) ;; give up if more than 500 lines of server log read
+			 (loop (read-line)(+ lnum 1))
+			 (begin 
                            (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf )
                            bad-dat))
-		      (match mlst
-			((_ host port start server-id pid)
-			 (list host
-			       (string->number port)
-			       (string->number start)
-			       server-id
-			       (string->number pid)))
-			(else
-			 (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst)
-			 bad-dat))))
-                (begin 
-                  (if dbprep-found
-                      (begin
-                         (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time))
-                         (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting?
-                      (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds))))
-		  bad-dat))))))))
+		     (match mlst
+			    ((_ host port start server-id pid)
+			     (list host
+				   (string->number port)
+				   (string->number start)
+				   server-id
+				   (string->number pid)))
+			    (else
+			     (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst)
+			     bad-dat))))
+	       (begin 
+		 (if dbprep-found
+		     (begin
+		       (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time))
+		       (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting?
+		     (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds))))
+		 bad-dat))))))))
 
 ;; ;; get a list of servers from the log files, with all relevant data
 ;; ;; ( mod-time host port start-time pid )
 ;; ;;
 ;; (define (server:get-list areapath #!key (limit #f))
@@ -419,11 +403,12 @@
 
 ;; oldest server alive determines host then choose random of youngest
 ;; five servers on that host
 ;;
 (define (server:get-servers-info areapath)
-  (let* ((servinfodir (conc *toppath*"/.servinfo")))
+  ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.")
+  (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo")))
     (if (not (file-exists? servinfodir))
 	(create-directory servinfodir))
     (let* ((allfiles    (glob (conc servinfodir"/*")))
 	   (res         (make-hash-table)))
       (for-each
@@ -432,15 +417,45 @@
 		(serverdat (server:logf-get-start-info f)))
 	   (match serverdat
 	     ((host port start server-id pid)
 	      (if (and host port start server-id pid)
 		  (hash-table-set! res hostport serverdat)
-		  (debug:print-info 0 *default-log-port* "bad server info for "f": "serverdat)))
+		  (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat)))
 	     (else
-	      (debug:print-info 0 *default-log-port* "bad server info for "f": "serverdat)))))
+	      (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat)))))
        allfiles)
       res)))
+
+;; check the .servinfo directory, are there other servers running on this
+;; or another host?
+;;
+;; returns #t => ok to start another server
+;;         #f => not ok to start another server
+;;
+(define (server:minimal-check areapath)
+  (server:clean-up-old areapath)
+  (let* ((srvdir      (server:get-servinfo-dir areapath)) ;; (conc areapath"/.servinfo"))
+	 (servrs      (glob (conc srvdir"/*")))
+	 (thishostip  (server:get-best-guess-address (get-host-name)))
+	 (thisservrs  (glob (conc srvdir"/"thishostip":*")))
+	 (homehostinf (server:choose-server areapath 'homehost))
+	 (havehome    (car homehostinf))
+	 (wearehome   (cdr homehostinf)))
+    (debug:print-info 0 *default-log-port* thishostip", have homehost: "havehome", we are homehost: "wearehome
+		      ", numservers: "(length thisservrs))
+    (cond
+     ((not havehome) #t) ;; no homehost yet, go for it
+     ((and havehome wearehome (< (length thisservrs) 20)) #t) ;; we are home and less than 20 servers, ok to start another
+     ((and havehome (not wearehome)) #f)     ;; we are not the home host
+     ((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running
+     (else
+      (debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs)
+      #t))))
+	 
+
+(define server-last-start 0)
+
 
 ;; oldest server alive determines host then choose random of youngest
 ;; five servers on that host
 ;;
 ;; mode:
@@ -453,29 +468,47 @@
   ;;   1. sort by age ascending and ping until good
   ;; find alive rand from youngest
   ;;   1. sort by age descending
   ;;   2. take five
   ;;   3. check alive, discard if not and repeat
+  ;; first we clean up old server files
+  (assert (eq? (rmt:transport-mode) 'http) "FATAL: server:run called with rmt:transport-mode="(rmt:transport-mode))
+  (server:clean-up-old areapath)
+  (let* ((since-last (- (current-seconds) server-last-start))
+        (server-start-delay 10))     
+    (if ( < (- (current-seconds) server-last-start) 10 )
+      (begin
+        (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start))
+        (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds")
+        (thread-sleep! server-start-delay)
+      )
+      (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start))
+    )
+  )
   (let* ((serversdat  (server:get-servers-info areapath))
 	 (servkeys    (hash-table-keys serversdat))
-	 (by-time-asc (if (not (null? servkeys))
+	 (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last
 			  (sort servkeys ;; list of "host:port"
 				(lambda (a b)
 				  (>= (list-ref (hash-table-ref serversdat a) 2)
 				      (list-ref (hash-table-ref serversdat b) 2))))
 			  '())))
+    (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat)
+    (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys)
     (if (not (null? by-time-asc))
 	(let* ((oldest     (last by-time-asc))
 	       (oldest-dat (hash-table-ref serversdat oldest))
 	       (host       (list-ref oldest-dat 0))
 	       (all-valid  (filter (lambda (x)
 				     (equal? host (list-ref (hash-table-ref serversdat x) 0)))
 				   by-time-asc))
-	       (best-five  (lambda ()
-			     (if (> (length all-valid) 5)
-				 (take all-valid 5)
-				 all-valid)))
+	       (best-ten  (lambda ()
+			     (if (> (length all-valid) 11)
+				 (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out
+				 (if (> (length all-valid) 8)
+				     (drop-right all-valid 1)
+				     all-valid))))
 	       (names->dats (lambda (names)
 			      (map (lambda (x)
 				     (hash-table-ref serversdat x))
 				   names)))
 	       (am-home?    (lambda ()
@@ -483,44 +516,81 @@
 				     (bestadrs (server:get-best-guess-address currhost)))
 				(or (equal? host currhost)
 				    (equal? host bestadrs))))))
 	  (case mode
 	    ((info)
-	     (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid)
-	     (print "youngest: "(hash-table-ref serversdat (car all-valid))))
+	     (debug:print 0 *default-log-port* "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid)
+	     (debug:print 0 *default-log-port* "youngest: "(hash-table-ref serversdat (car all-valid))))
 	    ((home)     host)
 	    ((homehost) (cons host (am-home?))) ;; shut up old code
 	    ((home?)    (am-home?))
-	    ((best-five)(names->dats (best-five)))
+	    ((best-ten)(names->dats (best-ten)))
 	    ((all-valid)(names->dats all-valid))
-	    ((best)     (let* ((best-five (best-five))
-			       (len       (length best-five)))
-			  (hash-table-ref serversdat (list-ref best-five (random len)))))
+	    ((best)     (let* ((best-ten (best-ten))
+			       (len       (length best-ten)))
+			  (hash-table-ref serversdat (list-ref best-ten (random len)))))
 	    ((count)(length all-valid))
 	    (else
 	     (debug:print 0 *default-log-port* "ERROR: invalid command "mode)
 	     #f)))
 	(begin
 	  (server:run areapath)
-	  (thread-sleep! 3)
+          (set! server-last-start (current-seconds))
+	  ;; (thread-sleep! 3)
 	  (case mode
 	    ((homehost) (cons #f #f))
 	    (else	#f))))))
+
+(define (server:get-servinfo-dir areapath)
+  (let* ((spath (conc areapath"/.servinfo")))
+    (if (not (file-exists? spath))
+	(create-directory spath #t))
+    spath))
+
+(define (server:clean-up-old areapath)
+  ;; any server file that has not been touched in ten minutes is effectively dead
+  (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*"))))
+    (for-each
+     (lambda (sfile)
+       (let* ((modtime (handle-exceptions
+			   exn
+			 (begin
+			   (debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile)
+			   (current-seconds))
+			 (file-modification-time sfile))))
+	 (if (and (number? modtime)
+		  (> (- (current-seconds) modtime)
+		     600))
+	     (begin
+	       (debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.")
+	       (handle-exceptions
+		   exn
+		 (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile)
+		 (delete-file sfile))))))
+     sfiles)))
 
 ;; would like to eventually get rid of this
 ;;
 (define (common:on-homehost?)
-  (server:choose-server *toppath* 'home?))
+  (if (eq? (rmt:transport-mode) 'http)
+      (server:choose-server *toppath* 'home?)
+      #t)) ;; there is no homehost for tcp and nfs is always on home so #t should work
 
 ;; kind start up of server, wait before allowing another server for a given
 ;; area to be launched
 ;;
 (define (server:kind-run areapath)
   ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
   ;; and wait for it to be at least <server idletime> seconds old
   ;; (server:wait-for-server-start-last-flag areapath)
-  (if (< (server:choose-server areapath 'count) 10)
+  (let loop ()
+    (if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2)
+	(begin
+	  (if (common:low-noise-print 30 "our-host-load")
+	      (debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server."))
+	  (loop))))
+  (if (< (server:choose-server areapath 'count) 20)
       (server:run areapath))
   #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
       (let* ((lock-file    (conc areapath "/logs/server-start.lock")))
 	(let* ((start-flag (conc areapath "/logs/server-start-last")))
 	  (common:simple-file-lock-and-wait lock-file expire-time: 25)
@@ -538,11 +608,12 @@
     (let loop ((server-info (server:check-if-running areapath))
 	       (try-num    0))
       (if (or server-info
 	      (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available.
 	  (server:record->url server-info)
-	  (let ((num-ok (length (server:choose-server areapath 'all-valid))))
+	  (let* ( (servers (server:choose-server areapath 'all-valid))
+                (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0)))
 	    (if (and (> try-num 0)  ;; first time through simply wait a little while then try again
 		     (< num-ok 1))  ;; if there are no decent candidates for servers then try starting a new one
 		(server:run areapath))
 	    (thread-sleep! 5)
 	    (loop (server:check-if-running areapath)
@@ -555,11 +626,11 @@
 
 ;; no longer care if multiple servers are started by accident. older servers will drop off in time.
 ;;
 (define (server:check-if-running areapath) ;;  #!key (numservers "2"))
   (let* ((ns            (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed
-	 (servers       (server:choose-server areapath 'best-five))) ;; (server:get-best (server:get-list areapath))))
+	 (servers       (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath))))
     (if (or (and servers
 		 (null? servers))
 	    (not servers))
 	    ;; (and (list? servers)
 	    ;;	 (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers
@@ -587,92 +658,91 @@
   (handle-exceptions
     exn
     (begin 
       (debug:print-info 0 *default-log-port*  "Unable to get host and/or port from " servr ", exn=" exn)     
     #f)
-  (match-let (((mod-time hostname port start-time server-id pid)
+  (match-let (((hostname port start-time server-id pid)
 	       servr))
     (tasks:kill-server hostname pid))))
 
-;; called in megatest.scm, host-port is string hostname:port
-;;
-;; NOTE: This is NOT called directly from clients as not all transports support a client running
-;;       in the same process as the server.
-;;
-(define (server:ping host-port-in server-id #!key (do-exit #f))
-  (let ((host:port (if (not host-port-in) ;; use read-dotserver to find
-		       #f ;; (server:check-if-running *toppath*)
-		;; (if (number? host-port-in) ;; we were handed a server-id
-		;; 	   (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in)))
-		;; 	     ;; (print "srec: " srec " host-port-in: " host-port-in)
-		;; 	     (if srec
-		;; 		 (conc (vector-ref srec 3) ":" (vector-ref srec 4))
-		;; 		 (conc "no such server-id " host-port-in)))
-		       host-port-in))) ;; )
-    (let* ((host-port (if host:port
-			  (let ((slst (string-split   host:port ":")))
-			    (if (eq? (length slst) 2)
-				(list (car slst)(string->number (cadr slst)))
-				#f))
-			  #f)))
-;;	   (toppath       (launch:setup)))
-      ;; (print "host-port=" host-port)
-      (if (not host-port)
-	  (begin
-	    (if host-port-in
-		(debug:print 0 *default-log-port*  "ERROR: bad host:port"))
-	    (if do-exit (exit 1))
-	    #f)
-	  (let* ((iface      (car host-port))
-		 (port       (cadr host-port))
-		 (server-dat (http-transport:client-connect iface port server-id))
-		 (login-res  (rmt:login-no-auto-client-setup server-dat)))
-	    (if (and (list? login-res)
-		     (car login-res))
-		(begin
-		  ;; (print "LOGIN_OK")
-		  (if do-exit (exit 0))
-		  #t)
-		(begin
-		  ;; (print "LOGIN_FAILED")
-		  (if do-exit (exit 1))
-		  #f)))))))
-
-;; run ping in separate process, safest way in some cases
-;;
-(define (server:ping-server ifaceport)
-  (with-input-from-pipe 
-   (conc (common:get-megatest-exe) " -ping " ifaceport)
-   (lambda ()
-     (let loop ((inl (read-line))
-		(res "NOREPLY"))
-       (if (eof-object? inl)
-	   (case (string->symbol res)
-	     ((NOREPLY)  #f)
-	     ((LOGIN_OK) #t)
-	     (else       #f))
-	   (loop (read-line) inl))))))
-
-;; NOT USED (well, ok, reference in rpc-transport but otherwise not used).
-;;
-(define (server:login toppath)
-  (lambda (toppath)
-    (set! *db-last-access* (current-seconds)) ;; might not be needed.
-    (if (equal? *toppath* toppath)
-	#t
-	#f)))
+;; ;; called in megatest.scm, host-port is string hostname:port
+;; ;;
+;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running
+;; ;;       in the same process as the server.
+;; ;;
+;; (define (server:ping host:port server-id #!key (do-exit #f))
+;;   (let* ((host-port (cond
+;; 		     ((string? host:port)
+;; 		      (let ((slst (string-split   host:port ":")))
+;; 			(if (eq? (length slst) 2)
+;; 			    (list (car slst)(string->number (cadr slst)))
+;; 			    #f)))
+;; 		     (else
+;; 		      #f))))
+;;     (cond
+;;      ((and (list? host-port)
+;; 	   (eq? (length host-port) 2))
+;;       (let* ((myrunremote (make-and-init-remote *toppath*))
+;; 	     (iface       (car host-port))
+;; 	     (port        (cadr host-port))
+;; 	     (server-dat  (client:connect iface port server-id myrunremote))
+;; 	     (login-res   (rmt:login-no-auto-client-setup myrunremote)))
+;; 	(http-transport:close-connections myrunremote)
+;; 	(if (and (list? login-res)
+;; 		 (car login-res))
+;; 	    (begin
+;; 	      ;; (print "LOGIN_OK")
+;; 	      (if do-exit (exit 0))
+;; 	      #t)
+;; 	    (begin
+;; 	      ;; (print "LOGIN_FAILED")
+;; 	      (if do-exit (exit 1))
+;; 	      #f))))
+;;      (else 
+;;       (if host:port
+;; 	  (debug:print 0 *default-log-port*  "ERROR: bad host:port "host:port))
+;;       (if do-exit
+;; 	  (exit 1)
+;; 	  #f)))))
+;; 
+;; ;; run ping in separate process, safest way in some cases
+;; ;;
+;; (define (server:ping-server ifaceport)
+;;   (with-input-from-pipe 
+;;    (conc (common:get-megatest-exe) " -ping " ifaceport)
+;;    (lambda ()
+;;      (let loop ((inl (read-line))
+;; 		(res "NOREPLY"))
+;;        (if (eof-object? inl)
+;; 	   (case (string->symbol res)
+;; 	     ((NOREPLY)  #f)
+;; 	     ((LOGIN_OK) #t)
+;; 	     (else       #f))
+;; 	   (loop (read-line) inl))))))
+;; 
+;; ;; NOT USED (well, ok, reference in rpc-transport but otherwise not used).
+;; ;;
+;; (define (server:login toppath)
+;;   (lambda (toppath)
+;;     (set! *db-last-access* (current-seconds)) ;; might not be needed.
+;;     (if (equal? *toppath* toppath)
+;; 	#t
+;; 	#f)))
 
 ;; timeout is hms string: 1h 5m 3s, default is 1 minute
 ;; This is currently broken. Just use the number of hours with no unit.
 ;; Default is 60 seconds.
 ;;
 (define (server:expiration-timeout)
-  (let ((tmo (configf:lookup *configdat* "server" "timeout")))
-    (if (and (string? tmo)
-	     (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below
-        (* 3600 (string->number tmo))
-	60)))
+  (let* ((tmo (configf:lookup *configdat* "server" "timeout")))
+    (if (string? tmo)
+	(let* ((num (string->number tmo)))
+	  (if num
+	      (* 3600 num)
+	      (common:hms-string->seconds tmo)))
+	600 ;; this is the default
+	)))
 
 (define (server:get-best-guess-address hostname)
   (let ((res #f))
     (for-each 
      (lambda (adr)
@@ -683,125 +753,13 @@
     (string-intersperse 
      (map number->string
 	  (u8vector->list
 	   (if res res (hostname->ip hostname)))) ".")))
 
-;; (define server:sync-lock-token "SERVER_SYNC_LOCK")
-;; (define (server:release-sync-lock)
-;;   (db:no-sync-del! *no-sync-db* server:sync-lock-token))
-;; (define (server:have-sync-lock?)
-;;   (let* ((have-lock-pair (db:no-sync-get-lock *no-sync-db* server:sync-lock-token))
-;;          (have-lock?     (car have-lock-pair))
-;;          (lock-time      (cdr have-lock-pair))
-;;          (lock-age       (- (current-seconds) lock-time)))
-;;     (cond
-;;      (have-lock? #t)
-;;      ((>lock-age
-;;        (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180)))
-;;       (server:release-sync-lock)
-;;       (server:have-sync-lock?))
-;;      (else #f))))
-
 ;; moving this here as it needs access to db and cannot be in common.
 ;;
 
 (define (server:get-bruteforce-syncer dbstruct #!key (fork-to-background #f) (persist-until-sync #f))
   (debug:print "WARNING: bruteforce-syncer is called but has been disabled!")
   (lambda ()
-    (debug:print "WARNING: bruteforce-syncer is called but has been disabled!"))
-  #;(let* ((sqlite-exe   (or (get-environment-variable "MT_SQLITE3_EXE"))) ;; defined in cfg.sh
-         (sync-log     (or (args:get-arg "-sync-log") (conc *toppath* "/logs/sync-" (current-process-id) "-" (get-host-name) ".log")))
-	 (tmp-area     (common:get-db-tmp-area))
-	 (tmp-db       (conc tmp-area "/megatest.db"))
-	 (staging-file (conc *toppath* "/.megatest.db"))
-	 (mtdbfile     (conc *toppath* "/megatest.db"))
-	 (lockfile     (common:get-sync-lock-filepath))
-         (sync-cmd-core     (conc sqlite-exe" " tmp-db " .dump | "sqlite-exe" " staging-file "&>"sync-log))
-         (sync-cmd     (if fork-to-background 
-                           (conc "/usr/bin/env NBFAKE_LOG="*toppath*"/logs/last-server-sync-"(current-process-id)".log nbfake \""sync-cmd-core" && /bin/mv -f " staging-file " " mtdbfile" \"")
-                           sync-cmd-core))
-         (default-min-intersync-delay 2)
-	 (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: default-min-intersync-delay))
-         (default-duty-cycle 0.1)
-         (duty-cycle   (configf:lookup-number *configdat* "server" "sync-duty-cycle" default: default-duty-cycle))
-         (last-sync-seconds 10) ;; we will adjust this to a measurement and delay last-sync-seconds * (1 - duty-cycle)
-         (calculate-off-time (lambda (work-duration duty-cycle)
-                                  (* (/ (- 1 duty-cycle) duty-cycle) last-sync-seconds)))
-         (off-time min-intersync-delay) ;; adjusted in closure below.
-         (do-a-sync
-          (lambda ()
-            (BB> "Start do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync)
-            (let* ((finalres
-                    (let retry-loop ((num-tries 0))
-                         (if (common:simple-file-lock lockfile)
-	                     (begin
-                               (cond
-                                ((not (or fork-to-background persist-until-sync))
-                                 (debug:print 0 *default-log-port* "INFO: syncer thread sleeping for max of (server.minimum-intersync-delay="min-intersync-delay
-                                              " , off-time="off-time" seconds ]")
-                                 (thread-sleep! (max off-time min-intersync-delay)))
-                                (else
-                                 (debug:print 0 *default-log-port* "INFO: syncer thread NOT sleeping ; maybe time-to-exit...")))
-
-                               (if (not (configf:lookup *configdat* "server" "disable-db-snapshot"))
-                                   (common:snapshot-file mtdbfile subdir: ".db-snapshot"))
-		               (delete-file* staging-file)
-		               (let* ((start-time (current-milliseconds))
-                                      (res (system sync-cmd))
-                                      (dbbackupfile (conc mtdbfile ".backup"))
-                                      (res2 
-                                       (cond
-                                        ((eq? 0 res )
-                                         (handle-exceptions
-                                            exn
-                                            #f
-                                         (if (file-exists? dbbackupfile)
-		                           (delete-file* dbbackupfile)
-                                         )
-                                         (if (eq? 0 (file-size sync-log))
-                                             (delete-file* sync-log))
-		                         (system (conc "/bin/mv " staging-file " " mtdbfile))
-                                         
-                                         (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000))
-                                         (set! off-time (calculate-off-time
-                                                         last-sync-seconds
-                                                         (cond
-                                                          ((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1))
-                                                           duty-cycle)
-                                                          (else
-                                                           (debug:print 0 *default-log-port* "WARNING: ["(common:human-time)"] server.sync-duty-cycle is invalid.  Should be a number between 0 and 1, but "duty-cycle" was specified.  Using default value: "default-duty-cycle)
-                                                           default-duty-cycle))))
-                                         
-                                         (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec")
-                                         (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec ; with duty-cycle of "duty-cycle" off time is now "off-time)
-                                         'sync-completed))
-                                        (else
-                                         (system (conc "/bin/cp "sync-log" "sync-log".fail"))
-                                         (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail")
-                                         (if (file-exists? (conc mtdbfile ".backup"))
-                                             (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile)))
-                                         #f))))
-                                 (common:simple-file-release-lock lockfile)
-                                 (BB> "released lockfile: " lockfile)
-                                 (when (common:file-exists? lockfile)
-                                   (BB> "DID NOT ACTUALLY RELEASE LOCKFILE"))
-                                 res2) ;; end let
-                               );; end begin
-                             ;; else
-                             (cond
-                              (persist-until-sync
-                               (thread-sleep! 1)
-                               (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; we're in a fork-to-background so we need to succeed.  Let's wait a jiffy and and try again. num-tries="num-tries" (waiting for lockfile="lockfile" to disappear)")
-                               (retry-loop (add1 num-tries)))
-                              (else
-                               (thread-sleep! (max off-time (+ last-sync-seconds min-intersync-delay)))
-                               (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; not syncing.")
-                               'parallel-sync-in-progress))
-                             ) ;; end if got lockfile
-                         )
-                    ))
-              (BB> "End do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync" and result="finalres)
-              finalres)
-            ) ;; end lambda
-          ))
-    do-a-sync))
+    (debug:print "WARNING: bruteforce-syncer is called but has been disabled!")))
 

Index: subrun.scm
==================================================================
--- subrun.scm
+++ subrun.scm
@@ -15,25 +15,24 @@
 ;; 
 ;;     You should have received a copy of the GNU General Public License
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 ;;  strftime('%m/%d/%Y %H:%M:%S','now','localtime')
+
+(declare (unit subrun))
+(declare (uses debugprint))
+(declare (uses db))
+(declare (uses common))
+(declare (uses commonmod))
+(declare (uses mt))
 
 (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18) 
      posix-extras directory-utils pathname-expand typed-records format
      call-with-environment-variables)
-(declare (unit subrun))
-;;(declare (uses runs))
-(declare (uses db))
-(declare (uses common))
-;;(declare (uses items))
-;;(declare (uses runconfig))
-;;(declare (uses tests))
-;;(declare (uses server))
-(declare (uses mt))
-;;(declare (uses archive))
-;; (declare (uses filedb))
+
+(import commonmod
+	debugprint)
 
 ;(include "common_records.scm")
 ;;(include "key_records.scm")
 (include "db_records.scm") ;; provides db:test-get-id
 ;;(include "run_records.scm")
@@ -135,11 +134,11 @@
       (subrun:unset-subrun-removed test-run-dir))      
 
   (let* ((log-prefix "run")
          (switches (subrun:selector+log-switches test-run-dir log-prefix))
          (run-wait (equal? run-mode "yes"))
-         (cmd      (conc "megatest " sub-cmd " " switches" "
+         (cmd      (conc (common:get-mtexe)" "sub-cmd" "switches" "
                          (if run-wait "-run-wait " ""))))
     cmd))
 
 
 (define (subrun:sanitize-path inpath)
@@ -232,20 +231,24 @@
                (list (car x) (cdr x)))
              switch-alist))
            " ")))
     res))
 
+;; NOTE: Here we run sub megatest but this is not intended for one version
+;;       of megatest to test another version. Thus we propagate the 
 (define (subrun:exec-sub-megatest test-run-dir action-switches-str log-prefix)
-  (let* ((selector-switches  (subrun:selector+log-switches test-run-dir log-prefix))
-         (cmd (conc "megatest " selector-switches " " action-switches-str ))
+  (let* ((mtpathdir          (common:get-megatest-exe-dir))
+	 (mtexe              (common:get-mtexe))
+	 (selector-switches  (subrun:selector+log-switches test-run-dir log-prefix))
+         (cmd (conc mtexe" "selector-switches" "action-switches-str ))
          (pid #f)
          (proc (lambda ()
                  (debug:print-info 0 *default-log-port* "Running sub megatest command: "cmd)
                  ;;(set! pid (process-run "/usr/bin/xterm" (list ))))))
                  (set! pid (process-run "/bin/bash" (list "-c" cmd))))))
     (call-with-environment-variables 
-     (list (cons "PATH" (conc (get-environment-variable "PATH") ":.")))
+     (list (cons "PATH"  (common:get-megatest-exe-path)))
      (lambda  ()
        (common:without-vars proc "^MT_.*")))
     (let processloop ((i 0))
       (let-values (((pid-val exit-status exit-code)(process-wait pid #t)))
         (if (eq? pid-val 0)

DELETED synchash.scm
Index: synchash.scm
==================================================================
--- synchash.scm
+++ /dev/null
@@ -1,133 +0,0 @@
-;;======================================================================
-;; Copyright 2006-2012, Matthew Welland.
-;; 
-;; This file is part of Megatest.
-;; 
-;;     Megatest is free software: you can redistribute it and/or modify
-;;     it under the terms of the GNU General Public License as published by
-;;     the Free Software Foundation, either version 3 of the License, or
-;;     (at your option) any later version.
-;; 
-;;     Megatest is distributed in the hope that it will be useful,
-;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
-;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;;     GNU General Public License for more details.
-;; 
-;;     You should have received a copy of the GNU General Public License
-;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
-;;
-;;======================================================================
-
-;;======================================================================
-;; A hash of hashes that can be kept in sync by sending minial deltas
-;;======================================================================
-
-(use format)
-(use srfi-1 srfi-69 sqlite3)
-(import (prefix sqlite3 sqlite3:))
-
-(declare (unit synchash))
-(declare (uses db))
-(declare (uses server))
-(include "db_records.scm")
-
-(define (synchash:make)
-   (make-hash-table))
-
-;; given an alist of objects '((id obj) ...) 
-;;   1. remove unchanged objects from the list
-;;   2. create a list of removed objects by id
-;;   3. remove removed objects from synchash
-;;   4. replace or add new or changed objects to synchash
-;;
-(define (synchash:get-delta indat synchash)
-  (let ((deleted '())
-	(changed '())
-	(found   '())
-	(orig-keys (hash-table-keys synchash)))
-    (for-each
-     (lambda (item)
-       (let* ((id  (car  item))
-	      (dat (cadr item))
-	      (ref (hash-table-ref/default synchash id #f)))
-	 (if (not (equal? dat ref)) ;; item changed or new
-	     (begin
-	       (set! changed (cons item changed))
-	       (hash-table-set! synchash id dat)))
-	 (set! found (cons id found))))
-     indat)
-    (for-each 
-     (lambda (id)
-       (if (not (member id found))
-	   (begin
-	     (set! deleted (cons id deleted))
-	     (hash-table-delete! synchash id))))
-     orig-keys)
-    (list changed deleted)
-    ;; (list indat '()) ;; just for debugging
-    ))
-    
-;; keynum => the field to use as the unique key (usually 0 but can be other field)
-;;
-(define (synchash:client-get proc synckey keynum synchash run-id . params)
-  (let* ((data   (rmt:synchash-get run-id proc synckey keynum params))
-	 (newdat (car data))
-	 (removs (cadr data))
-	 (myhash (hash-table-ref/default synchash synckey #f)))
-    (if (not myhash)
-	(begin
-	  (set! myhash (make-hash-table))
-	  (hash-table-set! synchash synckey myhash)))
-    (for-each 
-     (lambda (item)
-       (let ((id  (car item))
-	     (dat (cadr item)))
-	 ;; (debug:print-info 2 *default-log-port* "Processing item: " item)
-	 (hash-table-set! myhash id dat)))
-     newdat)
-    (for-each
-     (lambda (id)
-       (hash-table-delete! myhash id))
-     removs)
-    ;; WHICH ONE!?
-    ;; data)) ;; return the changed and deleted list
-    (list newdat removs))) ;; synchash))
-
-(define *synchashes* (make-hash-table))
-
-(define (synchash:server-get dbstruct run-id proc synckey keynum params)
-  ;; (debug:print-info 2 *default-log-port* "synckey: " synckey ", keynum: " keynum ", params: " params)
-  (let* ((dbdat     (db:get-db dbstruct run-id))
-	 (db        (db:dbdat-get-db dbdat))
-	 (synchash  (hash-table-ref/default *synchashes* synckey #f))
-	 (newdat    (apply (case proc
-			     ((db:get-runs)                   db:get-runs)
-			     ((db:get-tests-for-run-mindata)  db:get-tests-for-run-mindata)
-			     ((db:get-test-info-by-ids)       db:get-test-info-by-ids)
-			     (else
-			      (print "ERROR: sync for hash " proc " not setup! Edits needed in synchash.scm")
-			      print))
-			   db params))
-	 (postdat  #f)
-	 (make-indexed (lambda (x)
-			 (list (vector-ref x keynum) x))))
-    ;; Now process newdat based on the query type
-    (set! postdat (case proc
-		    ((db:get-runs)
-		     ;; (debug:print-info 2 *default-log-port* "Get runs call")
-		     (let ((header (vector-ref newdat 0))
-			   (data   (vector-ref newdat 1)))
-		       ;; (debug:print-info 2 *default-log-port* "header: " header ", data: " data)
-		       (cons (list "header" header)         ;; add the header keyed by the word "header"
-			     (map make-indexed data))))        ;; add each element keyed by the keynum'th val
-		    (else 
-		     ;; (debug:print-info 2 *default-log-port* "Non-get runs call")
-		     (map make-indexed newdat))))
-    ;; (debug:print-info 2 *default-log-port* "postdat: " postdat)
-    ;; (if (not indb)(sqlite3:finalize! db))
-    (if (not synchash)
-	(begin
-	  (set! synchash (make-hash-table))
-	  (hash-table-set! *synchashes* synckey synchash)))
-    (synchash:get-delta postdat synchash)))
-

Index: tasks.scm
==================================================================
--- tasks.scm
+++ tasks.scm
@@ -16,19 +16,30 @@
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 ;;
 
 ;;  strftime('%m/%d/%Y %H:%M:%S','now','localtime')
 
+(declare (unit tasks))
+(declare (uses debugprint))
+(declare (uses dbfile))
+(declare (uses db))
+(declare (uses dbmod))
+(declare (uses rmt))
+(declare (uses rmtmod))
+(declare (uses common))
+(declare (uses pgdb))
+(declare (uses commonmod))
+(declare (uses mtargs))
+
 (use sqlite3 srfi-1 posix regex regex-case srfi-69 dot-locking format)
 (import (prefix sqlite3 sqlite3:))
 
-(declare (unit tasks))
-(declare (uses dbfile))
-(declare (uses db))
-(declare (uses rmt))
-(declare (uses common))
-(declare (uses pgdb))
+(import commonmod
+	debugprint
+	dbmod
+	rmtmod
+	(prefix mtargs args:))
 
 (import dbfile)
 ;; (import pgdb) ;; pgdb is a module
 
 (include "task_records.scm")
@@ -36,42 +47,10 @@
 
 ;;======================================================================
 ;; Tasks db
 ;;======================================================================
 
-;; wait up to aprox n seconds for a journal to go away
-;;
-(define (tasks:wait-on-journal path n #!key (remove #f)(waiting-msg #f))
-  (if (not (string? path))
-      (debug:print-error 0 *default-log-port* "Called tasks:wait-on-journal with path=" path " (not a string)")
-      (let ((fullpath (conc path "-journal")))
-	(handle-exceptions
-	 exn
-	 (begin
-	   (print-call-chain (current-error-port))
-	   (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
-	   (debug:print 5 *default-log-port* " exn=" (condition->list exn))
-	   (debug:print 0 *default-log-port* "tasks:wait-on-journal failed. Continuing on, you can ignore this call-chain")
-	   #t) ;; if stuff goes wrong just allow it to move on
-	 (let loop ((journal-exists (common:file-exists? fullpath))
-		    (count          n)) ;; wait ten times ...
-	   (if journal-exists
-	       (begin
-		 (if (and waiting-msg
-			  (eq? (modulo n 30) 0))
-		     (debug:print 0 *default-log-port* waiting-msg))
-		 (if (> count 0)
-		     (begin
-		       (thread-sleep! 1)
-		       (loop (common:file-exists? fullpath)
-			     (- count 1)))
-		     (begin
-		       (debug:print 0 *default-log-port* "ERROR: removing the journal file " fullpath ", this is not good. Look for disk full, write access and other issues.")
-		       (if remove (system (conc "rm -rf " fullpath)))
-		       #f)))
-	       #t))))))
-
 (define (tasks:get-task-db-path)
   (let ((dbdir  (or (configf:lookup *configdat* "setup" "monitordir")
 		    (configf:lookup *configdat* "setup" "dbdir")
 		    (conc (common:get-linktree) "/.db"))))
     (handle-exceptions
@@ -1077,13 +1056,10 @@
                (changed      (if (and target run-name)
                             (rmt:get-run-record-ids target run-name (rmt:get-keys) test-patt)
                             (rmt:get-changed-record-ids last-sync-time)))
 	       (run-ids        (alist-ref 'runs       changed))
 	       (test-ids       (alist-ref 'tests      changed))
-	       (test-step-ids  (alist-ref 'test_steps changed))
-	       (test-data-ids  (alist-ref 'test_data  changed))
-	       (run-stat-ids   (alist-ref 'run_stats  changed))
                (area-tag    (if (args:get-arg "-area-tag") 
                                  (args:get-arg "-area-tag")
                                  (if (args:get-arg "-area") 
                                    (args:get-arg "-area") 
                                    ""))))
@@ -1100,13 +1076,10 @@
           (if (not (null? test-ids))
             (begin
               (debug:print-info 0 *default-log-port*  "syncing tests: " test-ids)
 	      (tasks:sync-tests-data dbh cached-info test-ids area-info smallest-last-update-time)
               (debug:print-info 0 *default-log-port*  "syncing test steps")
-              (tasks:sync-test-steps dbh cached-info test-step-ids smallest-last-update-time)
-	      (debug:print-info 0 *default-log-port*  "syncing test data")
-              (tasks:sync-test-gen-data dbh cached-info test-data-ids smallest-last-update-time)
             )
           )
      (let*  ((smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" (current-seconds))))
      (debug:print-info 0 "smallest-time :" smallest-time  " last-sync-time " last-sync-time)
     (if (not (and target run-name)) 

Index: tcmt.scm
==================================================================
--- tcmt.scm
+++ tcmt.scm
@@ -21,19 +21,25 @@
 ;;
 ;;  1. Run the megatest process and pass it all the needed parameters
 ;;  2. Every five seconds check for state/status changes and print the info
 ;;
 
+(declare (uses mtargs))
+(declare (uses rmt))
+(declare (uses rmtmod))
+(declare (uses common))
+;; (declare (uses megatest-version))
+(declare (uses commonmod))
+
 (use srfi-1 posix srfi-69 srfi-18 regex defstruct)
 
 (use trace)
 ;; (trace-call-sites #t)
 
-(declare (uses margs))
-(declare (uses rmt))
-(declare (uses common))
-;; (declare (uses megatest-version))
+(import commonmod
+	rmtmod
+	(prefix mtargs args:))
 
 (include "megatest-version.scm")
 (include "megatest-fossil-hash.scm")
 (include "db_records.scm")
 

ADDED   tcp-transportmod.scm
Index: tcp-transportmod.scm
==================================================================
--- /dev/null
+++ tcp-transportmod.scm
@@ -0,0 +1,930 @@
+;;======================================================================
+;; Copyright 2017, Matthew Welland.
+;; 
+;; This file is part of Megatest.
+;; 
+;;     Megatest is free software: you can redistribute it and/or modify
+;;     it under the terms of the GNU General Public License as published by
+;;     the Free Software Foundation, either version 3 of the License, or
+;;     (at your option) any later version.
+;; 
+;;     Megatest is distributed in the hope that it will be useful,
+;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;     GNU General Public License for more details.
+;; 
+;;     You should have received a copy of the GNU General Public License
+;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+
+;;======================================================================
+
+(declare (unit tcp-transportmod))
+(declare (uses debugprint))
+(declare (uses commonmod))
+(declare (uses dbfile))
+(declare (uses dbmod))
+(declare (uses portlogger))
+
+(use address-info tcp)
+
+(module tcp-transportmod
+	*
+	
+  (import scheme
+	  (prefix sqlite3 sqlite3:)
+	  chicken
+	  data-structures
+
+	  address-info
+	  directory-utils
+	  extras
+	  files
+	  hostinfo
+	  matchable
+	  md5
+	  message-digest
+	  ports
+	  posix
+	  regex
+	  regex-case
+	  s11n
+	  srfi-1
+	  srfi-18
+	  srfi-4
+	  srfi-69
+	  stack
+	  typed-records
+	  tcp-server
+	  tcp
+	  
+	  debugprint
+	  commonmod
+	  dbfile
+	  dbmod
+	  portlogger
+	)
+
+;;======================================================================
+;; client
+;;======================================================================
+
+;; (define keep-age-param (make-parameter 10)) ;; qif file age, if over move to attic
+
+;; Used ONLY for client
+;;
+(defstruct tt-conn
+  host
+  port
+  host-port
+  dbfname
+  server-id
+  server-start
+  servinf-file
+  pid
+)
+
+;; Used for BOTH clients and servers
+(defstruct tt
+  ;; client related
+  (conns (make-hash-table)) ;; dbfname -> conn
+
+  ;; server related
+  (state        'starting)
+  (areapath     #f)
+  (host         #f)
+  (port         #f)
+  (conn         #f)
+  (cleanup-proc #f)
+  (handler      #f) ;; receives data and responds
+  (socket       #f)
+  (thread       #f)
+  (host-port    #f)
+  (cmd-thread   #f)
+  (ro-mode      #f)
+  (ro-mode-checked #f)
+  (last-access  (current-seconds))
+  (servinf-file #f)
+  (last-serv-start 0)
+  )
+
+;; parameters
+;;
+(define tt-server-timeout-param (make-parameter 600))
+
+;; make ttdat visible
+(define *server-info* #f)
+
+(define (tt:make-remote areapath)
+  (make-tt areapath: areapath))
+
+;; 1 ... or #f
+;; and check that dbfname matches. FIXME: the propagation of dbfname and run-id
+;; might not make the best sense
+;;
+(define (tt:valid-run-id run-id dbfname)
+  (and (or (number? run-id)
+	   (not run-id))
+       (equal? (dbfile:run-id->dbfname run-id) dbfname)))
+
+(tcp-buffer-size 2048)
+;; (max-connections 4096) 
+
+;; do all the busy work of finding and setting up conn for
+;; connecting to a server
+;; 
+(define (tt:client-connect-to-server ttdat dbfname run-id testsuite)
+  (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id)
+  (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f))
+	 (server-start-proc (lambda ()
+			      (tt:server-process-run
+			       (tt-areapath ttdat)
+			       testsuite ;; (dbfile:testsuite-name)
+			       (common:find-local-megatest)
+			       run-id))))
+    (if conn
+	(begin 
+          ; (debug:print-info 0 *default-log-port* "already connected to the server")
+           conn) ;; we are already connected to the server
+	(let* ((sdat (tt:get-current-server-info ttdat dbfname)))
+	  (match sdat
+	    ((host port start-time server-id pid dbfname2 servinffile)
+	     (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.")
+             ;(debug:print-info 0 *default-log-port* "in match servinffile:" servinffile)
+	     (let* ((host-port (conc host":"port))
+		    (conn (make-tt-conn
+			   host: host
+			   port: port
+			   host-port: host-port
+			   dbfname: dbfname
+			   servinf-file: servinffile
+			   server-id: server-id
+			   server-start: start-time
+			   pid: pid)))
+	       ;; verify we can talk to this server
+	       (let* ((result   (tt:timed-ping host port server-id))
+		      (ping-res (car result))
+		      (ping     (cdr result)))
+                 (debug:print-info 0 *default-log-port* "ping time: " ping)
+		 (case ping-res
+		   ((running)
+		    (hash-table-set! (tt-conns ttdat) dbfname conn) ;;; is this ok to save before validating that the connection is good?
+		    conn)
+		   ((starting)
+		    (thread-sleep! 0.5)
+		    (tt:client-connect-to-server ttdat dbfname run-id testsuite))
+		   (else
+		    (let* ((curr-secs (current-seconds)))
+		      ;; rm the (last server) would go here
+		      (if (> (- curr-secs (tt-last-serv-start ttdat)) 10)
+			  (begin
+			    (tt-last-serv-start-set! ttdat curr-secs)
+			    (server-start-proc))) ;; start server if 30 sec since last attempt
+		      (thread-sleep! 1)
+		      (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
+	    (else ;; no good server found, if haven't started server in > 5 secs, start another
+	     (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 5) ;; BUG - grow this number really do not want to swamp the machine with servers
+		 (begin
+		   (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
+		   (server-start-proc)
+		   (tt-last-serv-start-set! ttdat (current-seconds))))
+	     (thread-sleep! 1)
+	     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
+
+(define (tt:timed-ping host port server-id)
+  (let* ((start-time (current-milliseconds))
+	 (result     (tt:ping host port server-id)))
+    (cons result (- (current-milliseconds) start-time))))
+    
+
+(define (tt:ping host port server-id #!optional (tries-left 5))
+  (let*  ((res      (tt:send-receive-direct host port `(ping #f #f #f) ping-mode: #t)) ;; please send me your server-id
+	  (try-again (lambda ()
+		       (if (> tries-left 0)
+			   (begin
+			     (thread-sleep! 1)
+			     (tt:ping host port server-id (- tries-left 1)))
+			   #f))))
+    ;;
+    ;; need two threads, one a 5 second timer
+    ;;
+    (match res
+      ((status errmsg result meta)
+       (if (equal? result server-id)
+	   (let* ((server-state (alist-ref 'sstate meta)))
+	     ;; (debug:print 0 *default-log-port* "Ping to "host":"port" successful.")
+	     (or server-state 'unk)) ;; then we are good
+	   (begin
+	     (debug:print 0 *default-log-port* "WARNING: server-id does not match, expected: "server-id", got: "result)
+	     #f)))
+      (else
+       ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res)
+       (try-again)))))
+
+;; client side handler
+;;
+;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
+;;
+(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)
+  ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.
+  (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f)))
+    (if conn
+	;; have connection, call the server
+	(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
+	  ;; res is (status errmsg result meta)
+         ; (debug:print 0 *default-log-port* "conn:" conn " res: " res)
+	  (match res
+	    ((status errmsg result meta)
+	     (if (list? meta)
+		 (let* ((delay-wait (alist-ref 'delay-wait meta)))
+		   (if (and (number? delay-wait)
+			    (> delay-wait 0))
+		       (begin
+			 (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds")
+			 (thread-sleep! delay-wait)))))
+	     (case status
+	       ((busy) ;; result will be how long the server wants you to delay
+		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, will try again in "result" seconds.")
+		(thread-sleep! (if (number? result) result 1))
+		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+	       ((loaded)
+		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.")
+		(tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn))
+		result) ;; (tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+	       (else
+		result)))
+	    (else ;; did not receive properly formated result
+	     (if (not res) ;; tt:handler is telling us that communication failed
+		 (let* ((host    (tt-conn-host conn))
+			(port    (tt-conn-port conn))
+			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
+			(pid     (tt-conn-pid  conn))
+                        (servinf (tt-conn-servinf-file conn))) 
+			;;(servinf (tt-servinf-file ttdat))) ;; (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname))) ;; TODO, use (server:get-servinfo-dir areapath)
+		   (hash-table-set! (tt-conns ttdat) dbfname #f)
+		   (if (and servinf (file-exists? servinf))
+		       (begin
+			 (if (< attemptnum 3)
+			     (begin
+			       (thread-sleep! 0.25)
+			       (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+			     (begin
+			       (debug:print 0 *default-log-port* "INFO: no response from server "host":"port" for "dbfname)
+			       (if (and (file-exists? servinf)
+					(> (- (current-seconds)(file-modification-time servinf)) 60))
+				   (begin
+				     (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.")
+				     (handle-exceptions
+					 exn
+				       #f
+				       (delete-file* servinf))
+				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+				   (begin
+				     ;; start server - addressed in client-connect-to-server
+				     ;; delay        - addressed in client-connect-to-server
+				     ;; try again
+				     (thread-sleep! 0.25) ;; dunno, I think this needs to be here
+				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+				   ))))
+		       (begin ;; no server file, delay and try again
+			 (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf)
+			 (thread-sleep! 0.5)
+			 (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))))
+		 (begin ;; this case is where res is malformed. Probably should abort
+		   (assert #f "FATAL: tt:handler received bad data "res)
+		   ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.")
+		   ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)
+		   )))))
+	(begin
+	  (thread-sleep! 1) ;; no conn yet set up, give it a rest and try again
+	  (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))))
+
+(define (tt:bid-for-servership run-id)
+  #f)
+
+;; gets server info and appends path to server file
+;; sorts by age, oldest first
+;;
+;; returns list of (host port startseconds server-id servinfofile)
+;;
+(define (tt:get-server-info-sorted ttdat dbfname)
+  (let* ((areapath (tt-areapath ttdat))
+	 (sfiles   (tt:find-server areapath dbfname))
+	 (sdats    (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read
+	 (sorted   (sort sdats (lambda (a b)
+				 (let* ((starta (list-ref a 2))
+					(startb (list-ref b 2)))
+				   (if (eq? starta startb)
+				       (string>? (list-ref a 3)(list-ref b 3)) ;; if servers started at same time look at server-id
+				       (< starta startb))))))
+	 (count    0))
+    (for-each
+     (lambda (rec)
+       (if (or (> (length sorted) 1)
+	       (common:low-noise-print 120 "server info sorted"))
+	   (debug:print 0 *default-log-port* "SERVER #"count": "(string-intersperse (map conc sorted) ", ")))
+       (set! count (+ count 1)))
+     sorted)
+    sorted))
+    
+(define (tt:get-current-server-info ttdat dbfname)
+  (assert (tt-areapath ttdat) "FATAL: areapath not set in ttdat.")
+  ;;
+  ;; TODO - replace most of below with tt;get-server-info-sorted
+  ;;
+  (let* ((areapath (tt-areapath ttdat))
+	 (sfiles   (tt:find-server areapath dbfname))
+	 (sdats    (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read
+	 (sorted   (sort sdats (lambda (a b)
+				 (< (list-ref a 2)(list-ref b 2))))))
+    (if (null? sorted)
+	#f  ;; we'll want to wait until extra servers have exited
+	(car sorted))))
+
+(define (tt:send-receive ttdat conn cmd run-id params)
+  (let* ((host-port (tt-conn-host-port conn)) ;; (conc (tt-conn-host conn)":"(tt-conn-port conn)))
+	 (host      (tt-conn-host conn))
+	 (port      (tt-conn-port conn))
+	 (dat       (list cmd run-id params #f))) ;; no meta data yet
+    (tt:send-receive-direct host port dat)))
+
+(defstruct tt:backoff
+  (last-ioerr (current-seconds))
+  (last-adj-t (current-seconds))
+  (wait-delay 0.1))
+
+(define *tt:backoff-smoothing* (make-hash-table)) ;; host:port => lastaccess backoffdelay )
+
+(define (tt:backoff-incr host port) ;; call if tcp fails i/o net
+  (let* ((host-port (conc host":"port))
+	 (bkoff     (hash-table-ref/default *tt:backoff-smoothing* host-port #f)))
+    (if bkoff
+	(begin
+	  (tt:backoff-last-ioerr-set! bkoff (current-seconds))
+	  (tt:backoff-wait-delay-set! bkoff (+ (tt:backoff-wait-delay bkoff) 0.1)))
+	(hash-table-set! *tt:backoff-smoothing* host-port (make-tt:backoff)))))
+
+(define (tt:backoff-decr-and-wait host port)
+  (let* ((host-port (conc host":"port))
+	 (bkoff     (hash-table-ref/default *tt:backoff-smoothing* host-port #f)))
+    (if bkoff
+	(let* ((wait-delay (tt:backoff-wait-delay bkoff))
+	       (last-ioerr (tt:backoff-last-ioerr bkoff))
+	       (last-adj-t (tt:backoff-last-adj-t bkoff))
+	       (delta      (- (current-seconds) last-adj-t))
+	       (adj        (* delta 0.001)) ;; it takes 100 seconds to recover from hitting an io err
+	       (new-wait   (if (> wait-delay 0)
+			       (if (> adj wait-delay)
+				   0
+				   (- wait-delay adj))
+			       0)))
+	  (if (> new-wait 0)
+	      (begin
+		(if (common:low-noise-print 10 "delay wait message")
+		    (debug:print-info 0 *default-log-port* "Server loaded, DelayWait: "new-wait))
+		(tt:backoff-wait-delay-set! bkoff new-wait)
+		(tt:backoff-last-adj-t-set! bkoff (current-seconds))
+		(thread-sleep! new-wait))
+	      (hash-table-delete! *tt:backoff-smoothing* host-port))))))
+
+(define (tt:send-receive-direct host port dat #!key (ping-mode #f)(tries-remaining 25))
+  (assert (number? port) "FATAL: tt:send-receive-direct called with port not a number "port)
+  (tt:backoff-decr-and-wait host port)
+  (let* ((retry          (lambda ()
+			   (tt:send-receive-direct host port dat tries-remaining: (- tries-remaining 1))))
+	 (full-err-print (lambda (exn msg)
+			   (if (condition? exn)
+			       (begin
+				 (pp (condition->list exn) *default-log-port*)
+				 (pp dat *default-log-port*)
+				 (debug:print 0 *default-log-port* msg
+					      ", error: "     ((condition-property-accessor 'exn 'message)   exn)
+					      ", arguments: " ((condition-property-accessor 'exn 'arguments) exn)
+					      ", location: "  ((condition-property-accessor 'exn 'location)  exn)
+					      ))
+			       (debug:print 0 *default-log-port* msg "(note: exn="exn", is not a condition object.")))))
+    (condition-case
+     (let-values (((inp oup)(tcp-connect host port)))
+       (let ((res (if (and inp oup)
+		      (begin
+			(serialize dat oup)
+			(close-output-port oup)
+			(deserialize inp))
+		      )))
+	 (close-input-port inp)
+	 (match res
+	   ((result exn-result stdout-result)
+	    (if exn-result
+		(full-err-print exn-result "ERROR: Server side exception detected"))
+	    (if stdout-result
+		(debug:print 0 *default-log-port* "ERROR: Output detected on stdout on server side execution => "stdout-result))
+	    result)
+	   (else
+	    (debug:print 0 *default-log-port* "ERROR: server returned non-standard output: "res)
+	    #f))))
+     (exn (io-error)
+	  (full-err-print exn  "ERROR: i/o error")
+	  (tt:backoff-incr host port)
+	  #f)
+     (exn (i/o net)
+	  (if ping-mode
+	      #f
+	      (cond
+	       ((>  tries-remaining 4) ;; server likely defunct
+		(tt:backoff-incr host port)
+		#f)
+	       ((>= tries-remaining 0)
+		(let* ((backoff-delay (max (* (- 26 tries-remaining) 0.1) 1.0)))
+		  (debug:print 0 *default-log-port* "WARNING: TCP overload, trying again in "backoff-delay"s.")
+		  (thread-sleep! backoff-delay)
+		  (tt:backoff-incr host port)
+		  (retry))
+		;; (assert #f "FATAL: Too many retries in tt:send-receive-direct")
+		)
+	       (else #f))))
+     (exn ()
+	  (full-err-print exn "Unhandled exception from client side.")
+	  #f))))
+
+
+;;======================================================================
+;; server
+;;======================================================================
+
+(define (tt:sync-dbs ttdat)
+  #f)
+
+;; start the listener and start responding to requests
+;;
+;; NOTE: organise by dbfname, not run-id so we don't need
+;;       to pull in more modules
+;;
+;; This is the routine called in megatest.scm to start a server
+;;
+;; Server viability is checked in keep-running. Blindly start and run here.
+;;
+(define (tt:start-server areapath run-id dbfname-in handler keys)
+  (assert areapath "FATAL: areapath not provided for tt:start-server")
+  ;; is there already a server for this dbfile? Then exit.
+  (let* ((ttdat   (make-tt areapath: areapath))
+	 (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id)))
+	 (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead
+    (if (> (length servers) 4)
+	(begin
+	  (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.")
+	  (exit))
+	(let* ((dbstruct   (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
+	  (tt-handler-set! ttdat (handler dbstruct))
+	  (let* ((tcp-thread (make-thread
+			      (lambda ()
+				(tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data
+			      "tcp-server-thread"))
+		 (run-thread (make-thread
+			      (lambda ()
+				(tt:keep-running ttdat dbfname dbstruct)))))
+	    (thread-start! tcp-thread)
+	    (thread-start! run-thread)
+	    (thread-join! run-thread) ;; run thread will exit on timeout or other conditions
+	    (exit))))))
+
+(define (tt:keep-running ttdat dbfname dbstruct)
+  ;; verfiy conn for ready
+  ;; listener socket has been started by this stage
+  ;; wait for a port before creating the registration file
+  ;;
+  (let* ((db-locked-in #f)
+	 (areapath     (tt-areapath ttdat))
+	 (nosyncdbpath (conc areapath"/.mtdb"))
+	 (cleanup (lambda ()
+		    (if (tt-cleanup-proc ttdat)
+			((tt-cleanup-proc ttdat)))
+		    (dbfile:with-no-sync-db nosyncdbpath
+					    (lambda (db)
+					      (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct)))
+						(debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname)
+						(db:no-sync-del! db dbfname)
+						#;(if dbtmpname
+						    (delete-file dbtmpname))))))))
+    (set! *server-info* ttdat)
+    (let loop ((count 0))
+      (if (> count 240)
+	  (begin
+	    (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.")
+	    (exit 1))
+	  (if (not (tt-port ttdat)) ;; no connection yet
+	      (begin
+		(thread-sleep! 0.25)
+		(loop (+ count 1))))))
+    
+    (tt:create-server-registration-file ttdat dbfname)
+    ;; now start watching the last-access, if it hasn't been touched
+    ;; in over ten seconds we exit
+    (thread-sleep! 0.05) ;; any real need for delay here?
+    (let loop ()
+      (let* ((servers (tt:get-server-info-sorted ttdat dbfname))
+	     (ok      (cond
+		       ((null? servers) #f) ;; not ok
+		       ((equal? (list-ref (car servers) 6) ;; compare the servinfofile
+				(tt-servinf-file ttdat))
+			(let* ((res (if db-locked-in
+					#t
+					(let* ((lock-result  ;; this is the primary lock - need to double verify that got it
+						(dbfile:with-no-sync-db
+						 nosyncdbpath
+						 (lambda (db)
+						   (db:no-sync-lock-and-check db dbfname
+									      (tt-servinf-file ttdat)
+									      ;; (dbr:dbstruct-dbtmpname dbstruct)
+									      ))))
+					       (success (car lock-result)))
+					  (if success
+					      (begin
+						(tt-state-set! ttdat 'running)
+						(debug:print 0 *default-log-port* "Got server lock for " dbfname)
+						(set! db-locked-in #t)
+						#t)
+					      (begin
+						(debug:print 0 *default-log-port* "Failed to get server lock for "dbfname)
+						#f))))))
+			  (if (and res (common:low-noise-print 120 "top server message"))
+			      (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for "
+						dbfname" on "(tt-host ttdat)":"(tt-port ttdat)))
+			  res))
+		       (else
+			(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
+			(let* ((leadsrv (car servers)))
+			  (match leadsrv
+			    ((host port startseconds server-id pid dbfname servinfofile)
+			     (let* ((result  (tt:timed-ping host port server-id))
+				    (res     (car result))
+				    (ping    (cdr result)))
+			       (debug:print-info 0 *default-log-port* "Ping to "host":"port", with server-id "server-id
+						 ", and file "servinfofile" returned "res)
+			       (if res
+				   #f ;; not the server, but all good, want to exit
+				   (if (and (file-exists? servinfofile)
+					  (> (- (current-seconds)(file-modification-time servinfofile)) 30))
+				     (begin
+				       ;; can't ping and file has been on disk 15 seconds, go ahead and try to remove it
+				       (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile)
+                                       (handle-exceptions
+                                        exn
+                                        (debug:print-info 0 *default-log-port* "Error removing server info file: "servinfofile)
+				        (delete-file* servinfofile)
+                                       )
+				       #t) ;; not the server but the server is not reachable
+				     (begin
+				       (debug:print 0 *default-log-port* "I'm not the server but could not ping "host":"port", will try again.")
+				       (thread-sleep! 1) ;; just because
+				       #t)))))
+			    (else ;; should never get here
+			     (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv)
+			     (assert #f "Bad server record "leadsrv))))))))
+	(if ok
+	    (tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access
+	    (begin
+	      (debug:print 0 *default-log-port* "Exiting immediately")
+	      (cleanup)
+	      (exit)))
+
+	(let* ((last-update (dbr:dbstruct-last-update dbstruct))
+	       (curr-secs   (current-seconds)))
+	  (if (and (eq? (tt-state ttdat) 'running)
+		   (> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db?
+	      (begin
+		(set! (file-modification-time (tt-servinf-file ttdat)) (current-seconds))
+		((dbr:dbstruct-sync-proc dbstruct) last-update)
+		(dbr:dbstruct-last-update-set! dbstruct curr-secs))))
+	  
+	(if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param))
+	    (begin
+	      (thread-sleep! 5)
+	      (loop)))))
+    (cleanup)
+    (debug:print 0 *default-log-port* "INFO: Server timed out, exiting.")))
+
+  
+;; ;; given an already set up uconn start the cmd-loop
+;; ;;
+;; (define (tt:cmd-loop ttdat)
+;;   (let* ((serv-listener (-socket uconn))
+;; 	 (listener      (lambda ()
+;; 			  (let loop ((state 'start))
+;; 			    (let-values (((inp oup)(tcp-accept serv-listener)))
+;; 			      ;; (mutex-lock! *send-mutex*) ;; DOESN'T SEEM TO HELP
+;; 			      (let* ((rdat  (deserialize inp)) ;; '(my-host-port qrykey cmd params)
+;; 				     (resp  (ulex-handler uconn rdat)))
+;; 				(serialize resp oup)
+;; 				(close-input-port inp)
+;; 				(close-output-port oup)
+;; 				;; (mutex-unlock! *send-mutex*) ;; DOESN'T SEEM TO HELP
+;; 				)
+;; 			      (loop state))))))
+;;     ;; start N of them
+;;     (let loop ((thnum   0)
+;; 	       (threads '()))
+;;       (if (< thnum 100)
+;; 	  (let* ((th (make-thread listener (conc "listener" thnum))))
+;; 	    (thread-start! th)
+;; 	    (loop (+ thnum 1)
+;; 		  (cons th threads)))
+;; 	  (map thread-join! threads)))))
+;; 
+;; 
+;; 
+;; (define (wait-and-close uconn)
+;;   (thread-join! (udat-cmd-thread uconn))
+;;   (tcp-close (udat-socket uconn)))
+;; 
+;; 
+
+(define (tt:shutdown-server ttdat)
+  (let* ((cleanproc (tt-cleanup-proc ttdat))
+	 (port      (tt-port         ttdat)))
+    (tt-state-set! ttdat 'shutdown)
+    (portlogger:open-run-close portlogger:set-port port "released")
+    (if cleanproc (cleanproc))
+    (tcp-close (tt-socket ttdat)) ;; close up ports here
+    ))
+
+;; (define (wait-and-close uconn)
+;;   (thread-join! (tt-cmd-thread uconn))
+;;   (tcp-close (tt-socket uconn)))
+
+;; return servid
+;; side-effects:
+;;   ttdat-cleanup-proc is populated with function to remove the serverinfo file
+(define (tt:create-server-registration-file ttdat dbfname)
+  (let* ((areapath (tt-areapath ttdat))
+	 (servdir  (tt:get-servinfo-dir areapath))
+	 (host     (tt-host ttdat))
+	 (port     (tt-port ttdat))
+	 (servinf (conc servdir"/"host":"port"-"(current-process-id)":"dbfname))
+	 (serv-id (tt:mk-signature areapath))
+	 (clean-proc (lambda ()
+		       (delete-file* servinf)
+		       )))
+    (assert (and host port) "FATAL: tt:create-server-registration-file called with no conn, dbfname="dbfname)
+    (tt-cleanup-proc-set! ttdat clean-proc)
+    (tt-servinf-file-set! ttdat servinf)
+    (with-output-to-file servinf
+      (lambda ()
+	(print "SERVER STARTED: "host":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)" dbfname: "dbfname)))
+      serv-id))
+
+;; find valid server
+;; get servers listed, last part of name must match :<dbfname>
+;; if more than one, wait one second and look again
+;; future: ping oldest, if alive remove other :<dbfname> files
+;;
+(define (tt:find-server areapath dbfname)
+  (let* ((servdir  (tt:get-servinfo-dir areapath))
+	 (sfiles   (glob (conc servdir"/*:"dbfname))))
+    sfiles))
+
+;; given a path to a server info file return: host port startseconds server-id pid dbfname logf
+;; example of what it's looking for in the log file:
+;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
+;;
+(define (tt:server-get-info logf)
+  (let ((server-rx    (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
+        (dbprep-rx    (regexp "^SERVER: dbprep"))
+        (dbprep-found 0)
+	(bad-dat      (list #f #f #f #f #f #f logf)))
+     (let ((fdat     (handle-exceptions
+			 exn
+		       (begin
+			 ;; WARNING: this is potentially dangerous to blanket ignore the errors
+			 (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn="(condition->list exn))
+			 '()) ;; no idea what went wrong, call it a bad server, return empty list
+		       (with-input-from-file logf read-lines))))
+       (if (null? fdat) ;; bad data, return bad-dat
+	   bad-dat
+	   (let loop ((inl  (car fdat))
+		      (tail (cdr fdat))
+		      (lnum 0))
+	     (let ((mlst (string-match server-rx inl))
+		   (dbprep (string-match dbprep-rx inl)))
+	       (if dbprep (set! dbprep-found 1))
+	       (if (not mlst)
+		   (if (> lnum 500) ;; give up if more than 500 lines of server log read
+		       bad-dat
+		       (if (null? tail)
+			   bad-dat
+			   (loop (car tail)(cdr tail)(+ lnum 1))))
+		   (match mlst ;; have a not null list
+		     ((_ host port start server-id pid dbfname)
+		      (list host
+			    (string->number port)
+			    (string->number start)
+			    server-id
+			    (string->number pid)
+			    dbfname
+			    logf))
+		     (else
+		      (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst)
+		      bad-dat)))))))))
+
+;; Given an area path,  start a server process    ### NOTE ### > file 2>&1 
+;; if the target-host is set 
+;; try running on that host
+;;   incidental: rotate logs in logs/ dir.
+;;
+(define  (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area
+  (assert areapath  "FATAL: tt:server-process-run called without areapath defined.")
+  (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.")
+  (assert mtexe     "FATAL: tt:server-process-run called without mtexe defined.")
+  ;; mtest -server - -m testsuite:ext-tests -db 6.db
+  (let* ((dbfname  (dbmod:run-id->dbfname run-id))
+	 (load     (get-normalized-cpu-load))
+	 (trying   (length (tt:find-server areapath dbfname)))
+	 (nrun     (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname))))
+    (cond
+     ((> load 2.0)
+      (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server.")
+      (thread-sleep! 1))
+     ((> nrun 100)
+      (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.")
+      (thread-sleep! 1))
+     ((> trying 4)
+      (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
+      (thread-sleep! 1))
+     (else
+      (if (not (file-exists? (conc areapath"/logs")))
+	      (create-directory (conc areapath"/logs") #t))
+	  (let* ((logfile   (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))
+		 (cmdln     (conc
+			     mtexe
+			     " -server - ";; (or target-host "-")
+			     " -m testsuite:" testsuite
+			     ;; " -run-id " (or run-id "main") ;; NO, we do NOT want to have run id as part of this
+			     " -db "  dbfname ;; (dbmod:run-id->dbfname run-id)
+			     " " profile-mode
+			     ))) ;; (conc " >> " logfile " 2>&1 &")))))
+	    ;; we want the remote server to start in *toppath* so push there
+	    ;; (push-directory areapath) ;; use cd in the command line instead
+	    (debug:print 0 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath)
+	    ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
+	    (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
+	    (setenv "NBFAKE_LOG" logfile)
+	    (system (conc "cd "areapath" ; nbfake " cmdln))
+	    (unsetenv "NBFAKE_QUIET")
+	    (unsetenv "NBFAKE_LOG")
+	    ;;(pop-directory)
+	    )))))
+
+;;======================================================================
+;; tcp connection stuff
+;;======================================================================
+
+;; find a port and start tcp-server. This only starts the tcp portion of
+;; the server, look at (tt:start-server ...) above for the entry point
+;; for the entire server system
+;;
+(define (tt:start-tcp-server ttdat)
+  (setup-listener-portlogger ttdat) ;; set up tcp-listener
+  (let* ((socket   (tt-socket  ttdat))
+	 (handler  (tt-handler ttdat)) ;; the handler comes from our client setting a handler function
+	 (handler-proc (lambda ()
+			 (let* ((indat         (deserialize))
+				(result        #f)
+				(exn-result    #f)
+				(stdout-result (with-output-to-string
+						 (lambda ()
+						   (let ((res (handle-exceptions
+								  exn
+								(let* ((errdat (condition->list exn)))
+								  (set! exn-result errdat)
+								  (debug:print 0 *default-log-port* "ERROR: handler exception, these are bad, will exit in five seconds.")
+								  (pp errdat *default-log-port*)
+								  ;; these are always bad, set up an exit thread
+								  (thread-start! (make-thread (lambda ()
+												(thread-sleep! 5)
+												(exit))))
+								  #f)
+								(handler indat) ;; this is the proc being called by the remote client
+								)))
+						     (set! result res)))))
+				(full-result (list result exn-result (if (equal? stdout-result "") #f stdout-result))))
+			   (handle-exceptions
+			       exn
+			     (begin
+			       (debug:print 0 *default-log-port* "Serialization failure. full-result="full-result)
+			       ;; (serialize '(#f #f #f)) ;; doesn't work - the first call to serialize caused failure
+			       )
+			     (serialize full-result))))))
+    ((make-tcp-server socket handler-proc)
+     #f ;; yes, send error messages to std-err
+     )))
+
+;; create a tcp listener and return a populated udat struct with
+;; my port, address, hostname, pid etc.
+;; return #f if fail to find a port to allocate.
+;;
+;;  if udata-in is #f create the record
+;;  if there is already a serv-listener return the udata
+;;
+;; (define (setup-listener uconn #!optional (port 4242))
+;;   (assert (tt? uconn) "FATAL: setup-listener called with wrong struct "uconn)
+;;   (handle-exceptions
+;;    exn
+;;    (if (< port 65535)
+;;        (begin
+;; 	 (thread-sleep! 0.25)
+;; 	 (setup-listener uconn (+ port 1)))
+;;        #f)
+;;    (connect-listener uconn port)))
+
+(define (setup-listener-portlogger uconn)
+  (let ((port (portlogger:open-run-close portlogger:find-port)))
+    (assert (tt? uconn) "FATAL: setup-listener called with wrong struct "uconn)
+    (handle-exceptions
+	exn
+      (if (< port 65535)
+	  (begin
+	    (portlogger:open-run-close portlogger:set-failed port)
+	    (thread-sleep! 0.25)
+	    (setup-listener-portlogger uconn))
+	  #f)
+      (connect-listener uconn port))))
+
+(define (connect-listener uconn port)
+  ;; (tcp-listener-socket LISTENER)(socket-name so)
+  ;; sockaddr-address, sockaddr-port, sockaddr->string
+  (let* ((tlsn (tcp-listen port 10000 #f)) ;; (tcp-listen TCPPORT [BACKLOG [HOST]])
+	 (addr  (tt:get-best-guess-address (get-host-name)))) ;; (get-my-best-address))) ;; (hostinfo-addresses (host-information (current-hostname)))
+    (tt-port-set!      uconn port)
+    (tt-host-set!      uconn addr)
+    (tt-host-port-set! uconn (conc addr":"port))
+    (tt-socket-set!    uconn tlsn)
+    uconn))
+
+;;======================================================================
+;; utils
+;;======================================================================
+
+;; Generate a unique signature for this server
+(define (tt:mk-signature areapath)
+  (message-digest-string (md5-primitive) 
+			 (with-output-to-string
+			   (lambda ()
+			     (write (list areapath
+                                          (current-process-id)
+					  (argv)))))))
+
+
+(define (tt:get-best-guess-address hostname)
+  (let ((res #f))
+    (for-each 
+     (lambda (adr)
+       (if (not (eq? (u8vector-ref adr 0) 127))
+	   (set! res adr)))
+     ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME
+     (vector->list (hostinfo-addresses (hostname->hostinfo hostname))))
+    (string-intersperse 
+     (map number->string
+	  (u8vector->list
+	   (if res res (hostname->ip hostname)))) ".")))
+
+(define (tt:get-servinfo-dir areapath)
+  (let* ((spath (conc areapath"/.servinfo")))
+    (if (not (file-exists? spath))
+	(create-directory spath #t))
+    spath))
+
+;;======================================================================
+;; network utilities
+;;======================================================================
+
+;; NOTE: Look at address-info egg as alternative to some of this
+
+(define (rate-ip ipaddr)
+  (regex-case ipaddr
+    ( "^127\\..*" _ 0 )
+    ( "^(10\\.0|192\\.168)\\..*" _ 1 )
+    ( else 2 ) ))
+
+;; Change this to bias for addresses with a reasonable broadcast value?
+;;
+(define (ip-pref-less? a b)
+  (> (rate-ip a) (rate-ip b)))
+
+(define (get-my-best-address)
+  (let ((all-my-addresses (get-all-ips)))
+    (cond
+     ((null? all-my-addresses)
+      (get-host-name))                                          ;; no interfaces?
+     ((eq? (length all-my-addresses) 1)
+      (car all-my-addresses))                      ;; only one to choose from, just go with it
+     (else
+      (car (sort all-my-addresses ip-pref-less?))))))
+
+(define (get-all-ips-sorted)
+  (sort (get-all-ips) ip-pref-less?))
+
+(define (get-all-ips)
+  (map address-info-host
+       (filter (lambda (x)
+		 (equal? (address-info-type x) "tcp"))
+	       (address-infos (get-host-name)))))
+
+)

Index: tdb.scm
==================================================================
--- tdb.scm
+++ tdb.scm
@@ -20,22 +20,30 @@
 
 ;;======================================================================
 ;; Database access
 ;;======================================================================
 
+(declare (unit tdb))
+(declare (uses debugprint))
+(declare (uses common))
+(declare (uses keys))
+(declare (uses ods))
+(declare (uses mt))
+(declare (uses db))
+(declare (uses commonmod))
+(declare (uses mtargs))
+(declare (uses rmtmod))
+
 (require-extension (srfi 18) extras tcp)
 (use sqlite3 srfi-1 posix regex regex-case srfi-69 csv-xml s11n md5 message-digest base64)
 (import (prefix sqlite3 sqlite3:))
 (import (prefix base64 base64:))
 
-(declare (unit tdb))
-(declare (uses common))
-(declare (uses keys))
-(declare (uses ods))
-(declare (uses client))
-(declare (uses mt))
-(declare (uses db))
+(import commonmod
+	debugprint
+	rmtmod
+	(prefix mtargs args:))
 
 (include "common_records.scm")
 (include "db_records.scm")
 (include "key_records.scm")
 (include "run_records.scm")

Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -21,25 +21,24 @@
 ;;======================================================================
 ;; Tests
 ;;======================================================================
 
 (declare (unit tests))
-(declare (uses lock-queue))
 (declare (uses db))
 (declare (uses tdb))
+(declare (uses debugprint))
 (declare (uses common))
 (declare (uses commonmod))
-;; (declare (uses dcommon)) ;; needed for the steps processing
 (declare (uses items))
 (declare (uses runconfig))
-;; (declare (uses sdb))
 (declare (uses server))
-;;(declare (uses stml2))
+(declare (uses mtargs))
+(declare (uses rmtmod))
 
-(use sqlite3 srfi-1 posix regex regex-case srfi-69 dot-locking tcp directory-utils)
+(use sqlite3 srfi-1 posix regex regex-case srfi-69 dot-locking directory-utils)
 (import (prefix sqlite3 sqlite3:))
-(import commonmod)
+(import commonmod (prefix mtargs args:) debugprint rmtmod)
 (require-library stml)
 
 (include "common_records.scm")
 (include "key_records.scm")
 (include "db_records.scm")
@@ -1966,13 +1965,13 @@
 ;;======================================================================
 
 ;; teststep-set-status! used to be here
 
 (define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)
-  (let* ((testdat   (rmt:get-test-info-by-id run-id test-id)))
+  (let* ((testdat   (rmt:get-test-state-status-by-id run-id test-id)))
     (and testdat
-	 (equal? (test:get-state testdat) "KILLREQ"))))
+	 (equal? (car testdat) "KILLREQ"))))
 
 (define (test:tdb-get-rundat-count tdb)
   (if tdb
       (let ((res 0))
 	(sqlite3:for-each-row

ADDED   transport-mode.scm.template
Index: transport-mode.scm.template
==================================================================
--- /dev/null
+++ transport-mode.scm.template
@@ -0,0 +1,22 @@
+;;======================================================================
+;; set up transport, db cache and sync methods
+;;
+;; sync-method:        'original, 'attach or 'none
+;; cache-method:       'tmp 'none
+;; rmt:transport-mode: 'http, 'tcp, 'nfs
+;;
+;; NOTE: NOT ALL COMBINATIONS WORK
+;;
+;;======================================================================
+
+;; uncomment this block to test without tcp
+;; (dbfile:sync-method 'none)
+;; (dbfile:cache-method 'none)
+;; (rmt:transport-mode 'nfs)
+
+;; uncomment this block to test with tcp
+(dbfile:sync-method 'original) ;; attach) ;; original
+(dbfile:cache-method 'tmp)
+(rmt:transport-mode 'tcp)
+
+

Index: tree.scm
==================================================================
--- tree.scm
+++ tree.scm
@@ -16,27 +16,29 @@
 ;;     You should have received a copy of the GNU General Public License
 ;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 ;;
 ;;======================================================================
 
+(declare (unit tree))
+(declare (uses mtargs))
+(declare (uses debugprint))
+(declare (uses launch))
+(declare (uses gutils))
+(declare (uses db))
+(declare (uses server))
+(declare (uses dcommon))
+
 (use format)
 (require-library iup)
 (import (prefix iup iup:))
 (use canvas-draw)
 
 (use sqlite3 srfi-1 posix regex regex-case srfi-69)
 (import (prefix sqlite3 sqlite3:))
 
-(declare (unit tree))
-(declare (uses margs))
-(declare (uses launch))
-;; (declare (uses megatest-version))
-(declare (uses gutils))
-(declare (uses db))
-(declare (uses server))
-;; (declare (uses synchash))
-(declare (uses dcommon))
+(import (prefix mtargs args:)
+	debugprint)
 
 (include "megatest-version.scm")
 (include "common_records.scm")
 (include "db_records.scm")
 (include "key_records.scm")

DELETED ulex.scm
Index: ulex.scm
==================================================================
--- ulex.scm
+++ /dev/null
@@ -1,24 +0,0 @@
-;;======================================================================
-;; Copyright 2019, Matthew Welland.
-;; 
-;; This file is part of Megatest.
-;; 
-;;     Megatest is free software: you can redistribute it and/or modify
-;;     it under the terms of the GNU General Public License as published by
-;;     the Free Software Foundation, either version 3 of the License, or
-;;     (at your option) any later version.
-;; 
-;;     Megatest is distributed in the hope that it will be useful,
-;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
-;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;;     GNU General Public License for more details.
-;; 
-;;     You should have received a copy of the GNU General Public License
-;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
-
-;;======================================================================
-
-(declare (unit ulex))
-(declare (uses pkts))
-
-(include "ulex/ulex.scm")

ADDED   ulex/dbmgr.scm
Index: ulex/dbmgr.scm
==================================================================
--- /dev/null
+++ ulex/dbmgr.scm
@@ -0,0 +1,1131 @@
+;;======================================================================
+;; Copyright 2022, Matthew Welland.
+;; 
+;; This file is part of Megatest.
+;; 
+;;     Megatest is free software: you can redistribute it and/or modify
+;;     it under the terms of the GNU General Public License as published by
+;;     the Free Software Foundation, either version 3 of the License, or
+;;     (at your option) any later version.
+;; 
+;;     Megatest is distributed in the hope that it will be useful,
+;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;     GNU General Public License for more details.
+;; 
+;;     You should have received a copy of the GNU General Public License
+;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+
+;;======================================================================
+
+(declare (unit dbmgrmod))
+
+(declare (uses ulex))
+(declare (uses apimod))
+(declare (uses pkts))
+(declare (uses commonmod))
+(declare (uses dbmod))
+(declare (uses mtargs))
+(declare (uses portloggermod))
+(declare (uses debugprint))
+
+(module dbmgrmod
+    *
+
+(import scheme
+	chicken.base
+	chicken.condition
+	chicken.file
+	chicken.format
+	chicken.port
+	chicken.process
+	chicken.process-context
+	chicken.process-context.posix
+	chicken.sort
+	chicken.string
+	chicken.time
+	
+	(prefix sqlite3 sqlite3:)
+	matchable
+	md5
+	message-digest
+	regex
+	s11n
+	srfi-1
+	srfi-18
+	srfi-69
+	system-information
+	typed-records
+	
+	pkts
+	ulex
+
+	commonmod
+	apimod
+	dbmod
+	debugprint
+	(prefix mtargs args:)
+	portloggermod
+	)
+
+;; Configurations for server
+;; (tcp-buffer-size 2048)
+;; (max-connections 2048) 
+
+;; info about me as a listener and my connections to db servers
+;; stored (for now) in *db-serv-info*
+;;
+(defstruct servdat
+  (host #f)
+  (port #f)
+  (uuid #f)
+  (dbfile #f)
+  (uconn   #f) ;; this is the listener *FOR THIS PROCESS*
+  (mode    #f)
+  (status 'starting)
+  (trynum 0) ;; count the number of ports we've tried
+  (conns  (make-hash-table)) ;; apath/dbname => conndat
+  ) 
+
+(define *db-serv-info* (make-servdat))
+
+(define (servdat->url sdat)
+  (conc (servdat-host sdat)":"(servdat-port sdat)))
+
+;; db servers contact info
+;;
+(defstruct conndat
+  (apath    #f)
+  (dbname   #f)
+  (fullname #f)
+  (hostport #f)
+  (ipaddr   #f)
+  (port     #f)
+  (srvpkt   #f)
+  (srvkey   #f)
+  (lastmsg  0)
+  (expires  0))
+
+(define *srvpktspec*
+  `((server (host    . h)
+	    (port    . p)
+	    (servkey . k)
+	    (pid     . i)
+	    (ipaddr  . a)
+	    (dbpath  . d))))
+
+;;======================================================================
+;;  S U P P O R T   F U N C T I O N S
+;;======================================================================
+
+;; set up the api proc, seems like there should be a better place for this?
+;;
+;; IS THIS NEEDED ANYMORE? TODO - REMOVE IF POSSIBLE
+;;
+;; (define api-proc (make-parameter conc))
+;; (api-proc api:execute-requests)
+
+;; do we have a connection to apath dbname and
+;; is it not expired? then return it
+;;
+;; else setup a connection
+;;
+;; if that fails, return '(#f "some reason") ;; NB// convert to raising an exception
+;;
+(define (rmt:get-conn remdat apath dbname)
+  (let* ((fullname (db:dbname->path apath dbname)))
+    (hash-table-ref/default (servdat-conns remdat) fullname #f)))
+
+(define (rmt:drop-conn remdat apath dbname)
+  (let* ((fullname (db:dbname->path apath dbname)))
+    (hash-table-delete! (servdat-conns remdat) fullname)))
+  
+(define (rmt:find-main-server uconn apath dbname)
+  (let* ((pktsdir     (get-pkts-dir apath))
+	 (all-srvpkts (get-all-server-pkts pktsdir *srvpktspec*))
+	 (viable-srvs (get-viable-servers all-srvpkts dbname)))
+    (get-the-server uconn apath viable-srvs)))
+
+
+(define *connstart-mutex* (make-mutex))
+(define *last-main-start* 0)
+
+;; looks for a connection to main, returns if have and not exired
+;; creates new otherwise
+;; 
+;; connections for other servers happens by requesting from main
+;;
+;; TODO: This is unnecessarily re-creating the record in the hash table
+;;
+(define (rmt:open-main-connection remdat apath)
+  (let* ((fullpath (db:dbname->path apath ".db/main.db"))
+	 (conns    (servdat-conns remdat))
+	 (conn     (rmt:get-conn remdat apath ".db/main.db")) ;; (hash-table-ref/default conns fullpath #f)) ;; TODO - create call for this
+	 (start-rmt:run (lambda ()
+			  (let* ((th1 (make-thread (lambda ()(rmt:run (get-host-name))) "non-db mode server")))
+			    (thread-start! th1)
+			    (thread-sleep! 1)
+			    (let loop ((count 0))
+			      (assert (< count 30) "FATAL: responder failed to initialize in rmt:open-main-connection")
+			      (if (or (not *db-serv-info*)
+				      (not (servdat-uconn *db-serv-info*)))
+				  (begin
+				    (thread-sleep! 1)
+				    (loop (+ count 1)))
+				  (begin
+				    (servdat-mode-set! *db-serv-info* 'non-db)
+				    (servdat-uconn *db-serv-info*)))))))
+	 (myconn    (servdat-uconn *db-serv-info*)))
+    (cond
+     ((not myconn)
+      (start-rmt:run)
+      (rmt:open-main-connection remdat apath))
+     ((and conn                                             ;; conn is NOT a socket, just saying ...
+	   (< (current-seconds) (conndat-expires conn)))
+      #t) ;; we are current and good to go - we'll deal elsewhere with a server that was killed or died 
+     ((and conn
+	   (>= (current-seconds)(conndat-expires conn)))
+      (debug:print-info 0 *default-log-port* "connection to "fullpath" server expired. Reconnecting.")
+      (rmt:drop-conn remdat apath ".db/main.db") ;;
+      (rmt:open-main-connection remdat apath))
+     (else
+      ;; Below we will find or create and connect to main
+      (debug:print-info 0 *default-log-port* "rmt:open-main-connection - starting from scratch")
+      (let* ((dbname         (db:run-id->dbname #f))
+	     (the-srv        (rmt:find-main-server myconn apath dbname))
+	     (start-main-srv (lambda () ;; call IF there is no the-srv found
+			       (mutex-lock! *connstart-mutex*)
+			       (if (> (- (current-seconds) *last-main-start*) 5) ;; at least four seconds since last attempt to start main server
+				   (begin
+				     (api:run-server-process apath dbname)
+				     (set! *last-main-start* (current-seconds))
+				     (thread-sleep! 1))
+				   (thread-sleep! 0.25))
+			       (mutex-unlock! *connstart-mutex*)
+			       (rmt:open-main-connection remdat apath) ;; TODO: Add limit to number of tries
+			       )))
+	(if (not the-srv) ;; have server, try connecting to it
+	    (start-main-srv)
+	    (let* ((srv-addr (server-address the-srv)) ;; need serv
+		   (ipaddr   (alist-ref 'ipaddr  the-srv))
+		   (port     (alist-ref 'port    the-srv))
+		   (srvkey   (alist-ref 'servkey the-srv))
+		   (fullpath (db:dbname->path apath dbname))
+		   
+		   (new-the-srv (make-conndat
+				 apath:   apath
+				 dbname:  dbname
+				 fullname: fullpath
+				 hostport: srv-addr
+				 ;; socket: (open-nn-connection srv-addr)  - TODO - open ulex connection?
+				 ipaddr: ipaddr
+				 port: port
+				 srvpkt: the-srv
+				 srvkey: srvkey ;; generated by rmt:get-signature on the server side
+				 lastmsg: (current-seconds)
+				 expires: (+ (current-seconds)
+					     (server:expiration-timeout)
+					     -2) ;; this needs to be gathered during the ping
+				 )))
+	      (hash-table-set! conns fullpath new-the-srv)))
+	#t)))))
+
+;; NB// sinfo is a servdat struct
+;;
+(define (rmt:general-open-connection sinfo apath dbname #!key (num-tries 5))
+  (assert (not (equal? dbname ".db/main.db")) "ERROR: general-open-connection should never be called with main as the db")
+  (let* ((mdbname  ".db/main.db") ;; (db:run-id->dbname #f)) TODO: put this back to the lookup when stable
+	 (fullname (db:dbname->path apath dbname))
+	 (conns    (servdat-conns sinfo))
+	 (mconn    (rmt:get-conn sinfo apath ".db/main.db"))
+	 (dconn    (rmt:get-conn sinfo apath dbname)))
+    #;(if (and mconn
+	     (not (debug:print-logger)))
+	(begin
+	  (debug:print-info 0 *default-log-port* "Turning on logging to main, look in logs dir for main log.")
+	  (debug:print-logger rmt:log-to-main)))
+    (cond
+     ((and mconn
+	   dconn
+	   (< (current-seconds)(conndat-expires dconn)))
+      #t) ;; good to go
+     ((not mconn) ;; no channel open to main? open it...
+      (rmt:open-main-connection sinfo apath)
+      (rmt:general-open-connection sinfo apath dbname num-tries: (- num-tries 1)))
+     ((not dconn)                 ;; no channel open to dbname?     
+      (let* ((res (rmt:send-receive-real sinfo apath mdbname 'get-server `(,apath ,dbname))))
+	(case res
+	  ((server-started)
+	   (if (> num-tries 0)
+	       (begin
+		 (thread-sleep! 2)
+		 (rmt:general-open-connection sinfo apath dbname num-tries: (- num-tries 1)))
+	       (begin
+		 (debug:print-error 0 *default-log-port* "Failed to start servers needed or open channel to "apath", "dbname)
+		 (exit 1))))
+	  (else
+	   (if (list? res) ;; server has been registered and the info was returned. pass it on.
+	       (begin ;;  ("192.168.0.9" 53817
+		      ;;  "5e34239f48e8973b3813221e54701a01" "24310"
+		      ;;  "192.168.0.9"
+		      ;;  "/home/matt/data/megatest/tests/simplerun"
+		 ;;  ".db/1.db")
+		 (match
+		  res
+		  ((host port servkey pid ipaddr apath dbname)
+		   (debug:print-info 0 *default-log-port* "got "res)
+		   (hash-table-set! conns
+				    fullname
+				    (make-conndat
+				     apath: apath
+				     dbname: dbname
+				     hostport: (conc host":"port)
+				     ;; socket: (open-nn-connection (conc host":"port)) ;; TODO - open ulex connection?
+				     ipaddr: ipaddr
+				     port: port
+				     srvkey: servkey
+				     lastmsg: (current-seconds)
+				     expires: (+ (current-seconds)
+						 (server:expiration-timeout)
+						 -2))))
+		  (else
+		   (debug:print-info 0 *default-log-port* "return data from starting server did not match host port servkey pid ipaddr apath dbname " res)))
+		 res)
+	       (begin
+		 (debug:print-info 0 *default-log-port* "Unexpected result: " res)
+		 res)))))))
+    #t))
+
+;;======================================================================
+
+;; FOR DEBUGGING SET TO #t
+;; (define *localmode* #t)
+(define *localmode* #f)
+(define *dbstruct* (make-dbr:dbstruct))
+
+;; Defaults to current area
+;;
+(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f))
+  (let* ((apath      *toppath*)
+	 (sinfo      *db-serv-info*)
+	 (dbname     (db:run-id->dbname rid)))
+    (if *localmode*
+	(api:execute-requests *dbstruct* cmd params)
+	(begin
+	  (rmt:open-main-connection sinfo apath)
+	  (if rid (rmt:general-open-connection sinfo apath dbname))
+	  #;(if (not (member cmd '(log-to-main)))
+	      (debug:print-info 0 *default-log-port* "rmt:send-receive "cmd" params="params))
+	  (rmt:send-receive-real sinfo apath dbname cmd params)))))
+
+;; db is at apath/.db/dbname, rid is an intermediary solution and will be removed
+;; sometime in the future
+;;
+(define (rmt:send-receive-real sinfo apath dbname cmd params)
+  (assert (not (eq? 'primordial (thread-name (current-thread)))) "FATAL: Do not call rmt:send-receive-real in the primodial thread.")
+  (let* ((cdat (rmt:get-conn sinfo apath dbname)))
+    (assert cdat "FATAL: rmt:send-receive-real called without the needed channels opened")
+    (let* ((uconn    (servdat-uconn sinfo)) ;; get the interface to ulex
+	   ;; then send-receive using the ulex layer to host-port stored in cdat
+	   (res      (send-receive uconn (conndat-hostport cdat) cmd params))
+	   #;(th1      (make-thread (lambda ()
+				    (set! res (send-receive uconn (conndat-hostport cdat) cmd params)))
+				  "send-receive thread")))
+      ;; (thread-start! th1)
+      ;; (thread-join! th1)   ;; gratuitious thread stuff is so that mailbox is not used in primordial thead
+      ;; since we accessed the server we can bump the expires time up
+      (conndat-expires-set! cdat (+ (current-seconds)
+				    (server:expiration-timeout)
+				    -10)) ;; ten second margin for network time misalignments etc.
+      res)))
+
+;; db is at apath/.db/dbname, rid is an intermediary solution and will be removed
+;; sometime in the future.
+;;
+;; Purpose - call the main.db server and request a server be started
+;; for the given area path and dbname
+;;
+
+(define (rmt:print-db-stats)
+  (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f"
+    (debug:print 18 *default-log-port* "DB Stats, "(seconds->year-week/day-time (current-seconds))"\n=====================")
+    (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg"))
+    (for-each (lambda (cmd)
+		(let ((cmd-dat (hash-table-ref *db-stats* cmd)))
+		  (debug:print 18 *default-log-port* (format #f fmtstr cmd (vector-ref cmd-dat 0) (vector-ref cmd-dat 1) (/ (vector-ref cmd-dat 1)(vector-ref cmd-dat 0))))))
+	      (sort (hash-table-keys *db-stats*)
+		    (lambda (a b)
+		      (> (vector-ref (hash-table-ref *db-stats* a) 0)
+			 (vector-ref (hash-table-ref *db-stats* b) 0)))))))
+
+(define (rmt:get-max-query-average run-id)
+  (mutex-lock! *db-stats-mutex*)
+  (let* ((runkey (conc "run-id=" run-id " "))
+	 (cmds   (filter (lambda (x)
+			   (substring-index runkey x))
+			 (hash-table-keys *db-stats*)))
+	 (res    (if (null? cmds)
+		     (cons 'none 0)
+		     (let loop ((cmd (car cmds))
+				(tal (cdr cmds))
+				(max-cmd (car cmds))
+				(res 0))
+		       (let* ((cmd-dat (hash-table-ref *db-stats* cmd))
+			      (tot     (vector-ref cmd-dat 0))
+			      (curravg (/ (vector-ref cmd-dat 1) (vector-ref cmd-dat 0))) ;; count is never zero by construction
+			      (currmax (max res curravg))
+			      (newmax-cmd (if (> curravg res) cmd max-cmd)))
+			 (if (null? tal)
+			     (if (> tot 10)
+				 (cons newmax-cmd currmax)
+				 (cons 'none 0))
+			     (loop (car tal)(cdr tal) newmax-cmd currmax)))))))
+    (mutex-unlock! *db-stats-mutex*)
+    res))
+
+;; host and port are used to ensure we are remove proper records
+(define (rmt:server-shutdown host port)
+  (let ((dbfile   (servdat-dbfile *db-serv-info*)))
+    (debug:print-info 0 *default-log-port* "dbfile is "dbfile)
+    (if dbfile
+	(let* ((am-server  (args:get-arg "-server"))
+	       (dbfile     (args:get-arg "-db"))
+	       (apath      *toppath*)
+	       #;(sinfo     *remotedat*)) ;; foundation for future fix
+	  (if *dbstruct-db*
+	      (let* ((dbdat      (db:get-dbdat *dbstruct-db* apath dbfile))
+		     (db         (dbr:dbdat-db dbdat))
+		     (inmem      (dbr:dbdat-db dbdat))   ;; WRONG
+		     )
+		;; do a final sync here
+		(debug:print-info 0 *default-log-port* "Doing final sync for "apath" "dbfile" at "(current-seconds))
+		(db:sync-inmem->disk *dbstruct-db* apath dbfile force-sync: #t)
+		;; let's finalize here
+		(debug:print-info 0 *default-log-port* "Finalizing db and inmem")
+		(if (sqlite3:database? db)
+		    (sqlite3:finalize! db)
+		    (debug:print-info 0 *default-log-port* "in rmt:server-shutdown, db is not a database, not finalizing..."))
+		(if (sqlite3:database? inmem)
+		    (sqlite3:finalize! inmem)
+		    (debug:print-info 0 *default-log-port* "in rmt:server-shutdown, inmem is not a database, not finalizing..."))
+		(debug:print-info 0 *default-log-port* "Finalizing db and inmem complete"))
+	      (debug:print-info 0 *default-log-port* "Db was never opened, no cleanup to do."))
+	  (if (not am-server)
+	      (debug:print-info 0 *default-log-port* "I am not a server, should NOT get here!")
+	      (if (string-match ".*/main.db$" dbfile)
+		  (let ((pkt-file (conc (get-pkts-dir *toppath*)
+					"/" (servdat-uuid *db-serv-info*)
+					".pkt")))
+		    (debug:print-info 0 *default-log-port* "removing pkt "pkt-file)
+		    (delete-file* pkt-file)
+		    (debug:print-info 0 *default-log-port* "Releasing lock (if any) for "dbfile ", host "host", port "port)
+		    (db:with-lock-db
+		     (servdat-dbfile *db-serv-info*)
+		     (lambda (dbh dbfile)
+		       (db:release-lock dbh dbfile host port)))) ;; I'm not the server - should not have a lock to remove
+		  (let* ((sdat *db-serv-info*) ;; we have a run-id server
+			 (host (servdat-host sdat))
+			 (port (servdat-port sdat))
+			 (uuid (servdat-uuid sdat))
+			 (res  (rmt:deregister-server *db-serv-info* *toppath* host port uuid dbfile)))
+		    (debug:print-info 0 *default-log-port* "deregistered-server, res="res)
+		    (debug:print-info 0 *default-log-port* "deregistering server "host":"port" with uuid "uuid)
+		    )))))))
+
+
+(define (common:run-sync?)
+    ;; (and (common:on-homehost?)
+  (args:get-arg "-server"))
+
+(define *rmt:run-mutex* (make-mutex))
+(define *rmt:run-flag* #f)
+
+;; Main entry point to start a server. was start-server
+(define (rmt:run hostn)
+  (mutex-lock! *rmt:run-mutex*)
+  (if *rmt:run-flag*
+      (begin
+	(debug:print-warn 0 *default-log-port* "rmt:run already running.")
+	(mutex-unlock! *rmt:run-mutex*))
+      (begin
+	(set! *rmt:run-flag* #t)
+	(mutex-unlock! *rmt:run-mutex*)
+	;;  ;; Configurations for server
+	;;  (tcp-buffer-size 2048)
+	;;  (max-connections 2048) 
+	(debug:print 2 *default-log-port* "PID: "(current-process-id)". Attempting to start the server ...")
+	(if (and *db-serv-info*
+		 (servdat-uconn *db-serv-info*))
+	    (let* ((uconn (servdat-uconn *db-serv-info*)))
+	      (wait-and-close uconn))
+	    (let* ((port            (portlogger:open-run-close portlogger:find-port))
+		   (handler-proc    (lambda (rem-host-port qrykey cmd params) ;;
+				      (set! *db-last-access* (current-seconds))
+				      (assert (list? params) "FATAL: handler called with non-list params")
+				      (assert (args:get-arg "-server") "FATAL: handler called on non-server side. cmd="cmd", params="params)
+				      (debug:print 0 *default-log-port* "handler call: "cmd", params="params)
+				      (api:execute-requests *dbstruct-db* cmd params))))
+	      ;; (api:process-request *dbstuct-db* 
+	      (if (not *db-serv-info*)
+		  (set! *db-serv-info* (make-servdat host: hostn port: port)))
+	      (let* ((uconn (run-listener handler-proc port))
+		     (rport (udat-port uconn))) ;; the real port
+		(servdat-host-set! *db-serv-info* hostn)
+		(servdat-port-set! *db-serv-info* rport)
+		(servdat-uconn-set! *db-serv-info* uconn)
+		(wait-and-close uconn)
+		(db:print-current-query-stats)
+		)))
+	(let* ((host (servdat-host *db-serv-info*))
+	       (port (servdat-port *db-serv-info*))
+	       (mode (or (servdat-mode *db-serv-info*)
+			 "non-db")))
+	  ;; server exit stuff here
+	  ;; (rmt:server-shutdown host port) - always do in on-exit
+	  ;; (portlogger:open-run-close portlogger:set-port port "released") ;; moved to on-exit 
+	  (debug:print-info 0 *default-log-port* "Server "host":"port" mode "mode"shutdown complete. Exiting")
+	  ))))
+
+;;======================================================================
+;; S E R V E R   U T I L I T I E S 
+;;======================================================================
+
+
+;;======================================================================
+;; NEW SERVER METHOD
+;;======================================================================
+
+;; only use for main.db - need to re-write some of this :(
+;;
+(define (get-lock-db sdat dbfile host port)
+  (assert host "FATAL: get-lock-db called with host not set.")
+  (assert port "FATAL: get-lock-db called with port not set.")
+  (let* ((dbh (db:open-run-db dbfile db:initialize-db)) ;; open-run-db creates a standard db with schema used by all situations
+	 (res (db:get-iam-server-lock dbh dbfile host port))
+	 (uconn (servdat-uconn sdat)))
+    ;; res => list then already locked, check server is responsive
+    ;;     => #t then sucessfully got the lock
+    ;;     => #f reserved for future use as to indicate something went wrong
+    (match res
+      ((owner_pid owner_host owner_port event_time)
+       (if (server-ready? uconn (conc owner_host":"owner_port) "abc")
+	   #f      ;; locked by someone else
+	   (begin  ;; locked by someone dead and gone
+	     (debug:print 0 *default-log-port* "WARNING: stale lock - have to steal it. This may fail.")
+	     (db:steal-lock-db dbh dbfile port))))
+      (#t  #t) ;; placeholder so that we don't touch res if it is #t
+      (else (set! res #f)))
+    (sqlite3:finalize! dbh)
+    res))
+
+
+(define (register-server pkts-dir pkt-spec host port servkey ipaddr dbpath)
+  (let* ((pkt-dat `((host    . ,host)
+		    (port    . ,port)
+		    (servkey . ,servkey)
+		    (pid     . ,(current-process-id))
+		    (ipaddr  . ,ipaddr)
+		    (dbpath  . ,dbpath)))
+	 (uuid    (write-alist->pkt
+		   pkts-dir
+		   pkt-dat
+		   pktspec: pkt-spec
+		   ptype: 'server)))
+    (debug:print 0 *default-log-port* "Server on "host":"port" registered in pkt "uuid)
+    uuid))
+
+(define (get-pkts-dir #!optional (apath #f))
+  (let* ((effective-toppath (or *toppath* apath)))
+    (assert effective-toppath
+	    "ERROR: get-pkts-dir called without *toppath* set. Exiting.")
+    (let* ((pdir (conc effective-toppath "/.meta/srvpkts")))
+      (if (file-exists? pdir)
+	  pdir
+	  (begin
+	    (handle-exceptions ;; this exception handler should NOT be needed but ...
+		exn
+		pdir
+	      (create-directory pdir #t))
+	    pdir)))))
+
+;; given a pkts dir read 
+;;
+(define (get-all-server-pkts pktsdir-in pktspec)
+  (let* ((pktsdir  (if (file-exists? pktsdir-in)
+		       pktsdir-in
+		       (begin
+			 (create-directory pktsdir-in #t)
+			 pktsdir-in)))
+	 (all-pkt-files (glob (conc pktsdir "/*.pkt"))))
+    (map (lambda (pkt-file)
+	   (read-pkt->alist pkt-file pktspec: pktspec))
+	 all-pkt-files)))
+
+(define (server-address srv-pkt)
+  (conc (alist-ref 'host srv-pkt) ":"
+	(alist-ref 'port srv-pkt)))
+	
+(define (server-ready? uconn host-port key) ;; server-address is host:port
+  (let* ((params `((cmd . ping)(key . ,key)))
+	 (data `((cmd . ping)
+		 (key . ,key)
+		 (params . ,params))) ;; I don't get it.
+	 (res  (send-receive uconn host-port 'ping data)))
+    (if (eq? res 'ack) ;; yep, likely it is who we want on the other end
+	res
+	#f)))
+;; (begin (debug:print-info 0 *default-log-port* "server-ready? => "res) #f))))
+
+; from the pkts return servers associated with dbpath
+;; NOTE: Only one can be alive - have to check on each
+;;       in the list of pkts returned
+;;
+(define (get-viable-servers serv-pkts dbpath)
+  (let loop ((tail serv-pkts)
+	     (res  '()))
+    (if (null? tail)
+	res ;; NOTE: sort by age so oldest is considered first
+	(let* ((spkt (car tail)))
+	  (loop (cdr tail)
+		(if (equal? dbpath (alist-ref 'dbpath spkt))
+		    (cons spkt res)
+		    res))))))
+
+(define (remove-pkts-if-not-alive uconn serv-pkts)
+  (filter (lambda (pkt)
+	    (let* ((host (alist-ref 'host pkt))
+		   (port (alist-ref 'port pkt))
+		   (host-port (conc host":"port))
+		   (key  (alist-ref 'servkey  pkt))
+		   (pktz (alist-ref 'Z        pkt))
+		   (res  (server-ready? uconn host-port key)))
+	      (if res
+		  res
+		  (let* ((pktsdir (get-pkts-dir *toppath*))
+			 (pktpath (conc pktsdir"/"pktz".pkt")))
+		    (debug:print 0 *default-log-port* "WARNING: pkt with no server "pktpath)
+		    (delete-file* pktpath)
+		    #f))))
+	  serv-pkts))
+
+;; from viable servers get one that is alive and ready
+;;
+(define (get-the-server uconn apath serv-pkts)
+  (let loop ((tail serv-pkts))
+    (if (null? tail)
+	#f
+	(let* ((spkt  (car tail))
+	       (host  (alist-ref 'ipaddr spkt))
+	       (port  (alist-ref 'port spkt))
+	       (host-port (conc host":"port))
+	       (dbpth (alist-ref 'dbpath spkt))
+	       (srvkey (alist-ref 'Z spkt)) ;; (alist-ref 'srvkey spkt))
+	       (addr  (server-address spkt)))
+	  (if (server-ready? uconn host-port srvkey)
+	      spkt
+	      (loop (cdr tail)))))))
+
+;; am I the "first" in line server? I.e. my D card is smallest
+;; use Z card as tie breaker
+;;
+(define (get-best-candidate serv-pkts dbpath)
+  (if (null? serv-pkts)
+      #f
+      (let loop ((tail serv-pkts)
+		 (best  (car serv-pkts)))
+	(if (null? tail)
+	    best
+	    (let* ((candidate (car tail))
+		   (candidate-bd (string->number (alist-ref 'D candidate)))
+		   (best-bd      (string->number (alist-ref 'D best)))
+		   ;; bigger number is younger
+		   (candidate-z  (alist-ref 'Z candidate))
+		   (best-z       (alist-ref 'Z best))
+		   (new-best     (cond
+				  ((> best-bd candidate-bd) ;; best is younger than candidate
+				   candidate)
+				  ((< best-bd candidate-bd) ;; candidate is younger than best
+				   best)
+				  (else
+				   (if (string>=? best-z candidate-z)
+				       best
+				       candidate))))) ;; use Z card as tie breaker
+	      (if (null? tail)
+		  new-best
+		  (loop (cdr tail) new-best)))))))
+	  
+
+;;======================================================================
+;; END NEW SERVER METHOD
+;;======================================================================
+
+;; if .db/main.db check the pkts
+;; 
+(define (rmt:wait-for-server pkts-dir db-file server-key)
+  (let* ((sdat *db-serv-info*))
+    (let loop ((start-time (current-seconds))
+	       (changed    #t)
+	       (last-sdat  "not this"))
+      (begin ;; let ((sdat #f))
+	(thread-sleep! 0.01)
+	(debug:print-info 0 *default-log-port* "Waiting for server alive signature")
+	(mutex-lock! *heartbeat-mutex*)
+	(set! sdat *db-serv-info*)
+	(mutex-unlock! *heartbeat-mutex*)
+	(if (and sdat
+		 (not changed)
+		 (> (- (current-seconds) start-time) 2))
+	    (let* ((uconn (servdat-uconn sdat)))
+	      (servdat-status-set! sdat 'iface-stable)
+	      (debug:print-info 0 *default-log-port* "Received server alive signature, now attempting to lock in server")
+	      ;; create a server pkt in *toppath*/.meta/srvpkts
+	      
+	      ;; TODO:
+	      ;;   1. change sdat to stuct
+	      ;;   2. add uuid to struct
+	      ;;   3. update uuid in sdat here
+	      ;;
+	      (servdat-uuid-set! sdat
+				 (register-server
+				  pkts-dir *srvpktspec*
+				  (get-host-name)
+				  (servdat-port sdat) server-key
+				  (servdat-host sdat) db-file))
+	      ;; (set! *my-signature* (servdat-uuid sdat)) ;; replace with Z, no, stick with proper key
+	      ;; now read pkts and see if we are a contender
+	      (let* ((all-pkts     (get-all-server-pkts pkts-dir *srvpktspec*))
+		     (viables      (get-viable-servers all-pkts db-file))
+		     (alive        (remove-pkts-if-not-alive uconn viables))
+		     (best-srv     (get-best-candidate alive db-file))
+		     (best-srv-key (if best-srv (alist-ref 'servkey best-srv) #f))
+		     (i-am-srv     (equal? best-srv-key server-key))
+		     (delete-pkt   (lambda ()
+				     (let* ((pktfile (conc (get-pkts-dir *toppath*)
+							 "/" (servdat-uuid *db-serv-info*)
+							 ".pkt")))
+				       (debug:print-info 0 *default-log-port* "Attempting to remove bogus pkt file "pktfile)
+				       (delete-file* pktfile))))) ;; remove immediately instead of waiting for on-exit
+		(debug:print 0 *default-log-port* "best-srv-key: "best-srv-key", server-key: "server-key", i-am-srv: "i-am-srv)
+		;; am I the best-srv, compare server-keys to know
+		(if i-am-srv
+		    (if (get-lock-db sdat db-file (servdat-host sdat)(servdat-port sdat)) ;; (db:get-iam-server-lock *dbstruct-db* *toppath* run-id)
+			(begin
+			  (debug:print-info 0 *default-log-port* "I'm the server!")
+			  (servdat-dbfile-set! sdat db-file)
+			  (servdat-status-set! sdat 'db-locked))
+			(begin
+			  (debug:print-info 0 *default-log-port* "I'm not the server, exiting.")
+			  (bdat-time-to-exit-set! *bdat* #t)
+			  (delete-pkt)
+			  (thread-sleep! 0.2)
+			  (exit)))
+		    (begin
+		      (debug:print-info 0 *default-log-port*
+				   "Keys do not match "best-srv-key", "server-key", exiting.")
+		      (bdat-time-to-exit-set! *bdat* #t)
+		      (delete-pkt)
+		      (thread-sleep! 0.2)
+		      (exit)))
+		sdat))
+	    (begin ;; sdat not yet contains server info
+	      (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat)
+	      (sleep 4)
+	      (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
+		  (begin
+		    (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server")
+		    (exit))
+		  (loop start-time
+			(equal? sdat last-sdat)
+			sdat))))))))
+
+(define (rmt:register-server sinfo apath iface port server-key dbname)
+  (servdat-conns sinfo) ;; just checking types
+  (rmt:open-main-connection sinfo apath) ;; we need a channel to main.db
+  (rmt:send-receive-real sinfo apath      ;; params: host port servkey pid ipaddr dbpath
+			 (db:run-id->dbname #f)
+			 'register-server `(,iface
+					    ,port
+					    ,server-key
+					    ,(current-process-id)
+					    ,iface
+					    ,apath
+					    ,dbname)))
+
+(define (rmt:get-count-servers sinfo apath)
+  (servdat-conns sinfo) ;; just checking types
+  (rmt:open-main-connection sinfo apath) ;; we need a channel to main.db
+  (rmt:send-receive-real sinfo apath      ;; params: host port servkey pid ipaddr dbpath
+			 (db:run-id->dbname #f)
+			 'get-count-servers `(,apath)))
+
+(define (rmt:get-servers-info apath)
+  (rmt:send-receive 'get-servers-info #f `(,apath)))
+
+(define (rmt:deregister-server db-serv-info apath iface port server-key dbname)
+  (rmt:open-main-connection db-serv-info apath) ;; we need a channel to main.db
+  (rmt:send-receive-real db-serv-info apath      ;; params: host port servkey pid ipaddr dbpath
+                         (db:run-id->dbname #f)
+                         'deregister-server `(,iface
+                                              ,port
+                                              ,server-key
+                                              ,(current-process-id)
+                                              ,iface
+                                              ,apath
+                                              ,dbname)))
+
+(define (rmt:wait-for-stable-interface #!optional (num-tries-allowed 100))
+  ;; wait until *db-serv-info* stops changing
+  (let* ((stime (current-seconds)))
+    (let loop ((last-host  #f)
+	       (last-port  #f)
+	       (tries 0))
+      (let* ((curr-host (and *db-serv-info* (servdat-host *db-serv-info*)))
+	     (curr-port (and *db-serv-info* (servdat-port *db-serv-info*))))
+	;; first we verify port and interface, update *db-serv-info* in need be.
+	(cond
+	 ((> tries num-tries-allowed)
+	  (debug:print 0 *default-log-port* "rmt:keep-running, giving up after trying for several minutes.")
+	  (exit 1))
+	 ((not *db-serv-info*)
+	  (thread-sleep! 0.25)
+	  (loop curr-host curr-port (+ tries 1)))
+	 ((or (not last-host)(not last-port))
+	  (debug:print 0 *default-log-port* "rmt:keep-running, still no interface, tries="tries)
+	  (thread-sleep! 0.25)
+	  (loop curr-host curr-port (+ tries 1)))
+	 ((or (not (equal? last-host curr-host))
+	      (not (equal? last-port curr-port)))
+	  (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info")
+	  (thread-sleep! 0.25)
+	  (loop curr-host curr-port (+ tries 1)))
+	 ((< (- (current-seconds) stime) 1) ;; keep up the looping until at least 3 seconds have passed
+	  (thread-sleep! 0.5)
+	  (loop curr-host curr-port (+ tries 1)))
+	 (else
+	  (rmt:get-signature) ;; sets *my-signature* as side effect
+	  (servdat-status-set! *db-serv-info* 'interface-stable)
+	  (debug:print 0 *default-log-port*
+		       "SERVER STARTED: " curr-host
+		       ":" curr-port
+		       " AT " (current-seconds) " server signature: " *my-signature*
+		       " with "(servdat-trynum *db-serv-info*)" port changes")
+	  (flush-output *default-log-port*)
+	  #t))))))
+
+;; run rmt:keep-running in a parallel thread to monitor that the db is being 
+;; used and to shutdown after sometime if it is not.
+;;
+(define (rmt:keep-running dbname) 
+  ;; if none running or if > 20 seconds since 
+  ;; server last used then start shutdown
+  ;; This thread waits for the server to come alive
+  (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server")
+
+  (let* ((sinfo             *db-serv-info*)
+	 (server-start-time (current-seconds))
+	 (pkts-dir          (get-pkts-dir))
+	 (server-key        (rmt:get-signature)) ;; This servers key
+	 (is-main           (equal? (args:get-arg "-db") ".db/main.db"))
+	 (last-access       0)
+	 (server-timeout    (server:expiration-timeout))
+	 (shutdown-server-sequence (lambda (host port)
+				     (set! *unclean-shutdown* #f) ;; Should not be needed anymore
+				     (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id))
+				     ;; (rmt:server-shutdown host port) -- called in on-exit
+				     ;; (portlogger:open-run-close portlogger:set-port port "released") called in on-exit
+				     (exit)))
+	 (timed-out?        (lambda ()
+			      (<= (+ last-access server-timeout)
+				  (current-seconds)))))
+    (servdat-dbfile-set! *db-serv-info* (args:get-arg "-db"))
+    ;; main and run db servers have both got wait logic (could/should merge it)
+    (if is-main
+	(rmt:wait-for-server pkts-dir dbname server-key)
+	(rmt:wait-for-stable-interface))
+    ;; this is our forever loop
+    (let* ((iface (servdat-host *db-serv-info*))
+	   (port  (servdat-port *db-serv-info*))
+	   (uconn (servdat-uconn *db-serv-info*)))
+      (let loop ((count          0)
+		 (bad-sync-count 0)
+		 (start-time     (current-milliseconds)))
+	(if (and (not is-main)
+		 (common:low-noise-print 60 "servdat-status"))
+	    (debug:print-info 0 *default-log-port* "servdat-status is " (servdat-status *db-serv-info*)))
+
+	(mutex-lock! *heartbeat-mutex*)
+	;; set up the database handle
+	(if (not *dbstruct-db*) ;; no db opened yet, open the db and register with main if appropriate
+	      (let ((watchdog (bdat-watchdog *bdat*)))
+		(debug:print 0 *default-log-port* "SERVER: dbprep")
+		(db:setup dbname) ;; sets *dbstruct-db* as side effect
+		(servdat-status-set! *db-serv-info* 'db-opened)
+		;; IFF I'm not main, call into main and register self
+		(if (not is-main)
+		    (let ((res (rmt:register-server sinfo
+						    *toppath* iface port
+						    server-key dbname)))
+		      (if res ;; we are the server
+			  (servdat-status-set! *db-serv-info* 'have-interface-and-db)
+			  ;; now check that the db locker is alive, clear it out if not
+			  (let* ((serv-info (rmt:server-info *toppath* dbname)))
+			    (match serv-info
+				   ((host port servkey pid ipaddr apath dbpath)
+				    (if (not (server-ready? uconn (conc host":"port) servkey))
+					(begin
+					  (debug:print-info 0 *default-log-port* "Server registered but not alive. Removing and trying again.")
+					  (rmt:deregister-server sinfo apath host port servkey dbpath) ;; servkey pid ipaddr apath dbpath)
+					  (loop (+ count 1) bad-sync-count start-time))))
+				   (else
+				    (debug:print 0 *default-log-port* "We are not the server for "dbname", exiting. Server info is: "serv-info)
+				    (exit)))))))
+		(debug:print 0 *default-log-port*
+			     "SERVER: running, db "dbname" opened, megatest version: "
+			   (common:get-full-version))
+	      ;; start the watchdog
+
+	      ;; is this really needed?
+	      
+	      #;(if watchdog
+		  (if (not (member (thread-state watchdog)
+				   '(ready running blocked
+					   sleeping dead)))
+		      (begin
+			(debug:print-info 0 *default-log-port* "Starting watchdog thread (in state "(thread-state watchdog)")")
+			(thread-start! watchdog))
+		      (debug:print-info 0 *default-log-port* "Not starting watchdog thread (in state "(thread-state watchdog)")"))
+		  (debug:print 0 *default-log-port* "ERROR: *watchdog* not setup, cannot start it."))
+	      #;(loop (+ count 1) bad-sync-count start-time)
+	      ))
+	
+	(db:sync-inmem->disk *dbstruct-db* *toppath* dbname force-sync: #t)
+	
+	(mutex-unlock! *heartbeat-mutex*)
+	
+	;; when things go wrong we don't want to be doing the various
+	;; queries too often so we strive to run this stuff only every
+	;; four seconds or so.
+	(let* ((sync-time (- (current-milliseconds) start-time))
+	       (rem-time  (quotient (- 4000 sync-time) 1000)))
+	  (if (and (<= rem-time 4)
+		   (>  rem-time 0))
+	      (thread-sleep! rem-time)))
+    
+	;; Transfer *db-last-access* to last-access to use in checking that we are still alive
+	(set! last-access *db-last-access*)
+	
+	(if (< count 1) ;; 3x3 = 9 secs aprox
+	    (loop (+ count 1) bad-sync-count (current-milliseconds)))
+	
+	(if (common:low-noise-print 60 "dbstats")
+	    (begin
+	      (debug:print 0 *default-log-port* "Server stats:")
+	      (db:print-current-query-stats)))
+	(let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600)))
+	  (cond
+	   ((not *server-run*)
+	    (debug:print-info 0 *default-log-port* "*server-run* set to #f. Shutting down.")
+	    (shutdown-server-sequence (get-host-name) port))
+	   ((timed-out?)
+	    (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access))
+	    (shutdown-server-sequence (get-host-name) port))
+	   ((and *server-run*
+		 (or (not (timed-out?))
+		     (if is-main ;; do not exit if there are other servers (keep main open until all others gone)
+			 (> (rmt:get-count-servers sinfo *toppath*) 1)
+			 #f)))
+	    (if (common:low-noise-print 120 "server continuing")
+		(debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)))
+	    (loop 0 bad-sync-count (current-milliseconds)))
+	   (else
+	    (set! *unclean-shutdown* #f)
+	    (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access))
+	    (shutdown-server-sequence (get-host-name) port)
+	    #;(debug:print-info 0 *default-log-port* "Sending 'quit to server, received: "
+			      (open-send-receive-nn (conc iface":"port)      ;; do this here and not in server-shutdown
+						    (sexpr->string 'quit))))))))))
+
+(define (rmt:get-reasonable-hostname)
+  (let* ((inhost (or (args:get-arg "-server") "-")))
+    (if (equal? inhost "-")
+	(get-host-name)
+	inhost)))
+
+;; Call this to start the actual server
+;;
+;; all routes though here end in exit ...
+;;
+;; This is the point at which servers are started
+;;
+(define (rmt:server-launch dbname)
+  (debug:print-info 0 *default-log-port* "Entered rmt:server-launch")
+  (let* ((th2 (make-thread (lambda ()
+			     (debug:print-info 0 *default-log-port* "Server run thread started")
+			     (rmt:run (rmt:get-reasonable-hostname)))
+			   "Server run"))
+	 (th3 (make-thread (lambda ()
+			     (debug:print-info 0 *default-log-port* "Server monitor thread started")
+			     (if (args:get-arg "-server")
+				 (rmt:keep-running dbname)))
+			     "Keep running")))
+    (thread-start! th2)
+    (thread-sleep! 0.252) ;; give the server time to settle before starting the keep-running monitor.
+    (thread-start! th3)
+    (set! *didsomething* #t)
+    (thread-join! th2)
+    (thread-join! th3))
+  #f)
+	    
+;;======================================================================
+;;  S E R V E R   -  D I R E C T   C A L L S
+;;======================================================================
+
+(define (rmt:kill-server run-id)
+  (rmt:send-receive 'kill-server #f (list run-id)))
+
+(define (rmt:start-server run-id)
+  (rmt:send-receive 'start-server #f (list run-id)))
+
+(define (rmt:server-info apath dbname)
+  (rmt:send-receive 'get-server-info #f (list apath dbname)))
+
+;;======================================================================
+;; Nanomsg transport
+;;======================================================================
+
+#;(define (is-port-in-use port-num)
+  (let* ((ret #f))
+    (let-values (((inp oup pid)
+		  (process "netstat" (list  "-tulpn" ))))
+      (let loop ((inl (read-line inp)))
+        (if (not (eof-object? inl))
+            (begin 
+	      (if (string-search (regexp (conc ":" port-num)) inl)
+		  (begin
+					;(print "Output: "  inl)
+		    (set! ret  #t))
+		  (loop (read-line inp)))))))
+    ret))
+
+#;(define (open-nn-connection host-port)
+  (let ((req  (make-req-socket))
+        (uri  (conc "tcp://" host-port)))
+    (nng-dial req uri)
+    (socket-set! req 'nng/recvtimeo 2000)
+    req))
+
+#;(define (send-receive-nn req msg)
+  (nng-send req msg)
+  (nng-recv req))
+
+#;(define (close-nn-connection req)
+  (nng-close! req))
+  
+;; ;; open connection to server, send message, close connection
+;; ;;
+;; (define (open-send-close-nn host-port msg #!key (timeout 3) ) ;; default timeout is 3 seconds
+;;   (let ((req  (make-req-socket 'req))
+;;         (uri  (conc "tcp://" host-port))
+;;         (res  #f)
+;;         ;; (contacts (alist-ref 'contact attrib))
+;;         ;; (mode (alist-ref 'mode attrib))
+;; 	)
+;;     (socket-set! req 'nng/recvtimeo 2000)
+;;     (handle-exceptions
+;;      exn
+;;      (let ((emsg ((condition-property-accessor 'exn 'message) exn)))
+;;        ;; Send notification       
+;;        (debug:print 0 *default-log-port* "ERROR: Failed to connect / send to " uri " message was \"" emsg "\"" )
+;;        #f)
+;;      (nng-dial req uri)
+;;      ;; (print "Connected to the server " )
+;;      (nng-send req msg)
+;;      ;; (print "Request Sent")  
+;;      (let* ((th1  (make-thread (lambda ()
+;;                                  (let ((resp (nng-recv req)))
+;;                                    (nng-close! req)
+;;                                    (set! res (if (equal? resp "ok")
+;;                                                  #t
+;;                                                  #f))))
+;;                                "recv thread"))
+;;             (th2 (make-thread (lambda ()
+;;                                 (thread-sleep! timeout)
+;;                                 (thread-terminate! th1))
+;; 			      "timer thread")))
+;;        (thread-start! th1)
+;;        (thread-start! th2)
+;;        (thread-join! th1)
+;;        res))))
+;; 
+#;(define (open-send-receive-nn host-port msg #!key (timeout 3) ) ;; default timeout is 3 seconds
+  (let ((req  (make-req-socket))
+        (uri  (conc "tcp://" host-port))
+        (res  #f)) 
+    (handle-exceptions
+     exn
+     (let ((emsg ((condition-property-accessor 'exn 'message) exn)))
+       ;; Send notification      
+       (debug:print 0 *default-log-port* "ERROR: Failed to connect / send to " uri " message was \"" emsg "\", exn=" exn)
+       #f)
+     (nng-dial req uri)
+     (nng-send req msg)
+     (let* ((th1  (make-thread (lambda ()
+                                 (let ((resp (nng-recv req)))
+                                   (nng-close! req)
+                                   ;; (print resp)
+                                   (set! res resp)))
+                               "recv thread"))
+            (th2 (make-thread (lambda ()
+                                (thread-sleep! timeout)
+                                (thread-terminate! th1))
+                             "timer thread")))
+       (thread-start! th1)
+       (thread-start! th2)
+       (thread-join! th1)
+       res))))
+
+;;======================================================================
+;; S E R V E R   U T I L I T I E S 
+;;======================================================================
+
+;; run ping in separate process, safest way in some cases
+;;
+#;(define (server:ping-server ifaceport)
+  (with-input-from-pipe 
+   (conc (common:get-megatest-exe) " -ping " ifaceport)
+   (lambda ()
+     (let loop ((inl (read-line))
+		(res "NOREPLY"))
+       (if (eof-object? inl)
+	   (case (string->symbol res)
+	     ((NOREPLY)  #f)
+	     ((LOGIN_OK) #t)
+	     (else       #f))
+	   (loop (read-line) inl))))))
+
+;; NOT USED (well, ok, reference in rpc-transport but otherwise not used).
+;;
+#;(define (server:login toppath)
+  (lambda (toppath)
+    (set! *db-last-access* (current-seconds)) ;; might not be needed.
+    (if (equal? *toppath* toppath)
+	#t
+	#f)))
+
+;; (define server:sync-lock-token "SERVER_SYNC_LOCK")
+;; (define (server:release-sync-lock)
+;;   (db:no-sync-del! *no-sync-db* server:sync-lock-token))
+;; (define (server:have-sync-lock?)
+;;   (let* ((have-lock-pair (db:no-sync-get-lock *no-sync-db* server:sync-lock-token))
+;;          (have-lock?     (car have-lock-pair))
+;;          (lock-time      (cdr have-lock-pair))
+;;          (lock-age       (- (current-seconds) lock-time)))
+;;     (cond
+;;      (have-lock? #t)
+;;      ((>lock-age
+;;        (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180)))
+;;       (server:release-sync-lock)
+;;       (server:have-sync-lock?))
+;;      (else #f))))
+
+)

Index: ulex/ulex.scm
==================================================================
--- ulex/ulex.scm
+++ ulex/ulex.scm
@@ -1,8 +1,8 @@
 ;; ulex: Distributed sqlite3 db
 ;;;
-;; Copyright (C) 2018 Matt Welland
+;; Copyright (C) 2018-2021 Matt Welland
 ;; Redistribution and use in source and binary forms, with or without
 ;; modification, is permitted.
 ;;
 ;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 ;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
@@ -23,330 +23,521 @@
 ;; NOTES:
 ;;   Why sql-de-lite and not say, dbi?  - performance mostly, then simplicity.
 ;;
 ;;======================================================================
 
-(use mailbox)
-
-(module ulex
- *
-
-(import scheme posix chicken data-structures ports extras files mailbox)
-(import srfi-18 pkts matchable regex
-	typed-records srfi-69 srfi-1
-	srfi-4 regex-case
-	(prefix sqlite3 sqlite3:)
-	foreign
-	tcp6
-	;; ulex-netutil
-	hostinfo
-	)
-
-;; make it a global? Well, it is local to area module
-
-(define *captain-pktspec*
-  `((captain (host     . h)
-	     (port     . p)
-	     (pid      . i)
-	     (ipaddr   . a)
-	     )
-    #;(data   (hostname . h)  ;; sender hostname
-	    (port     . p)  ;; sender port
-	    (ipaddr   . a)  ;; sender ip
-	    (hostkey  . k)  ;; sending host key - store info at server under this key
-	    (servkey  . s)  ;; server key - this needs to match at server end or reject the msg
-	    (format   . f)  ;; sb=serialized-base64, t=text, sx=sexpr, j=json
-	    (data     . d)  ;; base64 encoded slln data
-	    )))
-
-;; struct for keeping track of our world
-
-(defstruct udat
-  ;; captain info
-  (captain-address #f)
-  (captain-host    #f)
-  (captain-port    #f)
-  (captain-pid     #f)
-  (captain-lease   0)    ;; time (unix epoc) seconds when the lease is up
-  (ulex-dir        (conc (get-environment-variable "HOME") "/.ulex"))
-  (cpkts-dir       (conc (get-environment-variable "HOME") "/.ulex/pkts"))
-  (cpkt-spec       *captain-pktspec*)
-  ;; this processes info
-  (my-cpkt-key     #f)   ;; put Z card here when I create a pkt for myself as captain
-  (my-address      #f)
-  (my-hostname     #f)
-  (my-port         #f)
-  (my-pid          (current-process-id))
-  (my-dbs          '())
-  ;; server and handler thread
-  (serv-listener   #f)                 ;; this processes server info
-  (handler-thread  #f)
-  (mboxes          (make-hash-table))  ;; key => mbox
-  ;; other servers
-  (peers           (make-hash-table))  ;; host-port => peer record
-  (dbowners        (make-hash-table))  ;; dbfile => host-port
-  (handlers        (make-hash-table))  ;; dbfile => proc
-  ;; (outgoing-conns  (make-hash-table))  ;; host:port -> conn
-  (work-queue      (make-queue))       ;; most stuff goes here
-  ;; (fast-queue      (make-queue))       ;; super quick stuff goes here (e.g. ping)
-  (busy            #f)                 ;; is either of the queues busy, use to switch between queuing tasks or doing immediately
-  ;; app info
-  (appname         #f)
-  (dbtypes         (make-hash-table))  ;; this should be an alist but hash is easier. dbtype => [ initproc syncproc ]
-  ;; cookies
-  (cnum            0) ;; cookie num
-  )
-
-;;======================================================================
-;; NEW APPROACH
-;;======================================================================
-
-;;  start-server-find-port  ;; gotta have a server port ready from the very begining
-
-;; udata    - all the connection info, captain, server, ulex db etc. MUST BE PASSED IN
-;; dbpath   - full path and filename of the db to talk to or a symbol naming the db?
-;; callname - the remote call to execute
-;; params   - parameters to pass to the remote call
-;;
-(define (remote-call udata dbpath dbtype callname . params)
-  (start-server-find-port udata) ;; ensure we have a local server
-  (find-or-setup-captain udata)
-  ;; look at connect, process-request, send, send-receive
-  (let-values (((cookie-key host-port)(get-db-owner udata dbpath dbtype)))
-    (send-receive udata host-port callname cookie-key params)))
-
-;;======================================================================
-;; KEY FUNCTIONS - THESE ARE TOO BE EXPOSED AND USED
-;;======================================================================
-
-;; connection setup and management functions
-
-;; This is the basic setup command. Must always be
-;; called before connecting to a db using connect.
-;;
-;; find or become the captain
-;; setup and return a ulex object
-;;
-(define (find-or-setup-captain udata)
-  ;; see if we already have a captain and if the lease is ok
-  (if (and (udat-captain-address udata)
-	   (udat-captain-port    udata)
-	   (< (current-seconds) (udat-captain-lease udata)))
-      udata
-      (let* ((cpkts (get-all-captain-pkts udata)) ;; read captain pkts
-	     (captn (get-winning-pkt cpkts)))
-	(if captn
-	    (let* ((port   (alist-ref 'port   captn))
-		   (host   (alist-ref 'host   captn))
-		   (ipaddr (alist-ref 'ipaddr captn))
-		   (pid    (alist-ref 'pid    captn))
-		   (Z      (alist-ref 'Z      captn)))
-	      (udat-captain-address-set! udata ipaddr)
-	      (udat-captain-host-set!    udata host)
-	      (udat-captain-port-set!    udata port)
-	      (udat-captain-pid-set!     udata pid)
-	      (udat-captain-lease-set!   udata (+ (current-seconds) 10))
-	      (let-values (((success pingtime)(ping udata (conc ipaddr ":" port))))
-		(if success
-		    udata
-		    (begin
-		      (print "Found unreachable captain at " ipaddr ":" port ", removing pkt")
-		      (remove-captain-pkt udata captn)
-		      (find-or-setup-captain udata))))
-	      (begin
-		(setup-as-captain udata)  ;; this saves the thread to captain-thread and starts the thread
-		(find-or-setup-captain udata)))))))
-
-;; connect to a specific dbfile
-;;   - if already connected - return the dbowner host-port
-;;   - ask the captain who to talk to for this db
-;;   - put the entry in the dbowners hash as dbfile => host-port
-;;
-(define (connect udata dbfname dbtype)
-  (or (hash-table-ref/default (udat-dbowners udata) dbfname #f)
-      (let-values (((success dbowner-host-port)(get-db-owner udata dbfname dbtype)))
-	(if success
-	    (begin
-	      ;; just clobber the record, this is the new data no matter what
-	      (hash-table-set! (udat-dbowners udata) dbfname dbowner-host-port)
-	      dbowner-host-port)
-	    #f))))
-
-;; returns: success pingtime
-;;
-;; NOTE: causes the callee to store the info on this host along with the dbs this host currently owns
-;;
-(define (ping udata host-port)
-  (let* ((start  (current-milliseconds))
-	 (cookie (make-cookie udata))
-	 (dbs    (udat-my-dbs udata))
-	 (msg    (string-intersperse dbs " "))
-	 (res (send udata host-port 'ping cookie msg retval: #t))
-	 (delta (- (current-milliseconds) start)))
-    (values (equal? res cookie) delta)))
-
-;; returns: success pingtime
-;;
-;; NOTE: causes all references to this worker to be wiped out in the
-;; callee (ususally the captain)
-;;
-(define (goodbye-ping udata host-port)
-  (let* ((start  (current-milliseconds))
-	 (cookie (make-cookie udata))
-	 (dbs    (udat-my-dbs udata))
-	 (res (send udata host-port 'goodbye cookie "nomsg" retval: #t))
-	 (delta (- (current-milliseconds) start)))
-    (values (equal? res cookie) delta)))
-
-(define (goodbye-captain udata)
-  (let* ((host-port (udat-captain-host-port udata)))
-    (if host-port
-	(goodbye-ping udata host-port)
-	(values #f -1))))
-
-(define (get-db-owner udata dbname dbtype)
-  (let* ((host-port (udat-captain-host-port udata)))
-    (if host-port
-	(let* ((cookie (make-cookie udata))
-	       (msg    #f) ;; (conc dbname " " dbtype))
-	       (params `(,dbname ,dbtype))
-	       (res    (send udata host-port 'db-owner cookie msg
-			     params: params retval: #t)))
-	  (match (string-split res)
-	    ((retcookie owner-host-port)
-	     (values (equal? retcookie cookie) owner-host-port))))
-	(values #f -1))))
-
-;; called in ulex-handler to dispatch work, called on the workers side
-;;     calls (proc params data)
-;;     returns result with cookie
-;;
-;; pdat is the info of the caller, used to send the result data
-;; prockey is key into udat-handlers hash dereferencing a proc
-;; procparam is a first param handed to proc - often to do further derefrencing
-;; NOTE: params is intended to be a list of strings, encoding on data
-;;       is up to the user but data must be a single line
-;;
-(define (process-request udata pdat dbname cookie prockey procparam data)
-  (let* ((dbrec (ulex-open-db udata dbname))     ;; this will be a dbconn record, looks for in udata first
-	 (proc  (hash-table-ref udata prockey)))
-    (let* ((result (proc dbrec procparam data)))
-      result)))
-
-;; remote-request - send to remote to process in process-request
-;; uconn comes from a call to connect and can be used instead of calling connect again
-;; uconn is the host-port to call
-;; we send dbname to the worker so they know which file to open
-;; data must be a string with no newlines, it will be handed to the proc
-;; at the remote site unchanged. It is up to the user to encode/decode it's contents
-;;
-;;   rtype: immediate, read-only, normal, low-priority
-;; 
-(define (remote-request udata uconn rtype dbname prockey procparam data)
-  (let* ((cookie    (make-cookie udata)))
-    (send-receive udata uconn rtype cookie data `(,prockey procparam))))
-
-(define (ulex-open-db udata dbname)
-  #f)
-
-
-;;======================================================================
-;; Ulex db
-;;
-;;   - track who is captain, lease expire time
-;;   - track who owns what db, lease
-;;
-;;======================================================================
-
-;;
-;;
-(define (ulex-dbfname)
-  (let ((dbdir (conc (get-environment-variable "HOME") "/.ulex")))
-    (if (not (file-exists? dbdir))
-	(create-directory dbdir #t))
-    (conc dbdir "/network.db")))
-	 
-;; always goes in ~/.ulex/network.db
-;; role is captain, adjutant, node
-;;
-(define (ulexdb-setup)
-  (let* ((dbfname (ulex-dbfname))
-	 (have-db (file-exists? dbfname))
-	 (db      (sqlite3:open-database dbfname)))
-    (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000))
-    (sqlite3:execute db "PRAGMA synchronous = 0;")
-    (if (not have-db)
-	(sqlite3:with-transaction
-	 db
-	 (lambda ()
-	   (for-each
-	    (lambda (stmt)
-	      (if stmt (sqlite3:execute db stmt)))
-	    `("CREATE TABLE IF NOT EXISTS nodes
-                 (id INTEGER PRIMARY KEY,
-                  role  TEXT NOT NULL,
-                  host  TEXT NOT NULL,
-                  port TEXT NOT NULL,
-                  ipadr TEXT NOT NULL,
-                  pid   INTEGER NOT NULL,
-                  zcard TEXT NOT NULL,
-                  regtime INTEGER DEFAULT (strftime('%s','now')),
-                  lease_thru INTEGER DEFAULT (strftime('%s','now')),
-                  last_update INTEGER DEFAULT (strftime('%s','now')));"
-	      "CREATE TRIGGER  IF NOT EXISTS update_nodes_trigger AFTER UPDATE ON nodes
-                             FOR EACH ROW
-                               BEGIN 
-                                 UPDATE nodes SET last_update=(strftime('%s','now'))
-                                   WHERE id=old.id;
-                               END;"
-	      "CREATE TABLE IF NOT EXISTS dbs
-                 (id INTEGER PRIMARY KEY,
-                  dbname TEXT NOT NULL,
-                  dbfile TEXT NOT NULL,
-                  dbtype TEXT NOT NULL,
-                  host_port TEXT NOT NULL,
-                  regtime INTEGER DEFAULT (strftime('%s','now')),
-                  lease_thru INTEGER DEFAULT (strftime('%s','now')),
-                  last_update INTEGER DEFAULT (strftime('%s','now')));"
-	      "CREATE TRIGGER  IF NOT EXISTS update_dbs_trigger AFTER UPDATE ON dbs
-                             FOR EACH ROW
-                               BEGIN 
-                                 UPDATE dbs SET last_update=(strftime('%s','now'))
-                                   WHERE id=old.id;
-                               END;")))))
-    db))
-
-(define (get-host-port-lease db dbfname)
-  (sqlite3:fold-row
-   (lambda (rem host-port lease-thru)
-     (list host-port lease-thru))
-   #f db "SELECT host_port,lease_thru FROM dbs WHERE dbfile = ?" dbfname))
-  
-(define (register-captain db host ipadr port pid zcard #!key (lease 20))
-  (let* ((dbfname (ulex-dbfname))
-	 (host-port  (conc host ":" port)))
-    (sqlite3:with-transaction
-     db
-     (lambda ()
-       (match (get-host-port-lease db dbfname)
-	 ((host-port lease-thru)
-	  (if (> (current-seconds) lease-thru)
-	      (begin
-		(sqlite3:execute db "UPDATE dbs SET host_port=?,lease_thru=? WHERE dbname=?"
-				 (conc host ":" port)
-				 (+ (current-seconds) lease)
-				 dbfname)
-		#t)
-	      #f))
-	 (#f  (sqlite3:execute db "INSERT INTO dbs (dbname,dbfile,dbtype,host_port,lease_thru) VALUES (?,?,?,?,?)"
-			       "captain" dbfname "captain" host-port (+ (current-seconds) lease)))
-	 (else (print "ERROR: Unrecognised result from fold-row")
-	       (exit 1)))))))
-							    
+(module ulex
+	*
+	#;(
+     
+     ;; NOTE: looking for the handler proc - find the run-listener :)
+     
+     run-listener     ;; (run-listener handler-proc [port]) => uconn
+
+     ;; NOTE: handler-proc params;
+     ;;       (handler-proc rem-host-port qrykey cmd params)
+
+     send-receive     ;; (send-receive uconn host-port cmd data)
+
+     ;; NOTE: cmd can be any plain text symbol except for these;
+     ;;         'ping 'ack 'goodbye 'response
+     
+     set-work-handler ;; (set-work-handler proc)
+
+     wait-and-close   ;; (wait-and-close uconn)
+
+     ulex-listener?
+     
+     ;; needed to get the interface:port that was automatically found
+     udat-port
+     udat-host-port
+     
+     ;; for testing only
+     ;; pp-uconn
+     
+     ;; parameters
+     work-method   ;; parameter; 'threads, 'mailbox, 'limited, 'direct
+     return-method ;; parameter; 'mailbox, 'polling, 'direct
+     )
+
+(import scheme
+	chicken.base
+	chicken.file
+	chicken.io
+	chicken.time
+	chicken.condition
+	chicken.string
+	chicken.sort
+	chicken.pretty-print
+	
+	address-info
+	mailbox
+	matchable
+	;; queues
+	regex
+	regex-case
+	simple-exceptions
+	s11n
+	srfi-1
+	srfi-18
+	srfi-4
+	srfi-69
+	system-information
+	tcp6
+	typed-records
+	)
+
+;; udat struct, used by both caller and callee
+;; instantiated as uconn by convention
+;;
+(defstruct udat
+  ;; the listener side
+  (port #f)
+  (host-port #f)
+  (socket #f)
+  ;; the peers
+  (peers  (make-hash-table)) ;; host:port->peer
+  ;; work handling
+  (work-queue (make-mailbox))
+  (work-proc  #f)                ;; set by user
+  (cnum       0)                 ;; cookie number
+  (mboxes     (make-hash-table)) ;; for the replies
+  (avail-cmboxes '())            ;; list of (<cookie> . <mbox>) for re-use
+  ;; threads
+  (numthreads 10)
+  (cmd-thread #f)
+  (work-queue-thread #f)
+  (num-threads-running 0)
+  ) 
+
+;; Parameters
+
+;; work-method:
+(define work-method (make-parameter 'mailbox))
+;;    mailbox - all rdat goes through mailbox
+;;    threads - all rdat immediately executed in new thread
+;;    direct  - no queuing
+;;
+
+;; return-method, return the result to waiting send-receive:
+(define return-method (make-parameter 'mailbox))
+;;    mailbox - create a mailbox and use it for passing returning results to send-receive
+;;    polling - put the result in a hash table keyed by qrykey and send-receive can poll it for result
+;;    direct  - no queuing, result is passed back in single tcp connection
+;;
+
+;; ;; struct for keeping track of others we are talking to
+;; ;;
+;; (defstruct pdat
+;;   (host-port  #f)
+;;   (conns      '()) ;; list of pcon structs, pop one off when calling the peer
+;;   )
+;; 
+;; ;; struct for peer connections, keep track of expiration etc.
+;; ;;
+;; (defstruct pcon
+;;   (inp #f)
+;;   (oup #f)
+;;   (exp (+ (current-seconds) 59)) ;; expires at this time, set to (+ (current-seconds) 59)
+;;   (lifetime (+ (current-seconds) 600)) ;; throw away and create new after five minutes
+;;   )
+
+;;======================================================================
+;; listener
+;;======================================================================
+
+;; is uconn a ulex connector (listener)
+;;
+(define (ulex-listener? uconn)
+  (udat? uconn))
+
+;; create a tcp listener and return a populated udat struct with
+;; my port, address, hostname, pid etc.
+;; return #f if fail to find a port to allocate.
+;;
+;;  if udata-in is #f create the record
+;;  if there is already a serv-listener return the udata
+;;
+(define (setup-listener uconn #!optional (port 4242))
+  (handle-exceptions
+   exn
+   (if (< port 65535)
+       (setup-listener uconn (+ port 1))
+       #f)
+   (connect-listener uconn port)))
+
+(define (connect-listener uconn port)
+  ;; (tcp-listener-socket LISTENER)(socket-name so)
+  ;; sockaddr-address, sockaddr-port, sockaddr->string
+  (let* ((tlsn (tcp-listen port 1000 #f)) ;; (tcp-listen TCPPORT [BACKLOG [HOST]])
+	 (addr (get-my-best-address))) ;; (hostinfo-addresses (host-information (current-hostname)))
+    (udat-port-set!      uconn port)
+    (udat-host-port-set! uconn (conc addr":"port))
+    (udat-socket-set!    uconn tlsn)
+    uconn))
+
+;; run-listener does all the work of starting a listener in a thread
+;; it then returns control
+;;
+(define (run-listener handler-proc #!optional (port-suggestion 4242))
+  (let* ((uconn (make-udat)))
+    (udat-work-proc-set! uconn handler-proc)
+    (if (setup-listener uconn port-suggestion)
+	(let* ((th1 (make-thread (lambda ()(ulex-cmd-loop uconn)) "Ulex command loop"))
+	       (th2 (make-thread (lambda ()
+				   (case (work-method)
+				     ((mailbox limited)
+				      (process-work-queue uconn))))
+				 "Ulex work queue processor")))
+	  ;; (tcp-buffer-size 2048)
+	  (thread-start! th1)
+	  (thread-start! th2)
+	  (udat-cmd-thread-set! uconn th1)
+	  (udat-work-queue-thread-set! uconn th2)
+	  (print "cmd loop and process workers started, listening on "(udat-host-port uconn)".")
+	  uconn)
+	(assert #f "ERROR: run-listener called without proper setup."))))
+
+(define (wait-and-close uconn)
+  (thread-join! (udat-cmd-thread uconn))
+  (tcp-close (udat-socket uconn)))
+
+;;======================================================================
+;; peers and connections
+;;======================================================================
+
+(define *send-mutex* (make-mutex))
+
+;; send structured data to recipient
+;;
+;;  NOTE: qrykey is what was called the "cookie" previously
+;;
+;;     retval tells send to expect and wait for return data (one line) and return it or time out
+;;       this is for ping where we don't want to necessarily have set up our own server yet.
+;;
+;; NOTE: see below for beginnings of code to allow re-use of tcp connections
+;;        - I believe (without substantial evidence) that re-using connections will
+;;          be beneficial ...
+;;
+(define (send udata host-port qrykey cmd params)
+  (let* ((my-host-port (udat-host-port udata))          ;; remote will return to this
+	 (isme         #f #;(equal? host-port my-host-port)) ;; calling myself?
+	 ;; dat is a self-contained work block that can be sent or handled locally
+	 (dat          (list my-host-port qrykey cmd params #;(cons (current-seconds)(current-milliseconds)))))
+    (cond
+     (isme (ulex-handler udata dat)) ;; no transmission needed
+     (else
+      (handle-exceptions ;; TODO - MAKE THIS EXCEPTION CMD SPECIFIC?
+	  exn
+	  (message exn)
+	(begin
+	  ;; (mutex-lock! *send-mutex*) ;; DOESN'T SEEM TO HELP
+	  (let-values (((inp oup)(tcp-connect host-port)))
+	    (let ((res (if (and inp oup)
+			   (begin
+			     (serialize dat oup)
+			     (close-output-port oup)
+			     (deserialize inp)
+			     )
+			   (begin
+			     (print "ERROR: send called but no receiver has been setup. Please call setup first!")
+			     #f))))
+	      (close-input-port inp)
+	      ;; (mutex-unlock! *send-mutex*) ;; DOESN'T SEEM TO HELP
+	      res)))))))) ;; res will always be 'ack unless return-method is direct
+
+(define (send-via-polling uconn host-port cmd data)
+  (let* ((qrykey (make-cookie uconn))
+	 (sres   (send uconn host-port qrykey cmd data)))
+    (case sres
+      ((ack)
+       (let loop ((start-time (current-milliseconds)))
+	 (if (> (current-milliseconds)(+ start-time 10000)) ;; ten seconds timeout
+	     (begin
+	       (print "ULEX ERROR: timed out waiting for response from "host-port", "cmd" "data)
+	       #f)
+	     (let* ((result (hash-table-ref/default (udat-mboxes uconn) qrykey #f))) ;; NOTE: we are re-using mboxes hash
+	       (if result ;; result is '(status . result-data) or #f for nothing yet
+		   (begin
+		     (hash-table-delete! (udat-mboxes uconn) qrykey)
+		     (cdr result))
+		   (begin
+		     (thread-sleep! 0.01)
+		     (loop start-time)))))))
+      (else
+       (print "ULEX ERROR: Communication failed? sres="sres)
+       #f))))
+
+(define (send-via-mailbox uconn host-port cmd data)
+  (let* ((cmbox     (get-cmbox uconn)) ;; would it be better to keep a stack of mboxes to reuse?
+	 (qrykey    (car cmbox))
+	 (mbox      (cdr cmbox))
+	 (mbox-time (current-milliseconds))
+	 (sres      (send uconn host-port qrykey cmd data))) ;; short res
+    (if (eq? sres 'ack) ;; BUG: change to be less than server:expiration-timeout?
+	(let* ((mbox-timeout-secs    120 #;(if (eq? 'primordial (thread-name (current-thread)))
+				     #f
+				     120)) ;; timeout)
+	       (mbox-timeout-result 'MBOX_TIMEOUT)
+	       (res                  (mailbox-receive! mbox mbox-timeout-secs mbox-timeout-result))
+	       (mbox-receive-time    (current-milliseconds)))
+	  ;; (put-cmbox uconn cmbox) ;; reuse mbox and cookie. is it worth it?
+	  (hash-table-delete! (udat-mboxes uconn) qrykey)
+	  (if (eq? res 'MBOX_TIMEOUT)
+	      (begin
+		(print "WARNING: mbox timed out for query "cmd", with data "data
+		       ", waiting for response from "host-port".")
+
+		;; here it might make sense to clean up connection records and force clean start?
+		;; NO. The progam using ulex needs to do the reset. Right thing here is exception
+		
+		#f)  ;; convert to raising exception?
+	      res))
+	(begin
+	  (print "ERROR: Communication failed? Got "sres)
+	  #f))))
+  
+;; send a request to the given host-port and register a mailbox in udata
+;; wait for the mailbox data and return it
+;;
+(define (send-receive uconn host-port cmd data)
+  (let* ((start-time (current-milliseconds))
+	 (result     (cond
+		      ((member cmd '(ping goodbye)) ;; these are immediate
+		       (send uconn host-port 'ping cmd data))
+		      ((eq? (work-method) 'direct)
+		       ;; the result from send will be the actual result, not an 'ack
+		       (send uconn host-port 'direct cmd data))
+		      (else
+		       (case (return-method)
+			 ((polling)
+			  (send-via-polling uconn host-port cmd data))
+			 ((mailbox) 
+			  (send-via-mailbox uconn host-port cmd data))
+			 (else
+			  (print "ULEX ERROR: unrecognised return-method "(return-method)".")
+			  #f)))))
+	 (duration    (- (current-milliseconds) start-time)))
+    ;; this is ONLY for development and debugging. It will be removed once Ulex is stable.
+    (if (< 5000 duration)
+	(print "ULEX WARNING: round-trip took "(inexact->exact (round (/ duration 1000)))
+	       " seconds; "cmd", host-port="host-port", data="data))
+    result))
+    
+
+;;======================================================================
+;; responder side
+;;======================================================================
+
+;; take a request, rdat, and if not immediate put it in the work queue
+;;
+;; Reserved cmds; ack ping goodbye response
+;;
+(define (ulex-handler uconn rdat)
+  (assert (list? rdat) "FATAL: ulex-handler give rdat as not list")
+  (match rdat ;;  (string-split controldat)
+    ((rem-host-port qrykey cmd params);; timedata)
+     ;; (print "ulex-handler got: "rem-host-port" qrykey: "qrykey" cmd: "cmd" params: "params)
+     (case cmd
+       ;; ((ack )(print "Got ack! But why? Should NOT get here.") 'ack)
+       ((ping)
+	;; (print "Got Ping!")
+	;; (add-to-work-queue uconn rdat)
+	'ack)
+       ((goodbye)
+	;; just clear out references to the caller. NOT COMPLETE
+	(add-to-work-queue uconn rdat)
+	'ack)
+       ((response) ;; this is a result from remote processing, send it as mail ...
+	(case (return-method)
+	  ((polling)
+	   (hash-table-set! (udat-mboxes uconn) qrykey (cons 'ok params))
+	   'ack)
+	  ((mailbox)
+	   (let ((mbox (hash-table-ref/default (udat-mboxes uconn) qrykey #f)))
+	     (if mbox
+		 (begin
+		   (mailbox-send! mbox params) ;; params here is our result
+		   'ack)
+		 (begin
+		   (print "ERROR: received result but no associated mbox for cookie "qrykey)
+		   'no-mbox-found))))
+	  (else (print "ULEX ERROR: unrecognised return-method "(return-method))
+		'bad-return-method)))
+       (else ;; generic request - hand it to the work queue
+	(add-to-work-queue uconn rdat)
+	'ack)))
+    (else
+     (print "ULEX ERROR: bad rdat "rdat)
+     'bad-rdat)))
+
+;; given an already set up uconn start the cmd-loop
+;;
+(define (ulex-cmd-loop uconn)
+  (let* ((serv-listener (udat-socket uconn))
+	 (listener      (lambda ()
+			  (let loop ((state 'start))
+			    (let-values (((inp oup)(tcp-accept serv-listener)))
+			      ;; (mutex-lock! *send-mutex*) ;; DOESN'T SEEM TO HELP
+			      (let* ((rdat  (deserialize inp)) ;; '(my-host-port qrykey cmd params)
+				     (resp  (ulex-handler uconn rdat)))
+				(serialize resp oup)
+				(close-input-port inp)
+				(close-output-port oup)
+				;; (mutex-unlock! *send-mutex*) ;; DOESN'T SEEM TO HELP
+				)
+			      (loop state))))))
+    ;; start N of them
+    (let loop ((thnum   0)
+	       (threads '()))
+      (if (< thnum 100)
+	  (let* ((th (make-thread listener (conc "listener" thnum))))
+	    (thread-start! th)
+	    (loop (+ thnum 1)
+		  (cons th threads)))
+	  (map thread-join! threads)))))
+
+;; add a proc to the cmd list, these are done symetrically (i.e. in all instances)
+;; so that the proc can be dereferenced remotely
+;;
+(define (set-work-handler uconn proc)
+  (udat-work-proc-set! uconn proc))
+
+;;======================================================================
+;; work queues - this is all happening on the listener side
+;;======================================================================
+
+;; rdat is (rem-host-port qrykey cmd params)
+					     
+(define (add-to-work-queue uconn rdat)
+  #;(queue-add! (udat-work-queue uconn) rdat)
+  (case (work-method)
+    ((threads)
+     (thread-start! (make-thread (lambda ()
+				   (do-work uconn rdat))
+				 "worker thread")))
+    ((mailbox)
+     (mailbox-send! (udat-work-queue uconn) rdat))
+    ((direct)
+     (do-work uconn rdat))
+    (else
+     (print "ULEX ERROR: work-method "(work-method)" not recognised, using mailbox.")
+     (mailbox-send! (udat-work-queue uconn) rdat))))
+     
+;; move the logic to return the result somewhere else?
+;;
+(define (do-work uconn rdat)
+  (let* ((proc (udat-work-proc uconn))) ;; get it each time - conceivebly it could change
+    ;; put this following into a do-work procedure
+    (match rdat
+      ((rem-host-port qrykey cmd params)
+       (let* ((start-time (current-milliseconds))
+	      (result (proc rem-host-port qrykey cmd params))
+	      (end-time (current-milliseconds))
+	      (run-time (- end-time start-time)))
+	 (case (work-method)
+	   ((direct) result)
+	   (else
+	    (if (> run-time 1000)(print "ULEX: Warning, work "cmd", "params" done in "run-time" ms"))
+	    ;; send 'response as cmd and result as params
+	    (send uconn rem-host-port qrykey 'response result) ;; could check for ack
+	    (let* ((duration (- (current-milliseconds) end-time)))
+	      (if (> duration 500)(print "ULEX: Warning, response sent back to "rem-host-port" for "qrykey" in "duration"ms")))))))
+      (MBOX_TIMEOUT 'do-work-timeout)
+      (else
+       (print "ERROR: rdat "rdat", did not match rem-host-port qrykey cmd params")))))
+
+;; NEW APPROACH:
+;;   
+(define (process-work-queue uconn) 
+  (let ((wqueue (udat-work-queue uconn))
+	(proc   (udat-work-proc  uconn))
+	(numthr (udat-numthreads uconn)))
+    (let loop ((thnum    1)
+	       (threads '()))
+      (let ((thlst (cons (make-thread (lambda ()
+					(let work-loop ()
+					  (let ((rdat (mailbox-receive! wqueue 24000 'MBOX_TIMEOUT)))
+					    (do-work uconn rdat))
+					  (work-loop)))
+				      (conc "work thread " thnum))
+			 threads)))
+	(if (< thnum numthr)
+	    (loop (+ thnum 1)
+		  thlst)
+	    (begin
+	      (print "ULEX: Starting "(length thlst)" worker threads.")
+	      (map thread-start! thlst)
+	      (print "ULEX: Threads started. Joining all.")
+	      (map thread-join! thlst)))))))
+
+;; below was to enable re-use of connections. This seems non-trivial so for
+;; now lets open on each call
+;;
+;; ;; given host-port get or create peer struct
+;; ;;
+;; (define (udat-get-peer uconn host-port)
+;;   (or (hash-table-ref/default (udat-peers uconn) host-port #f)
+;;       ;; no peer, so create pdat and init it
+;;       
+;;       ;; NEED stack of connections, pop and use; inp, oup,
+;;       ;; creation_time (remove and create new if over 24hrs old
+;;       ;; 
+;;       (let ((pdat (make-pdat host-port: host-port)))
+;; 	(hash-table-set! (udat-peers uconn) host-port pdat)
+;; 	pdat)))
+;; 
+;; ;; is pcon alive
+;; 
+;; ;; given host-port and pdat get a pcon
+;; ;;
+;; (define (pdat-get-pcon pdat host-port)
+;;   (let loop ((conns (pdat-conns pdat)))
+;;     (if (null? conns) ;; none? make and return - do NOT add - it will be pushed back on list later
+;; 	(init-pcon (make-pcon))
+;; 	(let* ((conn (pop conns)))
+;; 	  
+;; ;; given host-port get a pcon struct
+;; ;;
+;; (define (udat-get-pcon 
+      
+;;======================================================================
+;; misc utils
+;;======================================================================
+
+(define (make-cookie uconn)
+  (let ((newcnum (+ (udat-cnum uconn) 1)))
+    (udat-cnum-set! uconn newcnum)
+    (conc (udat-host-port uconn) ":"
+	  newcnum)))
+
+;; cookie/mboxes
+
+;; we store each mbox with a cookie (<cookie> . <mbox>)
+;;
+(define (get-cmbox uconn)
+  (if (null? (udat-avail-cmboxes uconn))
+      (let ((cookie (make-cookie uconn))
+	    (mbox   (make-mailbox)))
+	(hash-table-set! (udat-mboxes uconn) cookie mbox)
+	`(,cookie . ,mbox))
+      (let ((cmbox (car (udat-avail-cmboxes uconn))))
+	(udat-avail-cmboxes-set! uconn (cdr (udat-avail-cmboxes uconn)))
+	cmbox)))
+
+(define (put-cmbox uconn cmbox)
+  (udat-avail-cmboxes-set! uconn (cons cmbox (udat-avail-cmboxes uconn))))
+
+(define (pp-uconn uconn)
+  (pp (udat->alist uconn)))
+
+  
 ;;======================================================================
 ;; network utilities
 ;;======================================================================
+
+;; NOTE: Look at address-info egg as alternative to some of this
 
 (define (rate-ip ipaddr)
   (regex-case ipaddr
     ( "^127\\..*" _ 0 )
     ( "^(10\\.0|192\\.168)\\..*" _ 1 )
@@ -354,1899 +545,26 @@
 
 ;; Change this to bias for addresses with a reasonable broadcast value?
 ;;
 (define (ip-pref-less? a b)
   (> (rate-ip a) (rate-ip b)))
-  
 
 (define (get-my-best-address)
-  (let ((all-my-addresses (get-all-ips))
-        ;;(all-my-addresses-old (vector->list (hostinfo-addresses (hostname->hostinfo (get-host-name)))))
-        )
+  (let ((all-my-addresses (get-all-ips)))
     (cond
      ((null? all-my-addresses)
       (get-host-name))                                          ;; no interfaces?
      ((eq? (length all-my-addresses) 1)
       (car all-my-addresses))                      ;; only one to choose from, just go with it
-     
      (else
-      (car (sort all-my-addresses ip-pref-less?)))
-     ;; (else 
-     ;;  (ip->string (car (filter (lambda (x)                      ;; take any but 127.
-     ;;    			 (not (eq? (u8vector-ref x 0) 127)))
-     ;;    		       all-my-addresses))))
-
-     )))
+      (car (sort all-my-addresses ip-pref-less?))))))
 
 (define (get-all-ips-sorted)
   (sort (get-all-ips) ip-pref-less?))
 
 (define (get-all-ips)
-  (map ip->string (vector->list 
-		   (hostinfo-addresses
-		    (host-information (current-hostname))))))
-
-(define (udat-my-host-port udata)
-  (if (and (udat-my-address udata)(udat-my-port udata))
-      (conc (udat-my-address udata) ":" (udat-my-port udata))
-      #f))
-
-(define (udat-captain-host-port udata)
-  (if (and (udat-captain-address udata)(udat-captain-port udata))
-      (conc (udat-captain-address udata) ":" (udat-captain-port udata))
-      #f))
-
-(define (udat-get-peer udata host-port)
-  (hash-table-ref/default (udat-peers udata) host-port #f))
-
-;; struct for keeping track of others we are talking to
-
-(defstruct peer
-  (addr-port       #f)
-  (hostname        #f)
-  (pid             #f)
-  ;; (inp             #f)
-  ;; (oup             #f)
-  (dbs            '()) ;; list of databases this peer is currently handling
-  )
-
-(defstruct work
-  (peer-dat   #f)
-  (handlerkey #f)
-  (qrykey     #f)
-  (data       #f)
-  (start      (current-milliseconds)))
-
-#;(defstruct dbowner
-  (pdat        #f)
-  (last-update (current-seconds)))
-
-;;======================================================================
-;; Captain functions
-;;======================================================================
-
-;; NB// This needs to be started in a thread
-;;
-;; setup to be a captain
-;;   - local server MUST be started already
-;;   - create pkt
-;;   - start server port handler
-;;
-(define (setup-as-captain udata)
-  (if (create-captain-pkt udata)
-      (let* ((my-addr (udat-my-address udata))
-	     (my-port (udat-my-port    udata))
-	     (th (make-thread (lambda ()
-				(ulex-handler-loop udata)) "Captain handler")))
-	(udat-handler-thread-set! udata th)
-	(udat-captain-address-set! udata my-addr)
-	(udat-captain-port-set!    udata my-port)
-	(thread-start! th))
-      (begin
-	(print "ERROR: failed to create captain pkt")
-	#f)))
-
-;; given a pkts dir read 
-;;
-(define (get-all-captain-pkts udata)
-  (let* ((pktsdir       (let ((d (udat-cpkts-dir udata)))
-			  (if (file-exists? d)
-			      d
-			      (begin
-				(create-directory d #t)
-				d))))
-	 (all-pkt-files (glob (conc pktsdir "/*.pkt")))
-	 (pkt-spec      (udat-cpkt-spec udata)))
-    (map (lambda (pkt-file)
-	   (read-pkt->alist pkt-file pktspec: pkt-spec))
-	 all-pkt-files)))
-
-;; sort by D then Z, return one, choose the oldest then
-;; differentiate if needed using the Z key
-;;l
-(define (get-winning-pkt pkts)
-  (if (null? pkts)
-      #f
-      (car (sort pkts (lambda (a b)
-			(let ((ad (string->number (alist-ref 'D a)))
-			      (bd (string->number (alist-ref 'D b))))
-			  (if (eq? a b)
-			      (let ((az (alist-ref 'Z a))
-				    (bz (alist-ref 'Z b)))
-				(string>=? az bz))
-			      (> ad bd))))))))
-
-;; put the host, ip, port and pid into a pkt in
-;; the captain pkts dir
-;;  - assumes user has already fired up a server
-;;    which will be in the udata struct
-;;
-(define (create-captain-pkt udata)
-  (if (not (udat-serv-listener udata))
-      (begin
-	(print "ERROR: create-captain-pkt called with out a listener")
-	#f)
-      (let* ((pktdat `((port   . ,(udat-my-port udata))
-		       (host   . ,(udat-my-hostname udata))
-		       (ipaddr . ,(udat-my-address udata))
-		       (pid    . ,(udat-my-pid     udata))))
-	     (pktdir  (udat-cpkts-dir udata))
-	     (pktspec (udat-cpkt-spec udata))
-	     )
-	(udat-my-cpkt-key-set!
-	 udata
-	 (write-alist->pkt
-	  pktdir
-	  pktdat
-	  pktspec: pktspec
-	  ptype:   'captain))
-	(udat-my-cpkt-key udata))))
-
-;; remove pkt associated with captn (the Z key .pkt)
-;;
-(define (remove-captain-pkt udata captn)
-  (let ((Z       (alist-ref 'Z captn))
-	(cpktdir (udat-cpkts-dir udata)))
-    (delete-file* (conc cpktdir "/" Z ".pkt"))))
-
-;; call all known peers and tell them to delete their info on the captain
-;; thus forcing them to re-read pkts and connect to a new captain
-;; call this when the captain needs to exit and if an older captain is
-;; detected. Due to delays in sending file meta data in NFS multiple
-;; captains can be initiated in a "Storm of Captains", book soon to be
-;; on Amazon
-;;
-(define (drop-captain udata)
-  (let* ((peers (hash-table-keys (udat-peers udata)))
-	 (cookie (make-cookie udata)))
-    (for-each
-     (lambda (host-port)
-       (send udata host-port 'dropcaptain cookie "nomsg" retval: #t))
-     peers)))
-
-;;======================================================================
-;; server primitives
-;;======================================================================
-
-(define (make-cookie udata)
-  (let ((newcnum (+ (udat-cnum udata) 1)))
-    (udat-cnum-set! udata newcnum)
-    (conc (udat-my-address udata) ":"
-	  (udat-my-port    udata) "-"
-	  (udat-my-pid     udata) "-"
-	  newcnum)))
-
-;; create a tcp listener and return a populated udat struct with
-;; my port, address, hostname, pid etc.
-;; return #f if fail to find a port to allocate.
-;;
-;;  if udata-in is #f create the record
-;;  if there is already a serv-listener return the udata
-;;
-(define (start-server-find-port udata-in #!optional (port 4242))
-  (let ((udata (or udata-in (make-udat))))
-    (if (udat-serv-listener udata) ;; TODO - add check that the listener is alive and ready?
-	udata
-	(handle-exceptions
-	    exn
-	  (if (< port 65535)
-	      (start-server-find-port udata (+ port 1))
-	      #f)
-	  (connect-server udata port)))))
-
-(define (connect-server udata port)
-  ;; (tcp-listener-socket LISTENER)(socket-name so)
-  ;; sockaddr-address, sockaddr-port, sockaddr->string
-  (let* ((tlsn (tcp-listen port 1000 #f)) ;; (tcp-listen TCPPORT [BACKLOG [HOST]])
-	 (addr (get-my-best-address))) ;; (hostinfo-addresses (host-information (current-hostname)))
-    (udat-my-address-set!    udata addr)
-    (udat-my-port-set!       udata port)
-    (udat-my-hostname-set!   udata (get-host-name))
-    (udat-serv-listener-set! udata tlsn)
-    udata))
-
-(define (get-peer-dat udata host-port #!optional (hostname #f)(pid #f))
-  (let* ((pdat (or (udat-get-peer udata host-port)
-		   (handle-exceptions ;; ERROR - MAKE THIS EXCEPTION HANDLER MORE SPECIFIC
-		    exn
-		    #f
-		    (let ((npdat (make-peer addr-port: host-port)))
-		      (if hostname (peer-hostname-set! npdat hostname))
-		      (if pid (peer-pid-set! npdat pid))
-		      npdat)))))
-    pdat))
-
-;; send structured data to recipient
-;;
-;;  NOTE: qrykey is what was called the "cookie" previously
-;;
-;;     retval tells send to expect and wait for return data (one line) and return it or time out
-;;       this is for ping where we don't want to necessarily have set up our own server yet.
-;;
-(define (send udata host-port handler qrykey data
-	      #!key (hostname #f)(pid #f)(params '())(retval #f))
-  (let* ((my-host-port (udat-my-host-port udata))
-	 (isme         (equal? host-port my-host-port)) ;; am I calling
-							;; myself?
-	 (dat          (list
-			handler              ;; " "
-			my-host-port         ;; " "
-			(udat-my-pid  udata) ;; " "
-			qrykey
-			params ;;(if (null? params) "" (conc " "
-			       ;;(string-intersperse params " ")))
-			)))
-    ;; (print "send isme is " (if isme "true!" "false!") ",
-    ;; my-host-port: " my-host-port ", host-port: " host-port)
-    (if isme
-	(ulex-handler udata dat data)
-	(handle-exceptions ;; ERROR - MAKE THIS EXCEPTION HANDLER MORE
-			   ;; SPECIFIC
-	    exn
-	    #f 
-	  (let-values (((inp oup)(tcp-connect host-port)))
-	    ;;
-	    ;; CONTROL LINE:
-	    ;;    handlerkey host:port pid qrykey params ...
-	    ;;
-	    (let ((res
-		   (if (and inp oup)
-		       (let* ()
-			 (if my-host-port
-			     (begin
-			       (write dat  oup)
-			       (write data oup) ;; send as sexpr
-			       ;; (print "Sent dat: " dat " data: " data)
-			       (if retval
-				   (read inp)
-				   #t))
-			     (begin
-			       (print "ERROR: send called but no receiver has been setup. Please call setup first!")
-			       #f))
-			 ;; NOTE: DO NOT BE TEMPTED TO LOOK AT ANY DATA ON INP HERE!
-			 ;;       (there is a listener for handling that)
-			 )
-		       #f))) ;; #f means failed to connect and send
-	      (close-input-port inp)
-	      (close-output-port oup)
-	      res))))))
-
-;; send a request to the given host-port and register a mailbox in udata
-;; wait for the mailbox data and return it
-;;
-(define (send-receive udata host-port handler qrykey data #!key (hostname #f)(pid #f)(params '())(timeout 20))
-  (let ((mbox      (make-mailbox))
-	(mbox-time (current-milliseconds))
-	(mboxes    (udat-mboxes udata)))
-    (hash-table-set! mboxes qrykey mbox)
-    (if (send udata host-port handler qrykey data hostname: hostname pid: pid params: params)
-	(let* ((mbox-timeout-secs    timeout)
-	       (mbox-timeout-result 'MBOX_TIMEOUT)
-	       (res                  (mailbox-receive! mbox mbox-timeout-secs mbox-timeout-result))
-	       (mbox-receive-time    (current-milliseconds)))
-	  (hash-table-delete! mboxes qrykey)
-	  (if (eq? res 'MBOX_TIMEOUT)
-	      #f
-	      res))
-	#f))) ;; #f means failed to communicate
-
-;; 
-(define (ulex-handler udata controldat data)
-  (print "controldat: " controldat " data: " data)
-  (match controldat ;;  (string-split controldat)
-    ((handlerkey host-port pid qrykey params ...)
-     ;; (print "handlerkey: " handlerkey " host-port: " host-port " pid: " pid " qrykey: " qrykey " params: " params)
-     (case handlerkey ;; (string->symbol handlerkey)
-       ((ack)(print "Got ack!"))
-       ((ping) ;; special case - return result immediately on the same connection
-	(let* ((proc  (hash-table-ref/default (udat-handlers udata) 'ping #f))
-	       (val   (if proc (proc) "gotping"))
-	       (peer  (make-peer addr-port: host-port pid: pid))
-	       (dbshash (udat-dbowners udata)))
-	  (peer-dbs-set! peer params) ;; params for ping is list of dbs owned by pinger
-	  (for-each (lambda (dbfile)
-		      (hash-table-set! dbshash dbfile host-port)) ;; WRONG?
-		    params) ;; register each db in the dbshash
-	  (if (not (hash-table-exists? (udat-peers udata) host-port))
-	      (hash-table-set! (udat-peers udata) host-port peer)) ;; save the details of this caller in peers
-	  qrykey)) ;; End of ping
-       ((goodbye)
-	;; remove all traces of the caller in db ownership etc.
-	(let* ((peer  (hash-table-ref/default (udat-peers udata) host-port #f))
-	       (dbs   (if peer (peer-dbs peer) '()))
-	       (dbshash (udat-dbowners udata)))
-	  (for-each (lambda (dbfile)(hash-table-delete! dbshash dbfile)) dbs)
-	  (hash-table-delete! (udat-peers udata) host-port)
-	  qrykey))
-       ((dropcaptain)
-	;; remove all traces of the captain
-	(udat-captain-address-set! udata #f)
-	(udat-captain-host-set!    udata #f)
-	(udat-captain-port-set!    udata #f)
-	(udat-captain-pid-set!     udata #f)
-	qrykey)
-       ((rucaptain) ;; remote is asking if I'm the captain
-	(if (udat-my-cpkt-key udata) "yes" "no"))
-       ((db-owner) ;; given a db name who do I send my queries to
-	;; look up the file in handlers, if have an entry ping them to be sure
-	;; they are still alive and then return that host:port.
-	;; if no handler found or if the ping fails pick from peers the oldest that
-	;; is managing the fewest dbs
-	(match params
-	  ((dbfile dbtype)
-	   (let* ((owner-host-port (hash-table-ref/default (udat-dbowners udata) dbfile #f)))
-	     (if owner-host-port
-		 (conc qrykey " " owner-host-port)
-		 (let* ((pdat (or (hash-table-ref/default (udat-peers udata) host-port #f) ;; no owner - caller gets to own it!
-				  (make-peer addr-port: host-port pid: pid dbs: `(,dbfile)))))
-		   (hash-table-set! (udat-peers udata) host-port pdat)
-		   (hash-table-set! (udat-dbowners udata) dbfile host-port)
-		   (conc qrykey " " host-port)))))
-	  (else (conc qrykey " BADDATA"))))
-       ;; for work items:
-       ;;    handler is one of; immediate, read-only, read-write, high-priority
-       ((immediate read-only normal low-priority) ;; do this work immediately
-	;; host-port (caller), pid (caller), qrykey (cookie), params <= all from first line
-	;; data => a single line encoded however you want, or should I build json into it?
-	(print "handlerkey=" handlerkey)
-	(let* ((pdat (get-peer-dat udata host-port)))
-	  (match params ;; dbfile prockey procparam
-	    ((dbfile prockey procparam)
-	     (case handlerkey
-	       ((immediate read-only)
-		(process-request udata pdat dbfile qrykey prockey procparam data))
-	       ((normal low-priority) ;; split off later and add logic to support low priority
-		(add-to-work-queue udata pdat dbfile qrykey prockey procparam data))
-	       (else
-		#f)))
-	    (else
-	     (print "INFO: params=" params " handlerkey=" handlerkey " controldat=" controldat)
-	     #f))))
-       (else
-	;; (add-to-work-queue udata (get-peer-dat udata host-port) handlerkey qrykey data)
-	#f)))
-    (else
-     (print "BAD DATA? controldat=" controldat " data=" data)
-     #f)));; handles the incoming messages and dispatches to queues
-
-;;
-(define (ulex-handler-loop udata)
-  (let* ((serv-listener (udat-serv-listener udata)))
-    ;; data comes as two lines
-    ;;   handlerkey resp-addr:resp-port hostname pid qrykey [dbpath/dbfile.db]
-    ;;   data
-    (let loop ((state 'start))
-      (let-values (((inp oup)(tcp-accept serv-listener)))
-	(let* ((controldat (read inp))
-	       (data       (read inp))
-	       (resp       (ulex-handler udata controldat data)))
-	  (if resp (write resp oup))
-	  (close-input-port inp)
-	  (close-output-port oup))
-	(loop state)))))
-
-;; add a proc to the handler list, these are done symetrically (i.e. in all instances)
-;; so that the proc can be dereferenced remotely
-;;
-(define (register-handler udata key proc)
-  (hash-table-set! (udat-handlers udata) key proc))
-
-
-;;======================================================================
-;; work queues
-;;======================================================================
-
-(define (add-to-work-queue udata peer-dat handlerkey qrykey data)
-  (let ((wdat (make-work peer-dat: peer-dat handlerkey: handlerkey qrykey: qrykey data: data)))
-    (if (udat-busy udata)
-	(queue-add! (udat-work-queue udata) wdat)
-	(process-work udata wdat)) ;; passing in wdat tells process-work to first process the passed in wdat
-    ))
-
-(define (do-work udata wdat)
-  #f)
-
-(define (process-work udata #!optional wdat)
-  (if wdat (do-work udata wdat)) ;; process wdat
-  (let ((wqueue (udat-work-queue udata)))
-    (if (not (queue-empty? wqueue))
-	(let loop ((wd (queue-remove! wqueue)))
-	  (do-work udata wd)
-	  (if (not (queue-empty? wqueue))
-	      (loop (queue-remove! wqueue)))))))
-
-;;======================================================================
-;; Generic db handling
-;;   setup a inmem db instance
-;;   open connection to on-disk db
-;;   sync on-disk db to inmem
-;;   get lock in on-disk db for dbowner of this db
-;;   put sync-proc, init-proc, on-disk handle, inmem handle in dbconn stuct
-;;   return the stuct
-;;======================================================================
-
-(defstruct dbconn
-  (fname  #f)
-  (inmem  #f)
-  (conn   #f)
-  (sync   #f) ;; sync proc
-  (init   #f) ;; init proc
-  (lastsync (current-seconds))
-  )
-
-(defstruct dbinfo
-  (initproc #f)
-  (syncproc #f))
-
-;; open inmem and disk database
-;;   init with initproc
-;;   return db struct
-;;
-;;   appname; megatest, ulex or something else.
-;;
-(define (setup-db-connection udata fname-in appname dbtype)
-  (let* ((is-ulex (eq? appname 'ulex))
-	 (dbinf   (if is-ulex ;; ulex is a built-in special case
-		      (make-dbinfo initproc: ulexdb-init syncproc: ulexdb-sync)
-		      (hash-table-ref/default (udat-dbtypes udata) dbtype #f)))
-	 (initproc (dbinfo-initproc dbinf))
-	 (syncproc (dbinfo-syncproc dbinf))
-	 (fname   (if is-ulex
-		      (conc (udat-ulex-dir udata) "/ulex.db")
-		      fname-in))
-	 (inmem-db (open-and-initdb udata #f 'inmem (dbinfo-initproc dbinf)))
-	 (disk-db  (open-and-initdb udata fname 'disk (dbinfo-initproc dbinf))))
-    (make-dbconn inmem: inmem-db conn: disk-db sync: syncproc init: initproc)))
-
-;; dest='inmem or 'disk
-;;
-(define (open-and-initdb udata filename dest init-proc)
-  (let* ((inmem    (eq? dest 'inmem))
-	 (dbfile   (if inmem
-		       ":INMEM:"
-		       filename))
-	 (dbexists (if inmem #t (file-exists? dbfile)))
-	 (db       (sqlite3:open-database dbfile)))
-    (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000))
-    (if (not dbexists)
-	(init-proc db))
-    db))
-
-
-;;======================================================================
-;; Previous Ulex db stuff
-;;======================================================================
-
-(define (ulexdb-init db inmem)
-  (sqlite3:with-transaction
-   db
-   (lambda ()
-     (for-each
-      (lambda (stmt)
-	(if stmt (sqlite3:execute db stmt)))
-      `("CREATE TABLE IF NOT EXISTS processes 
-                 (id INTEGER PRIMARY KEY,
-                  host  TEXT NOT NULL,
-                  ipadr TEXT NOT NULL,
-                  port  INTEGER NOT NULL,
-                  pid   INTEGER NOT NULL,
-                  regtime INTEGER DEFAULT (strftime('%s','now')),
-                  last_update INTEGER DEFAULT (strftime('%s','now')));"
-	(if inmem
-	    "CREATE TRIGGER  IF NOT EXISTS update_proces_trigger AFTER UPDATE ON processes
-                             FOR EACH ROW
-                               BEGIN 
-                                 UPDATE processes SET last_update=(strftime('%s','now'))
-                                   WHERE id=old.id;
-                               END;"
-	    #f))))))
-
-;; open databases, do initial sync
-(define (ulexdb-sync dbconndat udata)
-  #f)
-
-
-) ;; END OF ULEX
-
-
-;;; ;;======================================================================
-;;; ;; D E B U G   H E L P E R S
-;;; ;;======================================================================
-;;;     
-;;; (define (dbg> . args)
-;;;   (with-output-to-port (current-error-port)
-;;;     (lambda ()
-;;;       (apply print "dbg> " args))))
-;;; 
-;;; (define (debug-pp . args)
-;;;   (if (get-environment-variable "ULEX_DEBUG")
-;;;       (with-output-to-port (current-error-port)
-;;; 	(lambda ()
-;;; 	  (apply pp args)))))
-;;; 
-;;; (define *default-debug-port* (current-error-port))
-;;; 
-;;; (define (sdbg> fn stage-name stage-start stage-end start-time . message)
-;;;   (if (get-environment-variable "ULEX_DEBUG")
-;;;       (with-output-to-port *default-debug-port* 
-;;; 	(lambda ()
-;;; 	  (apply print "ulex:" fn " " stage-name " took " (- (if stage-end stage-end (current-milliseconds)) stage-start) " ms. "
-;;; 		 (if start-time
-;;; 		     (conc "total time " (- (current-milliseconds) start-time)
-;;; 			   " ms.")
-;;; 		     "")
-;;; 		 message
-;;; 		 )))))
-
-;;======================================================================
-;; M A C R O S
-;;======================================================================
-;; iup callbacks are not dumping the stack, this is a work-around
-;;
-
-;; Some of these routines use:
-;;
-;;     http://www.cs.toronto.edu/~gfb/scheme/simple-macros.html
-;;
-;; Syntax for defining macros in a simple style similar to function definiton,
-;;  when there is a single pattern for the argument list and there are no keywords.
-;;
-;; (define-simple-syntax (name arg ...) body ...)
-;;
-;; 
-;; (define-syntax define-simple-syntax
-;;   (syntax-rules ()
-;;     ((_ (name arg ...) body ...)
-;;      (define-syntax name (syntax-rules () ((name arg ...) (begin body ...)))))))
-;; 
-;; (define-simple-syntax (catch-and-dump proc procname)
-;;   (handle-exceptions
-;;    exn
-;;    (begin
-;;      (print-call-chain (current-error-port))
-;;      (with-output-to-port (current-error-port)
-;;        (lambda ()
-;;          (print ((condition-property-accessor 'exn 'message) exn))
-;;          (print "Callback error in " procname)
-;;          (print "Full condition info:\n" (condition->list exn)))))
-;;    (proc)))
-;; 
-;; 
-;;======================================================================
-;;  R E C O R D S
-;;======================================================================
-
-;;; ;; information about me as a server
-;;; ;;
-;;; (defstruct area
-;;;   ;; about this area
-;;;   (useportlogger #f)
-;;;   (lowport       32768)
-;;;   (server-type   'auto)  ;; auto=create up to five servers/pkts, main=create pkts, passive=no pkt (unless there are no pkts at all)
-;;;   (conn          #f)
-;;;   (port          #f)
-;;;   (myaddr        (get-my-best-address))
-;;;   pktid          ;; get pkt from hosts table if needed
-;;;   pktfile
-;;;   pktsdir
-;;;   dbdir
-;;;   (dbhandles     (make-hash-table)) ;; fname => list-of-dbh, NOTE: Should really never need more than one?
-;;;   (mutex         (make-mutex))
-;;;   (rtable        (make-hash-table)) ;; registration table of available actions
-;;;   (dbs           (make-hash-table)) ;; filename => random number, used for choosing what dbs I serve
-;;;   ;; about other servers
-;;;   (hosts         (make-hash-table)) ;; key => hostdat
-;;;   (hoststats     (make-hash-table)) ;; key => alist of fname => ( qcount . qtime )
-;;;   (reqs          (make-hash-table)) ;; uri => queue
-;;;   ;; work queues
-;;;   (wqueues       (make-hash-table)) ;; fname => qdat
-;;;   (stats         (make-hash-table)) ;; fname => totalqueries
-;;;   (last-srvup    (current-seconds)) ;; last time we updated the known servers
-;;;   (cookie2mbox   (make-hash-table)) ;; map cookie for outstanding request to mailbox of awaiting call
-;;;   (ready #f)
-;;;   (health        (make-hash-table)) ;; ipaddr:port => num failed pings since last good ping
-;;;   )
-;;; 
-;;; ;; host stats
-;;; ;;
-;;; (defstruct hostdat
-;;;   (pkt      #f)
-;;;   (dbload   (make-hash-table))  ;; "dbfile.db" => queries/min
-;;;   (hostload #f)                 ;; normalized load ( 5min load / numcpus )
-;;;   )
-;;; 
-;;; ;; dbdat
-;;; ;;
-;;; (defstruct dbdat
-;;;   (dbh    #f)
-;;;   (fname  #f)
-;;;   (write-access #f)
-;;;   (sths   (make-hash-table))  ;; hash mapping query strings to handles
-;;;   )
-;;; 
-;;; ;; qdat
-;;; ;;
-;;; (defstruct qdat
-;;;   (writeq  (make-queue))
-;;;   (readq   (make-queue))
-;;;   (rwq     (make-queue))
-;;;   (logq    (make-queue)) ;; do we need a queue for logging? yes, if we use sqlite3 db for logging
-;;;   (osshort (make-queue))
-;;;   (oslong  (make-queue))
-;;;   (misc    (make-queue)) ;; used for things like ping-full
-;;;   )
-;;; 
-;;; ;; calldat
-;;; ;;
-;;; (defstruct calldat
-;;;   (ctype 'dbwrite)
-;;;   (obj   #f)              ;; this would normally be an SQL statement e.g. SELECT, INSERT etc.
-;;;   (rtime (current-milliseconds)))
-;;; 
-;;; ;; make it a global? Well, it is local to area module
-;;; 
-;;; (define *pktspec*
-;;;   `((server (hostname . h)
-;;; 	    (port     . p)
-;;; 	    (pid      . i)
-;;; 	    (ipaddr   . a)
-;;; 	    )
-;;;     (data   (hostname . h)  ;; sender hostname
-;;; 	    (port     . p)  ;; sender port
-;;; 	    (ipaddr   . a)  ;; sender ip
-;;; 	    (hostkey  . k)  ;; sending host key - store info at server under this key
-;;; 	    (servkey  . s)  ;; server key - this needs to match at server end or reject the msg
-;;; 	    (format   . f)  ;; sb=serialized-base64, t=text, sx=sexpr, j=json
-;;; 	    (data     . d)  ;; base64 encoded slln data
-;;; 	    )))
-;;; 
-;;; ;; work item
-;;; ;;
-;;; (defstruct witem
-;;;   (rhost #f)   ;; return host
-;;;   (ripaddr #f) ;; return ipaddr
-;;;   (rport #f)   ;; return port
-;;;   (servkey #f) ;; the packet representing the client of this workitem, used by final send-message
-;;;   (rdat  #f)   ;; the request - usually an sql query, type is rdat
-;;;   (action #f)  ;; the action: immediate, dbwrite, dbread,oslong, osshort
-;;;   (cookie #f)  ;; cookie id for response
-;;;   (data   #f)  ;; the data payload, i.e. parameters
-;;;   (result #f)  ;; the result from processing the data
-;;;   (caller #f)) ;; the calling peer according to rpc itself
-;;; 
-;;; (define (trim-pktid pktid)
-;;;   (if (string? pktid)
-;;;       (substring pktid 0 4)
-;;;       "nopkt"))
-;;; 
-;;; (define (any->number num)
-;;;   (cond
-;;;    ((number? num) num)
-;;;    ((string? num) (string->number num))
-;;;    (else num)))
-;;; 
-;;; (use trace)
-;;; (trace-call-sites #t)
-;;; 
-;;; ;;======================================================================
-;;; ;; D A T A B A S E   H A N D L I N G 
-;;; ;;======================================================================
-;;; 
-;;; ;; look in dbhandles for a db, return it, else return #f
-;;; ;;
-;;; (define (get-dbh acfg fname)
-;;;   (let ((dbh-lst (hash-table-ref/default (area-dbhandles acfg) fname '())))
-;;;     (if (null? dbh-lst)
-;;; 	(begin
-;;; 	  ;; (print "opening db for " fname)
-;;; 	  (open-db acfg fname)) ;; Note that the handles get put back in the queue in the save-dbh calls
-;;; 	(let ((rem-lst (cdr dbh-lst)))
-;;; 	  ;; (print "re-using saved connection for " fname)
-;;; 	  (hash-table-set! (area-dbhandles acfg) fname rem-lst)
-;;; 	  (car dbh-lst)))))
-;;; 
-;;; (define (save-dbh acfg fname dbdat)
-;;;     ;; (print "saving dbh for " fname)
-;;;     (hash-table-set! (area-dbhandles acfg) fname (cons dbdat (hash-table-ref/default (area-dbhandles acfg) fname '()))))
-;;; 
-;;; ;; open the database, if never before opened init it. put the handle in the
-;;; ;; open db's hash table
-;;; ;; returns: the dbdat
-;;; ;;
-;;; (define (open-db acfg fname)
-;;;   (let* ((fullname     (conc (area-dbdir acfg) "/" fname))
-;;; 	 (exists       (file-exists? fullname))
-;;; 	 (write-access (if exists
-;;; 			   (file-write-access? fullname)
-;;; 			   (file-write-access? (area-dbdir acfg))))
-;;; 	 (db           (sqlite3:open-database fullname))
-;;; 	 (handler      (sqlite3:make-busy-timeout 136000))
-;;; 	 )
-;;;     (sqlite3:set-busy-handler! db handler)
-;;;     (sqlite3:execute db "PRAGMA synchronous = 0;")
-;;;     (if (not exists) ;; need to init the db
-;;; 	(if write-access
-;;; 	    (let ((isql (get-rsql acfg 'dbinitsql))) ;; get the init sql statements
-;;; 	      ;; (sqlite3:with-transaction
-;;; 	      ;;  db
-;;; 	      ;;  (lambda ()
-;;; 		 (if isql
-;;; 		     (for-each
-;;; 		      (lambda (sql)
-;;; 			(sqlite3:execute db sql))
-;;; 		      isql)))
-;;; 	    (print "ERROR: no write access to " (area-dbdir acfg))))
-;;;     (make-dbdat dbh: db fname: fname write-access: write-access)))
-;;; 
-;;; ;; This is a low-level command to retrieve or to prepare, save and return a prepared statment
-;;; ;; you must extract the db handle
-;;; ;;
-;;; (define (get-sth db cache stmt)
-;;;   (if (hash-table-exists? cache stmt)
-;;;       (begin
-;;; 	;; (print "Reusing cached stmt for " stmt)
-;;; 	(hash-table-ref/default cache stmt #f))
-;;;       (let ((sth (sqlite3:prepare db stmt)))
-;;; 	(hash-table-set! cache stmt sth)
-;;; 	;; (print "prepared stmt for " stmt)
-;;; 	sth)))
-;;; 
-;;; ;; a little more expensive but does all the tedious deferencing - only use if you don't already
-;;; ;; have dbdat and db sitting around
-;;; ;;
-;;; (define (full-get-sth acfg fname stmt)
-;;;   (let* ((dbdat  (get-dbh acfg fname))
-;;; 	 (db     (dbdat-dbh dbdat))
-;;; 	 (sths   (dbdat-sths dbdat)))
-;;;     (get-sth db sths stmt)))
-;;; 
-;;; ;; write to a db
-;;; ;; acfg: area data
-;;; ;; rdat: request data
-;;; ;; hdat: (host . port)
-;;; ;;
-;;; ;; (define (dbwrite acfg rdat hdat data-in)
-;;; ;;   (let* ((dbname (car data-in))
-;;; ;; 	 (dbdat  (get-dbh acfg dbname))
-;;; ;; 	 (db     (dbdat-dbh dbdat))
-;;; ;; 	 (sths   (dbdat-sths dbdat))
-;;; ;; 	 (stmt   (calldat-obj rdat))
-;;; ;; 	 (sth    (get-sth db sths stmt))
-;;; ;; 	 (data   (cdr data-in)))
-;;; ;;     (print "dbname: " dbname " acfg: " acfg " rdat: " (calldat->alist rdat) " hdat: " hdat " data: " data)
-;;; ;;     (print "dbdat: " (dbdat->alist dbdat))
-;;; ;;     (apply sqlite3:execute sth data)
-;;; ;;     (save-dbh acfg dbname dbdat)
-;;; ;;     #t
-;;; ;;     ))
-;;; 
-;;; (define (finalize-all-db-handles acfg)
-;;;   (let* ((dbhandles (area-dbhandles acfg))  ;; dbhandles is hash of fname ==> dbdat
-;;; 	 (num       0))
-;;;     (for-each
-;;;      (lambda (area-name)
-;;;        (print "Closing handles for " area-name)
-;;;        (let ((dbdats (hash-table-ref/default dbhandles area-name '())))
-;;; 	 (for-each
-;;; 	  (lambda (dbdat)
-;;; 	    ;; first close all statement handles
-;;; 	    (for-each
-;;; 	     (lambda (sth)
-;;; 	       (sqlite3:finalize! sth)
-;;; 	       (set! num (+ num 1)))
-;;; 	     (hash-table-values (dbdat-sths dbdat)))
-;;; 	    ;; now close the dbh
-;;; 	    (set! num (+ num 1))
-;;; 	    (sqlite3:finalize! (dbdat-dbh dbdat)))
-;;; 	  dbdats)))
-;;;      (hash-table-keys dbhandles))
-;;;     (print "FINALIZED " num " dbhandles")))
-;;; 
-;;; ;;======================================================================
-;;; ;; W O R K   Q U E U E   H A N D L I N G 
-;;; ;;======================================================================
-;;; 
-;;; (define (register-db-as-mine acfg dbname)
-;;;   (let ((ht (area-dbs acfg)))
-;;;     (if (not (hash-table-ref/default ht dbname #f))
-;;; 	(hash-table-set! ht dbname (random 10000)))))
-;;; 	
-;;; (define (work-queue-add acfg fname witem)
-;;;   (let* ((work-queue-start (current-milliseconds))
-;;; 	 (action           (witem-action witem)) ;; NB the action is the index into the rdat actions
-;;; 	 (qdat             (or (hash-table-ref/default (area-wqueues acfg) fname #f)
-;;; 			       (let ((newqdat (make-qdat)))
-;;; 				 (hash-table-set! (area-wqueues acfg) fname newqdat)
-;;; 				 newqdat)))
-;;; 	 (rdat             (hash-table-ref/default (area-rtable acfg) action #f)))
-;;;     (if rdat
-;;; 	(queue-add!
-;;; 	 (case (calldat-ctype rdat)
-;;; 	   ((dbwrite)   (register-db-as-mine acfg fname)(qdat-writeq qdat))
-;;; 	   ((dbread)    (register-db-as-mine acfg fname)(qdat-readq  qdat))
-;;; 	   ((dbrw)      (register-db-as-mine acfg fname)(qdat-rwq    qdat))
-;;; 	   ((oslong)    (qdat-oslong qdat))
-;;; 	   ((osshort)   (qdat-osshort qdat))
-;;; 	   ((full-ping) (qdat-misc  qdat))
-;;; 	   (else
-;;; 	    (print "ERROR: no queue for " action ". Adding to dbwrite queue.")
-;;; 	    (qdat-writeq qdat)))
-;;; 	 witem)
-;;; 	(case action
-;;; 	  ((full-ping)(qdat-misc qdat))
-;;; 	  (else
-;;; 	   (print "ERROR: No action " action " was registered"))))
-;;;     (sdbg> "work-queue-add" "queue-add" work-queue-start #f #f)
-;;;     #t)) ;; for now, simply return #t to indicate request got to the queue
-;;; 
-;;; (define (doqueue acfg q fname dbdat dbh)
-;;;   ;; (print "doqueue: " fname)
-;;;   (let* ((start-time (current-milliseconds))
-;;; 	 (qlen       (queue-length q)))
-;;;     (if (> qlen 1)
-;;; 	(print "Processing queue of length " qlen))
-;;;     (let loop ((count      0)
-;;; 	       (responses '()))
-;;;       (let ((delta (- (current-milliseconds) start-time)))
-;;; 	(if (or (queue-empty? q)
-;;; 		(> delta 400)) ;; stop working on this queue after 400ms have passed
-;;; 	    (list count delta responses) ;; return count, delta and responses list
-;;; 	    (let* ((witem  (queue-remove! q))
-;;; 		   (action (witem-action witem))
-;;; 		   (rdat   (witem-rdat   witem))
-;;; 		   (stmt   (calldat-obj rdat))
-;;; 		   (sth    (full-get-sth acfg fname stmt))
-;;; 		   (ctype  (calldat-ctype rdat))
-;;; 		   (data   (witem-data   witem))
-;;; 		   (cookie (witem-cookie witem)))
-;;; 	      ;; do the processing and save the result in witem-result
-;;; 	      (witem-result-set!
-;;; 	       witem
-;;; 	       (case ctype ;; action
-;;; 		 ((noblockwrite) ;; blind write, no ack of success returned
-;;; 		  (apply sqlite3:execute sth data)
-;;; 		  (sqlite3:last-insert-rowid dbh))
-;;; 		 ((dbwrite)      ;; blocking write   
-;;; 		  (apply sqlite3:execute sth data)
-;;; 		  #t)
-;;; 		 ((dbread) ;; TODO: consider breaking this up and shipping in pieces for large query
-;;; 		  (apply sqlite3:map-row (lambda x x) sth data))
-;;; 		 ((full-ping)  'full-ping)
-;;; 		 (else (print "Not ready for action " action) #f)))
-;;; 	      (loop (add1 count)
-;;; 		    (if cookie
-;;; 			(cons witem responses)
-;;; 			responses))))))))
-;;; 
-;;; ;; do up to 400ms of processing on each queue
-;;; ;; - the work-queue-processor will allow the max 1200ms of work to complete but it will flag as overloaded
-;;; ;; 
-;;; (define (process-db-queries acfg fname)
-;;;   (if (hash-table-exists? (area-wqueues acfg) fname)
-;;;       (let* ((process-db-queries-start-time (current-milliseconds))
-;;; 	     (qdat             (hash-table-ref/default (area-wqueues acfg) fname #f))
-;;; 	     (queue-sym->queue (lambda (queue-sym)
-;;; 				 (case queue-sym  ;; lookup the queue from qdat given a name (symbol)
-;;; 				   ((wqueue)  (qdat-writeq qdat))
-;;; 				   ((rqueue)  (qdat-readq  qdat))
-;;; 				   ((rwqueue) (qdat-rwq    qdat))
-;;; 				   ((misc)    (qdat-misc   qdat))
-;;; 				   (else #f))))
-;;; 	     (dbdat   (get-dbh acfg fname))
-;;; 	     (dbh     (if (dbdat? dbdat)(dbdat-dbh dbdat) #f))
-;;; 	     (nowtime (current-seconds)))
-;;; 	;; handle the queues that require a transaction
-;;; 	;;
-;;; 	(map ;; 
-;;; 	 (lambda (queue-sym)
-;;; 	   ;; (print "processing queue " queue-sym)
-;;; 	   (let* ((queue (queue-sym->queue queue-sym)))
-;;; 	     (if (not (queue-empty? queue))
-;;; 		 (let ((responses
-;;; 			(sqlite3:with-transaction ;; todo - catch exceptions...
-;;; 			 dbh
-;;; 			 (lambda ()
-;;; 			   (let* ((res (doqueue acfg queue fname dbdat dbh))) ;; this does the work!
-;;; 			     ;; (print "res=" res)
-;;; 			     (match res
-;;; 			      ((count delta responses)
-;;; 			       (update-stats acfg fname queue-sym delta count)
-;;; 			       (sdbg> "process-db-queries" "sqlite3-transaction" process-db-queries-start-time #f #f)
-;;; 			       responses) ;; return responses
-;;; 			      (else
-;;; 			       (print "ERROR: bad return data from doqueue " res)))
-;;; 			     )))))
-;;; 		   ;; having completed the transaction, send the responses.
-;;; 		   ;; (print "INFO: sending " (length responses) " responses.")
-;;; 		   (let loop ((responses-left responses))
-;;; 		     (cond
-;;; 		      ((null? responses-left)  #t)
-;;; 		      (else
-;;; 		       (let* ((witem    (car responses-left))
-;;; 			      (response (cdr responses-left)))  
-;;; 			 (call-deliver-response acfg (witem-ripaddr witem)(witem-rport witem)
-;;; 						(witem-cookie witem)(witem-result witem)))
-;;; 		       (loop (cdr responses-left))))))
-;;; 		 )))
-;;; 	 '(wqueue rwqueue rqueue))
-;;; 	
-;;; 	;; handle misc queue
-;;; 	;;
-;;; 	;; (print "processing misc queue")
-;;; 	(let ((queue (queue-sym->queue 'misc)))
-;;; 	  (doqueue acfg queue fname dbdat dbh))
-;;; 	;; ....
-;;; 	(save-dbh acfg fname dbdat)
-;;; 	#t ;; just to let the tests know we got here
-;;; 	)
-;;;       #f ;; nothing processed
-;;;       ))
-;;; 
-;;; ;; run all queues in parallel per db but sequentially per queue for that db.
-;;; ;;  - process the queues every 500 or so ms
-;;; ;;  - allow for long running queries to continue but all other activities for that
-;;; ;;    db will be blocked.
-;;; ;;
-;;; (define (work-queue-processor acfg)
-;;;   (let* ((threads (make-hash-table))) ;; fname => thread
-;;;     (let loop ((fnames      (hash-table-keys (area-wqueues acfg)))
-;;; 	       (target-time (+ (current-milliseconds) 50)))
-;;;       ;;(if (not (null? fnames))(print "Processing for these databases: " fnames))
-;;;       (for-each
-;;;        (lambda (fname)
-;;; 	 ;; (print "processing for " fname)
-;;; 	 ;;(process-db-queries acfg fname))
-;;; 	 (let ((th (hash-table-ref/default threads fname #f)))
-;;; 	   (if (and th (not (member (thread-state th) '(dead terminated))))
-;;; 	       (begin
-;;; 		 (print "WARNING: worker thread for " fname " is taking a long time.")
-;;; 		 (print "Thread is in state " (thread-state th)))
-;;; 	       (let ((th1 (make-thread (lambda ()
-;;; 					 ;; (catch-and-dump
-;;; 					 ;;  (lambda ()
-;;; 					    ;; (print "Process queries for " fname)
-;;; 					    (let ((start-time (current-milliseconds)))
-;;; 					      (process-db-queries acfg fname)
-;;; 					      ;; (thread-sleep! 0.01) ;; need the thread to take at least some time
-;;; 					      (hash-table-delete! threads fname)) ;; no mutexes?
-;;; 					    fname)
-;;; 					  "th1"))) ;; ))
-;;; 		 (hash-table-set! threads fname th1)
-;;; 		 (thread-start! th1)))))
-;;;        fnames)
-;;;       ;; (thread-sleep! 0.1) ;; give the threads some time to process requests
-;;;       ;; burn time until 400ms is up
-;;;       (let ((now-time (current-milliseconds)))
-;;; 	(if (< now-time target-time)
-;;; 	    (let ((delta (- target-time now-time)))
-;;; 	      (thread-sleep! (/ delta 1000)))))
-;;;       (loop (hash-table-keys (area-wqueues acfg))
-;;; 	    (+ (current-milliseconds) 50)))))
-;;; 
-;;; ;;======================================================================
-;;; ;; S T A T S   G A T H E R I N G
-;;; ;;======================================================================
-;;; 
-;;; (defstruct stat
-;;;   (qcount-avg  0)                  ;; coarse running average
-;;;   (qtime-avg   0)                  ;; coarse running average
-;;;   (qcount      0)                  ;; total
-;;;   (qtime       0)                  ;; total
-;;;   (last-qcount 0)                  ;; last 
-;;;   (last-qtime  0)                  ;; last
-;;;   (dbs        '())                 ;; list of db files handled by this node
-;;;   (when        0))                 ;; when the last query happened - seconds
-;;; 
-;;; 
-;;; (define (update-stats acfg fname bucket duration numqueries)
-;;;   (let* ((key   fname) ;; for now do not use bucket. Was: (conc fname "-" bucket)) ;; lazy but good enough
-;;; 	 (stats (or (hash-table-ref/default (area-stats acfg) key #f)
-;;; 		    (let ((newstats (make-stat)))
-;;; 		      (hash-table-set! (area-stats acfg) key newstats)
-;;; 		      newstats))))
-;;;     ;; when the last query happended (used to remove the fname from the active list)
-;;;     (stat-when-set! stats (current-seconds))
-;;;     ;; last values
-;;;     (stat-last-qcount-set! stats numqueries)
-;;;     (stat-last-qtime-set!  stats duration)
-;;;     ;; total over process lifetime
-;;;     (stat-qcount-set! stats (+ (stat-qcount stats) numqueries))
-;;;     (stat-qtime-set!  stats (+ (stat-qtime  stats) duration))
-;;;     ;; coarse average
-;;;     (stat-qcount-avg-set! stats (/ (+ (stat-qcount-avg stats) numqueries) 2))
-;;;     (stat-qtime-avg-set!  stats (/ (+ (stat-qtime-avg  stats) duration)   2))
-;;; 
-;;;     ;; here is where we add the stats for a given dbfile
-;;;     (if (not (member fname (stat-dbs stats)))
-;;; 	(stat-dbs-set! stats (cons fname (stat-dbs stats))))
-;;; 
-;;;     ))
-;;; 
-;;; ;;======================================================================
-;;; ;; S E R V E R   S T U F F 
-;;; ;;======================================================================
-;;; 
-;;; ;; this does NOT return!
-;;; ;;
-;;; (define (find-free-port-and-open acfg)
-;;;   (let ((port (or (area-port acfg) 3200)))
-;;;     (handle-exceptions
-;;; 	exn
-;;; 	(begin
-;;; 	  (print "INFO: cannot bind to port " (rpc:default-server-port) ", trying next port")
-;;; 	  (area-port-set! acfg (+ port 1))
-;;; 	  (find-free-port-and-open acfg))
-;;;       (rpc:default-server-port port)
-;;;       (area-port-set! acfg port)
-;;;       (tcp-read-timeout 120000)
-;;;       ;; ((rpc:make-server (tcp-listen port)) #t)
-;;;       (tcp-listen (rpc:default-server-port)
-;;;       ))))
-;;; 
-;;; ;; register this node by putting a packet into the pkts dir.
-;;; ;; look for other servers
-;;; ;; contact other servers and compile list of servers
-;;; ;; there are two types of server
-;;; ;;     main servers - dashboards, runners and dedicated servers - need pkt
-;;; ;;     passive servers - test executers, step calls, list-runs - no pkt
-;;; ;;
-;;; (define (register-node acfg hostip port-num)
-;;;   ;;(mutex-lock! (area-mutex acfg))
-;;;   (let* ((server-type  (area-server-type acfg)) ;; auto, main, passive (no pkt created)
-;;; 	 (best-ip      (or hostip (get-my-best-address)))
-;;; 	 (mtdir        (area-dbdir acfg))
-;;; 	 (pktdir       (area-pktsdir acfg))) ;; conc mtdir "/.server-pkts")))
-;;;     (print "Registering node " best-ip ":" port-num)
-;;;     (if (not mtdir) ;; require a home for this node to put or find databases
-;;; 	#f
-;;; 	(begin
-;;; 	  (if  (not (directory? pktdir))(create-directory pktdir))
-;;; 	  ;; server is started, now create pkt if needed
-;;; 	  (print "Starting server in " server-type " mode with port " port-num)
-;;; 	  (if (member server-type '(auto main)) ;; TODO: if auto, count number of servers registers, if > 3 then don't put out a pkt
-;;; 	      (begin
-;;; 		(area-pktid-set! acfg
-;;; 				 (write-alist->pkt
-;;; 				  pktdir 
-;;; 				  `((hostname . ,(get-host-name))
-;;; 				    (ipaddr   . ,best-ip)
-;;; 				    (port     . ,port-num)
-;;; 				    (pid      . ,(current-process-id)))
-;;; 				  pktspec: *pktspec*
-;;; 				  ptype:   'server))
-;;; 		(area-pktfile-set! acfg (conc pktdir "/" (area-pktid acfg) ".pkt"))))
-;;; 	  (area-port-set!    acfg port-num)
-;;; 	  #;(mutex-unlock! (area-mutex acfg))))))
-;;; 
-;;; (define *cookie-seqnum* 0)
-;;; (define (make-cookie key)
-;;;   (set! *cookie-seqnum* (add1 *cookie-seqnum*))
-;;;   ;;(print "MAKE COOKIE CALLED -- on "servkey"-"*cookie-seqnum*)
-;;;   (conc key "-" *cookie-seqnum*)
-;;;   )
-;;; 
-;;; ;; dispatch locally if possible
-;;; ;;
-;;; (define (call-deliver-response acfg ipaddr port cookie data)
-;;;   (if (and (equal? (area-myaddr acfg) ipaddr)
-;;; 	   (equal? (area-port     acfg) port))
-;;;       (deliver-response acfg cookie data)
-;;;       ((rpc:procedure 'response ipaddr port) cookie data)))
-;;; 
-;;; (define (deliver-response acfg cookie data)
-;;;   (let ((deliver-response-start (current-milliseconds)))
-;;;     (thread-start! (make-thread
-;;; 		    (lambda ()
-;;; 		      (let loop ((tries-left 5))
-;;; 			;;(print "TOP OF DELIVER_RESPONSE LOOP; triesleft="tries-left)
-;;; 			;;(pp (hash-table->alist (area-cookie2mbox acfg)))
-;;; 			(let* ((mbox (hash-table-ref/default (area-cookie2mbox acfg) cookie #f)))
-;;; 			  (cond
-;;; 			   ((eq? 0 tries-left)
-;;; 			    (print "ulex:deliver-response: I give up. Mailbox never appeared. cookie="cookie)
-;;; 			    )
-;;; 			   (mbox
-;;; 			    ;;(print "got mbox="mbox"  got data="data"  send.")
-;;; 			    (mailbox-send! mbox data))
-;;; 			   (else
-;;; 			    ;;(print "no mbox yet.  look for "cookie)
-;;; 			    (thread-sleep! (/ (- 6 tries-left) 10))
-;;; 			    (loop (sub1 tries-left))))))
-;;; 		      ;; (debug-pp (list (conc "ulex:deliver-response took " (- (current-milliseconds) deliver-response-start) " ms, cookie=" cookie " data=") data))
-;;; 		      (sdbg> "deliver-response" "mailbox-send" deliver-response-start #f #f cookie)
-;;; 		      )
-;;; 		    (conc "deliver-response thread for cookie="cookie))))
-;;;   #t)
-;;; 
-;;; ;; action:
-;;; ;;   immediate - quick actions, no need to put in queues
-;;; ;;   dbwrite   - put in dbwrite queue
-;;; ;;   dbread    - put in dbread queue
-;;; ;;   oslong    - os actions, e.g. du, that could take a long time
-;;; ;;   osshort   - os actions that should be quick, e.g. df
-;;; ;;
-;;; (define (request acfg from-ipaddr from-port servkey action cookie fname params) ;; std-peer-handler
-;;;   ;; NOTE: Use rpc:current-peer for getting return address
-;;;   (let* ((std-peer-handler-start (current-milliseconds))
-;;; 	 ;; (raw-data               (alist-ref 'data     dat))
-;;; 	 (rdat                   (hash-table-ref/default
-;;; 				  (area-rtable acfg) action #f)) ;; this looks up the sql query or other details indexed by the action
-;;; 	 (witem                  (make-witem ripaddr: from-ipaddr ;; rhost:   from-host   
-;;; 					     rport:   from-port   action:  action
-;;; 					     rdat:    rdat        cookie:  cookie
-;;; 					     servkey: servkey     data:    params ;; TODO - rename data to params
-;;; 					     caller:  (rpc:current-peer))))
-;;;     (if (not (equal? servkey (area-pktid acfg)))
-;;; 	`(#f . ,(conc "I don't know you servkey=" servkey ", pktid=" (area-pktid acfg))) ;; immediately return this
-;;; 	(let* ((ctype (if rdat 
-;;; 			  (calldat-ctype rdat) ;; is this necessary? these should be identical
-;;; 			  action)))
-;;; 	  (sdbg> "std-peer-handler" "immediate" std-peer-handler-start #f #f)
-;;; 	  (case ctype
-;;; 	    ;; (dbwrite acfg rdat (cons from-ipaddr from-port) data)))
-;;; 	    ((full-ping)  `(#t  "ack to full ping"        ,(work-queue-add acfg fname witem) ,cookie))
-;;; 	    ((response)   `(#t  "ack from requestor"      ,(deliver-response acfg fname params)))
-;;; 	    ((dbwrite)    `(#t  "db write submitted"      ,(work-queue-add acfg fname witem) ,cookie))
-;;; 	    ((dbread)     `(#t  "db read submitted"       ,(work-queue-add acfg fname witem) ,cookie  ))
-;;; 	    ((dbrw)       `(#t  "db read/write submitted" ,cookie))
-;;; 	    ((osshort)    `(#t  "os short submitted"      ,cookie))
-;;; 	    ((oslong)     `(#t  "os long submitted"       ,cookie))
-;;; 	    (else         `(#f  "unrecognised action"     ,ctype)))))))
-;;; 
-;;; ;; Call this to start the actual server
-;;; ;;
-;;; ;; start_server
-;;; ;;
-;;; ;;   mode: '
-;;; ;;   handler: proc which takes pktrecieved as argument
-;;; ;;
-;;; 
-;;; (define (start-server acfg)
-;;;   (let* ((conn (find-free-port-and-open acfg))
-;;; 	 (port (area-port acfg)))
-;;;     (rpc:publish-procedure!
-;;;      'delist-db
-;;;      (lambda (fname)
-;;;        (hash-table-delete! (area-dbs acfg) fname)))
-;;;     (rpc:publish-procedure!
-;;;      'calling-addr
-;;;      (lambda ()
-;;;        (rpc:current-peer)))
-;;;     (rpc:publish-procedure!
-;;;      'ping
-;;;      (lambda ()(real-ping acfg)))
-;;;     (rpc:publish-procedure!
-;;;      'request
-;;;      (lambda (from-addr from-port servkey action cookie dbname params)
-;;;        (request acfg from-addr from-port servkey action cookie dbname params)))
-;;;     (rpc:publish-procedure!
-;;;      'response
-;;;      (lambda (cookie res-dat)
-;;;        (deliver-response acfg cookie res-dat)))
-;;;     (area-ready-set! acfg #t)
-;;;     (area-conn-set! acfg conn)
-;;;     ((rpc:make-server conn) #f)));; ((tcp-listen (rpc:default-server-port)) #t)
-;;; 
-;;; 
-;;; (define (launch acfg) ;;  #!optional (proc std-peer-handler))
-;;;   (print "starting launch")
-;;;   (update-known-servers acfg) ;; gotta do this on every start (thus why limit number of publicised servers)
-;;;   #;(let ((original-handler (current-exception-handler))) ;; is th
-;;;     (lambda (exception)
-;;;       (server-exit-procedure)
-;;;       (original-handler exception)))
-;;;   (on-exit (lambda ()
-;;; 	     (shutdown acfg))) ;; (finalize-all-db-handles acfg)))
-;;;   ;; set up the rpc handler
-;;;   (let* ((th1  (make-thread
-;;; 		(lambda ()(start-server acfg))
-;;; 		"server thread"))
-;;; 	 (th2   (make-thread
-;;; 		 (lambda ()
-;;; 		   (print "th2 starting")
-;;; 		   (let loop ()
-;;; 		     (work-queue-processor acfg)
-;;; 		     (print "work-queue-processor crashed!")
-;;; 		     (loop)))
-;;; 		 "work queue thread")))
-;;;     (thread-start! th1)
-;;;     (thread-start! th2)
-;;;     (let loop ()
-;;;       (thread-sleep! 0.025)
-;;;       (if (area-ready acfg)
-;;; 	  #t
-;;; 	  (loop)))
-;;;     ;; attempt to fix my address
-;;;     (let* ((all-addr (get-all-ips-sorted)))	     ;; could use (tcp-addresses conn)?
-;;;       (let loop ((rem-addrs all-addr))
-;;; 	(if (null? rem-addrs)
-;;; 	    (begin
-;;; 	      (print "ERROR: Failed to figure out the ip address of myself as a server. Giving up.")
-;;; 	      (exit 1)) ;; BUG Changeme to raising an exception
-;;; 		
-;;; 	    (let* ((addr      (car rem-addrs))
-;;; 		   (good-addr (handle-exceptions
-;;; 				  exn
-;;; 				  #f
-;;; 				((rpc:procedure 'calling-addr addr (area-port acfg))))))
-;;; 	      (if good-addr
-;;; 		  (begin
-;;; 		    (print "Got good-addr of " good-addr)
-;;; 		    (area-myaddr-set! acfg good-addr))
-;;; 		  (loop (cdr rem-addrs)))))))
-;;;     (register-node acfg (area-myaddr acfg)(area-port acfg))
-;;;     (print "INFO: Server started on " (area-myaddr acfg) ":" (area-port acfg))
-;;;     ;; (update-known-servers acfg) ;; gotta do this on every start (thus why limit number of publicised servers)
-;;;     ))
-;;; 
-;;; (define (clear-server-pkt acfg)
-;;;   (let ((pktf (area-pktfile acfg)))
-;;;     (if pktf (delete-file* pktf))))
-;;; 
-;;; (define (shutdown acfg)
-;;;   (let (;;(conn (area-conn    acfg))
-;;; 	(pktf (area-pktfile acfg))
-;;; 	(port (area-port    acfg)))
-;;;     (if pktf (delete-file* pktf))
-;;;     (send-all "imshuttingdown")
-;;;     ;; (rpc:close-all-connections!) ;; don't know if this is actually needed
-;;;     (finalize-all-db-handles acfg)))
-;;; 
-;;; (define (send-all msg)
-;;;   #f)
-;;; 
-;;; ;; given a area record look up all the packets
-;;; ;;
-;;; (define (get-all-server-pkts acfg)
-;;;   (let ((all-pkt-files (glob (conc (area-pktsdir acfg) "/*.pkt"))))
-;;;     (map (lambda (pkt-file)
-;;; 	   (read-pkt->alist pkt-file pktspec: *pktspec*))
-;;; 	 all-pkt-files)))
-;;; 
-;;; #;((Z . "9a0212302295a19610d5796fce0370fa130758e9")
-;;;   (port . "34827")
-;;;   (pid . "28748")
-;;;   (hostname . "zeus")
-;;;   (T . "server")
-;;;   (D . "1549427032.0"))
-;;; 
-;;; #;(define (get-my-best-address)
-;;;   (let ((all-my-addresses (get-all-ips))) ;; (vector->list (hostinfo-addresses (hostname->hostinfo (get-host-name))))))
-;;;     (cond
-;;;      ((null? all-my-addresses)
-;;;       (get-host-name))                                          ;; no interfaces?
-;;;      ((eq? (length all-my-addresses) 1)
-;;;       (ip->string (car all-my-addresses)))                      ;; only one to choose from, just go with it
-;;;      (else 
-;;;       (ip->string (car (filter (lambda (x)                      ;; take any but 127.
-;;; 				 (not (eq? (u8vector-ref x 0) 127)))
-;;; 			       all-my-addresses)))))))
-;;; 
-;;; ;; whoami? I am my pkt
-;;; ;;
-;;; (define (whoami? acfg)
-;;;   (hash-table-ref/default (area-hosts acfg)(area-pktid acfg) #f))
-;;; 
-;;; ;;======================================================================
-;;; ;; "Client side" operations
-;;; ;;======================================================================
-;;; 
-;;; (define (safe-call call-key host port . params)
-;;;   (handle-exceptions
-;;;    exn
-;;;    (begin
-;;;      (print "Call " call-key " to " host ":" port " failed")
-;;;      #f)
-;;;    (apply (rpc:procedure call-key host port) params)))
-;;; 
-;;; ;; ;; convert to/from string / sexpr
-;;; ;; 
-;;; ;; (define (string->sexpr str)
-;;; ;;   (if (string? str)
-;;; ;;       (with-input-from-string str read)
-;;; ;;       str))
-;;; ;; 
-;;; ;; (define (sexpr->string s)
-;;; ;;   (with-output-to-string (lambda ()(write s))))
-;;; 
-;;; ;; is the server alive?
-;;; ;;
-;;; (define (ping acfg host port)
-;;;   (let* ((myaddr     (area-myaddr acfg))
-;;; 	 (myport     (area-port   acfg))
-;;; 	 (start-time (current-milliseconds))
-;;; 	 (res        (if (and (equal? myaddr host)
-;;; 			      (equal? myport port))
-;;; 			 (real-ping acfg)
-;;; 			 ((rpc:procedure 'ping host port)))))
-;;;     (cons (- (current-milliseconds) start-time)
-;;; 	  res)))
-;;; 
-;;; ;; returns ( ipaddr port alist-fname=>randnum )
-;;; (define (real-ping acfg)
-;;;   `(,(area-myaddr acfg) ,(area-port acfg) ,(get-host-stats acfg)))
-;;; 
-;;; ;; is the server alive AND the queues processing?
-;;; ;;
-;;; #;(define (full-ping acfg servpkt)
-;;;   (let* ((start-time (current-milliseconds))
-;;; 	 (res        (send-message acfg servpkt '(full-ping) 'full-ping)))
-;;;     (cons (- (current-milliseconds) start-time)
-;;; 	  res))) ;; (equal? res "got ping"))))
-;;; 
-;;; 
-;;; ;; look up all pkts and get the server id (the hash), port, host/ip
-;;; ;; store this info in acfg
-;;; ;; return the number of responsive servers found
-;;; ;;
-;;; ;; DO NOT VERIFY THAT THE SERVER IS ALIVE HERE. This is called at times where the current server is not yet alive and cannot ping itself
-;;; ;;
-;;; (define (update-known-servers acfg)
-;;;   ;; readll all pkts
-;;;   ;; foreach pkt; if it isn't me ping the server; if alive, add to hosts hash, else rm the pkt
-;;;   (let* ((start-time (current-milliseconds))
-;;; 	 (all-pkts  (delete-duplicates
-;;; 		     (append (get-all-server-pkts acfg)
-;;; 			     (hash-table-values (area-hosts acfg)))))
-;;; 	 (hostshash (area-hosts acfg))
-;;; 	 (my-id     (area-pktid acfg))
-;;; 	 (pktsdir   (area-pktsdir acfg)) ;; needed to remove pkts from non-responsive servers
-;;; 	 (numsrvs   0)
-;;; 	 (delpkt    (lambda (pktsdir sid)
-;;; 		      (print "clearing out server " sid)
-;;; 		      (delete-file* (conc pktsdir "/" sid ".pkt"))
-;;; 		      (hash-table-delete! hostshash sid))))
-;;;     (area-last-srvup-set! acfg (current-seconds))
-;;;     (for-each
-;;;      (lambda (servpkt)
-;;;        (if (list? servpkt)
-;;; 	   ;; (pp servpkt)
-;;; 	   (let* ((shost (alist-ref 'ipaddr servpkt))
-;;; 		  (sport (any->number (alist-ref 'port servpkt)))
-;;; 		  (res   (handle-exceptions
-;;; 			  exn
-;;; 			  (begin
-;;; 			    ;; (print "INFO: bad server on " shost ":" sport)
-;;; 			    #f)
-;;; 			  (ping acfg shost sport)))
-;;; 		  (sid   (alist-ref 'Z servpkt)) ;; Z code is our name for the server
-;;; 		  (url   (conc shost ":" sport))
-;;; 		  )
-;;; 	     #;(if (or (not res)
-;;; 		     (null? res))
-;;; 		 (begin
-;;; 		   (print "STRANGE: ping of " url " gave " res)))
-;;; 	     
-;;; 	     ;; (print "Got " res " from " shost ":" sport)
-;;; 	     (match res
-;;; 		    ((qduration . payload)
-;;; 		     ;; (print "Server pkt:" (alist-ref 'ipaddr servpkt) ":" (alist-ref 'port servpkt)
-;;; 		     ;;        (if payload
-;;; 		     ;;            "Success" "Fail"))
-;;; 		     (match payload
-;;; 			    ((host port stats)
-;;; 			     ;; (print "From " host ":" port " got stats: " stats)
-;;; 			     (if (and host port stats)
-;;; 				 (let ((url (conc host ":" port)))
-;;; 				   (hash-table-set! hostshash sid servpkt)
-;;; 				   ;; store based on host:port
-;;; 				   (hash-table-set! (area-hoststats acfg) sid stats))
-;;; 				 (print "missing data from the server, not sure what that means!"))
-;;; 			     (set! numsrvs (+ numsrvs 1)))
-;;; 			    (#f
-;;; 			     (print "Removing pkt " sid " due to #f from server or failed ping")
-;;; 			     (delpkt pktsdir sid))
-;;; 			    (else
-;;; 			     (print "Got ")(pp res)(print " from server ")(pp servpkt) " but response did not match (#f/#t . msg)")))
-;;; 		    (else
-;;; 		     ;; here we delete the pkt - can't reach the server, remove it
-;;; 		     ;; however this logic is inadequate. we should mark the server as checked
-;;; 		     ;; and not good, if it happens a second time - then remove the pkt
-;;; 		     ;; or something similar. I.e. don't be too quick to assume the server is wedged or dead
-;;; 		     ;; could be it is simply too busy to reply
-;;; 		     (let ((bad-pings (hash-table-ref/default (area-health acfg) url 0)))
-;;; 		       (if (> bad-pings 1) ;; two bad pings - remove pkt
-;;; 			   (begin
-;;; 			     (print "INFO: " bad-pings " bad responses from " url ", deleting pkt " sid)
-;;; 			     (delpkt pktsdir sid))
-;;; 			   (begin
-;;; 			     (print "INFO: " bad-pings " bad responses from " shost ":" sport " not deleting pkt yet")
-;;; 			     (hash-table-set! (area-health acfg)
-;;; 					      url
-;;; 					      (+ (hash-table-ref/default (area-health acfg) url 0) 1))
-;;; 			     ))
-;;; 		       ))))
-;;; 	   ;; servpkt is not actually a pkt?
-;;; 	   (begin
-;;; 	     (print "Bad pkt " servpkt))))
-;;;      all-pkts)
-;;;     (sdbg> "update-known-servers" "end" start-time #f #f " found " numsrvs
-;;; 	   " servers, pkts: " (map (lambda (p)
-;;; 				     (alist-ref 'Z p))
-;;; 				   all-pkts))
-;;;     numsrvs))
-;;; 
-;;; (defstruct srvstat
-;;;   (numfiles 0)   ;; number of db files handled by this server - subtract 1 for the db being currently looked at
-;;;   (randnum  #f)  ;; tie breaker number assigned to by the server itself - applies only to the db under consideration
-;;;   (pkt      #f)) ;; the server pkt
-;;; 
-;;; ;;(define (srv->srvstat srvpkt)
-;;;   
-;;; ;; Get the server best for given dbname and key
-;;; ;;
-;;; ;;   NOTE: key is not currently used. The key points to the kind of query, this may be useful for directing read-only queries.
-;;; ;;
-;;; (define (get-best-server acfg dbname key)
-;;;   (let* (;; (servers (hash-table-values (area-hosts acfg)))
-;;; 	 (servers     (area-hosts acfg))
-;;; 	 (skeys       (sort (hash-table-keys servers) string>=?)) ;; a stable listing
-;;; 	 (start-time  (current-milliseconds))
-;;; 	 (srvstats    (make-hash-table))  ;; srvid => srvstat
-;;; 	 (url         (conc (area-myaddr acfg) ":" (area-port acfg))))
-;;;     ;; (print "scores for " dbname ": " (map (lambda (k)(cons k (calc-server-score acfg dbname k))) skeys))
-;;;     (if (null? skeys)
-;;; 	(if (> (update-known-servers acfg) 0)
-;;; 	    (get-best-server acfg dbname key) ;; some risk of infinite loop here, TODO add try counter
-;;; 	    (begin
-;;; 	      (print "ERROR: no server found!") ;; since this process is also a server this should never happen
-;;; 	      #f))
-;;; 	(begin
-;;; 	  ;; (print "in get-best-server with skeys=" skeys)
-;;; 	  (if (> (- (current-seconds) (area-last-srvup acfg)) 10)
-;;; 	      (begin
-;;; 		(update-known-servers acfg)
-;;; 		(sdbg> "get-best-server" "update-known-servers" start-time #f #f)))
-;;; 
-;;; 	  ;; for each server look at the list of dbfiles, total number of dbs being handled
-;;; 	  ;; and the rand number, save the best host
-;;; 	  ;; also do a delist-db for each server dbfile not used
-;;; 	  (let* ((best-server       #f)
-;;; 		 (servers-to-delist (make-hash-table)))
-;;; 	    (for-each
-;;; 	     (lambda (srvid)
-;;; 	       (let* ((server    (hash-table-ref/default servers srvid #f))
-;;; 		      (stats     (hash-table-ref/default (area-hoststats acfg) srvid '(()))))
-;;; 		 ;; (print "stats: " stats)
-;;;  		 (if server
-;;; 		     (let* ((dbweights (car stats))
-;;; 			    (srvload   (length (filter (lambda (x)(not (equal? dbname (car x)))) dbweights)))
-;;; 			    (dbrec     (alist-ref dbname dbweights equal?))  ;; get the pair with fname . randscore
-;;; 			    (randnum   (if dbrec
-;;; 					   dbrec ;; (cdr dbrec)
-;;; 					   0)))
-;;; 		       (hash-table-set! srvstats srvid (make-srvstat numfiles: srvload randnum: randnum pkt: server))))))
-;;; 	     skeys)
-;;; 	    
-;;; 	    (let* ((sorted    (sort (hash-table-values srvstats) 
-;;; 				    (lambda (a b)
-;;; 				      (let ((numfiles-a (srvstat-numfiles a))
-;;; 					    (numfiles-b (srvstat-numfiles b))
-;;; 					    (randnum-a  (srvstat-randnum a))
-;;; 					    (randnum-b  (srvstat-randnum b)))
-;;; 					(if (< numfiles-a numfiles-b) ;; Note, I don't think adding an offset works here. Goal was only move file handling to a different server if it has 2 less
-;;; 					    #t
-;;; 					    (if (and (equal? numfiles-a numfiles-b)
-;;; 						     (< randnum-a randnum-b))
-;;; 						#t
-;;; 						#f))))))
-;;; 		   (best      (if (null? sorted)
-;;; 				  (begin
-;;; 				    (print "ERROR: should never be null due to self as server.")
-;;; 				    #f)
-;;; 				  (srvstat-pkt (car sorted)))))
-;;; 	      #;(print "SERVER(" url "): " dbname ": " (map (lambda (srv)
-;;; 							    (let ((p (srvstat-pkt srv)))
-;;; 							      (conc (alist-ref 'ipaddr p) ":" (alist-ref 'port p)
-;;; 								    "(" (srvstat-numfiles srv)","(srvstat-randnum srv)")")))
-;;; 							    sorted))
-;;; 	      best))))))
-;;;     
-;;;     ;; send out an "I'm about to exit notice to all known servers"
-;;;     ;;
-;;; (define (death-imminent acfg)
-;;;   '())
-;;; 
-;;; ;;======================================================================
-;;; ;; U L E X  -  T H E   I N T E R E S T I N G   S T U F F ! !
-;;; ;;======================================================================
-;;; 
-;;; ;; register a handler
-;;; ;;   NOTES:
-;;; ;;     dbinitsql   is reserved for a list of sql statements for initializing the db
-;;; ;;     dbinitfn    is reserved for a db init function, if exists called after dbinitsql
-;;; ;;     
-;;; (define (register acfg key obj #!optional (ctype 'dbwrite))
-;;;   (let ((ht (area-rtable acfg)))
-;;;     (if (hash-table-exists? ht key)
-;;; 	(print "WARNING: redefinition of entry " key))
-;;;     (hash-table-set! ht key (make-calldat obj: obj ctype: ctype))))
-;;; 
-;;; ;; usage: register-batch acfg '((key1 . sql1) (key2 . sql2) ... )
-;;; ;; NB// obj is often an sql query
-;;; ;;
-;;; (define (register-batch acfg ctype data)
-;;;   (let ((ht (area-rtable acfg)))
-;;;     (map (lambda (dat)
-;;; 	   (hash-table-set! ht (car dat)(make-calldat obj: (cdr dat) ctype: ctype)))
-;;; 	 data)))
-;;; 
-;;; (define (initialize-area-calls-from-specfile area specfile)
-;;;   (let* ((callspec (with-input-from-file specfile read )))
-;;;     (for-each (lambda (group)
-;;;                 (register-batch
-;;;                  area
-;;;                  (car group)
-;;;                  (cdr group)))
-;;;               callspec)))
-;;; 
-;;; ;; get-rentry
-;;; ;;
-;;; (define (get-rentry acfg key)
-;;;   (hash-table-ref/default (area-rtable acfg) key #f))
-;;; 
-;;; (define (get-rsql acfg key)
-;;;   (let ((cdat (get-rentry acfg key)))
-;;;     (if cdat
-;;; 	(calldat-obj cdat)
-;;; 	#f)))
-;;; 
-;;; 
-;;; 
-;;; ;; blocking call:
-;;; ;;    client                         server
-;;; ;;    ------                         ------
-;;; ;;    call()
-;;; ;;    send-message()
-;;; ;;    nmsg-send()
-;;; ;;                                   nmsg-receive()
-;;; ;;                                   nmsg-respond(ack,cookie)
-;;; ;;    ack, cookie
-;;; ;;    mbox-thread-wait(cookie)
-;;; ;;                                   nmsg-send(client,cookie,result)
-;;; ;;        nmsg-respond(ack)
-;;; ;;        return result
-;;; ;;
-;;; ;; reserved action:
-;;; ;;    'immediate
-;;; ;;    'dbinitsql
-;;; ;;
-;;; (define (call acfg dbname action params #!optional (count 0))
-;;;   (let* ((call-start-time     (current-milliseconds))
-;;; 	 (srv                 (get-best-server acfg dbname action))
-;;; 	 (post-get-start-time (current-milliseconds))
-;;; 	 (rdat                (hash-table-ref/default (area-rtable acfg) action #f))
-;;; 	 (myid                (trim-pktid (area-pktid acfg)))
-;;; 	 (srvid               (trim-pktid (alist-ref 'Z srv)))
-;;; 	 (cookie              (make-cookie myid)))
-;;;     (sdbg> "call" "get-best-server" call-start-time #f call-start-time " from: " myid " to server: " srvid " for " dbname " action: " action " params: " params " rdat: " rdat)
-;;;     (print "INFO: call to " (alist-ref 'ipaddr srv) ":" (alist-ref 'port srv) " from " (area-myaddr acfg) ":" (area-port acfg) " for " dbname)
-;;;     (if (and srv rdat) ;; need both to dispatch a request
-;;; 	(let* ((ripaddr  (alist-ref 'ipaddr srv))
-;;; 	       (rsrvid   (alist-ref 'Z srv))
-;;; 	       (rport    (any->number (alist-ref 'port   srv)))
-;;; 	       (res-full (if (and (equal? ripaddr (area-myaddr acfg))
-;;; 				  (equal? rport   (area-port acfg)))
-;;; 			     (request acfg ripaddr rport (area-pktid acfg) action cookie dbname params)
-;;; 			     (safe-call 'request ripaddr rport
-;;; 					(area-myaddr acfg)
-;;; 					(area-port   acfg)
-;;; 					#;(area-pktid acfg)
-;;; 					rsrvid
-;;; 					action cookie dbname params))))
-;;; 	  ;; (print "res-full: " res-full)
-;;; 	  (match res-full
-;;; 	    ((response-ok response-msg rem ...)
-;;; 	     (let* ((send-message-time (current-milliseconds))
-;;; 		    ;; (match res-full
-;;; 		    ;;  ((response-ok response-msg)
-;;; 		    ;; (response-ok  (car res-full))
-;;; 		    ;; (response-msg (cadr res-full)
-;;; 		    )
-;;; 	       ;; (res (take res-full 3))) ;; ctype == action, TODO: converge on one term <<=== what was this? BUG 
-;;; 	       ;; (print "ulex:call: send-message took " (- send-message-time post-get-start-time) " ms params=" params)
-;;; 	       (sdbg> "call" "send-message" post-get-start-time #f call-start-time)
-;;; 	       (cond
-;;; 		((not response-ok) #f)
-;;; 		((member response-msg '("db read submitted" "db write submitted"))
-;;; 		 (let* ((cookie-id   (cadddr res-full))
-;;; 			(mbox        (make-mailbox))
-;;; 			(mbox-time   (current-milliseconds)))
-;;; 		   (hash-table-set! (area-cookie2mbox acfg) cookie-id mbox)
-;;; 		   (let* ((mbox-timeout-secs    20)
-;;; 			  (mbox-timeout-result 'MBOX_TIMEOUT)
-;;; 			  (res                  (mailbox-receive! mbox mbox-timeout-secs mbox-timeout-result))
-;;; 			  (mbox-receive-time    (current-milliseconds)))
-;;; 		     (hash-table-delete! (area-cookie2mbox acfg) cookie-id)
-;;; 		     (sdbg> "call" "mailbox-receive" mbox-time #f call-start-time " from: " myid " to server: " srvid " for " dbname)
-;;; 		     ;; (print "ulex:call mailbox-receive took " (- mbox-receive-time mbox-time) "ms params=" params)
-;;; 		     res)))
-;;; 		(else
-;;; 		 (print "Unhandled response \""response-msg"\"")
-;;; 		 #f))
-;;; 	       ;; depending on what action (i.e. ctype) is we will block here waiting for
-;;; 	       ;; all the data (mechanism to be determined)
-;;; 	       ;;
-;;; 	       ;; if res is a "working on it" then wait
-;;; 	       ;;    wait for result
-;;; 	       ;; mailbox thread wait on 
-;;; 	       
-;;; 	       ;; if res is a "can't help you" then try a different server
-;;; 	       ;; if res is a "ack" (e.g. for one-shot requests) then return res
-;;; 	       ))
-;;; 	    (else
-;;; 	     (if (< count 10)
-;;; 		 (let* ((url (conc (alist-ref 'ipaddr srv) ":" (alist-ref 'port srv))))
-;;; 		   (thread-sleep! 1)
-;;; 		   (print "ERROR: Bad result from " url ", dbname: " dbname ", action: " action ", params: " params ". Trying again in 1 second.")
-;;; 		   (call acfg dbname action params (+ count 1)))
-;;; 		 (begin
-;;; 		   (error (conc "ERROR: " count " tries, still have improper response res-full=" res-full)))))))
-;;; 	(begin
-;;; 	  (if (not rdat)
-;;; 	      (print "ERROR: action " action " not registered.")
-;;; 	      (if (< count 10)
-;;; 		 (begin
-;;; 		   (thread-sleep! 1)
-;;; 		   (area-hosts-set! acfg (make-hash-table)) ;; clear out all known hosts
-;;; 		   (print "ERROR: no server found, srv=" srv ", trying again in 1 seconds")
-;;; 		   (call acfg dbname action params (+ count 1)))
-;;; 		 (begin
-;;; 		   (error (conc "ERROR: no server found after 10 tries, srv=" srv ", giving up."))
-;;; 		   #;(error "No server available"))))))))
-;;; 
-;;; 
-;;; ;;======================================================================
-;;; ;; U T I L I T I E S 
-;;; ;;======================================================================
-;;; 
-;;; ;; get a signature for identifing this process
-;;; ;;
-;;; (define (get-process-signature)
-;;;   (cons (get-host-name)(current-process-id)))
-;;; 
-;;; ;;======================================================================
-;;; ;; S Y S T E M   S T U F F
-;;; ;;======================================================================
-;;; 
-;;; ;; get normalized cpu load by reading from /proc/loadavg and
-;;; ;; /proc/cpuinfo return all three values and the number of real cpus
-;;; ;; and the number of threads returns alist '((adj-cpu-load
-;;; ;; . normalized-proc-load) ... etc.  keys: adj-proc-load,
-;;; ;; adj-core-load, 1m-load, 5m-load, 15m-load
-;;; ;;
-;;; (define (get-normalized-cpu-load)
-;;;   (let ((res (get-normalized-cpu-load-raw))
-;;; 	(default `((adj-proc-load . 2) ;; there is no right answer
-;;; 		   (adj-core-load . 2)
-;;; 		   (1m-load       . 2)
-;;; 		   (5m-load       . 0) ;; causes a large delta - thus causing default of throttling if stuff goes wrong
-;;; 		   (15m-load      . 0)
-;;; 		   (proc          . 1)
-;;; 		   (core          . 1)
-;;; 		   (phys          . 1)
-;;; 		   (error         . #t))))
-;;;     (cond
-;;;      ((and (list? res)
-;;; 	   (> (length res) 2))
-;;;       res)
-;;;      ((eq? res #f)   default) ;; add messages?
-;;;      ((eq? res #f) default)   ;; this would be the #eof
-;;;      (else default))))
-;;; 
-;;; (define (get-normalized-cpu-load-raw)
-;;;   (let* ((actual-host           (get-host-name))) ;; #f is localhost
-;;;     (let ((data  (append 
-;;; 		  (with-input-from-file "/proc/loadavg" read-lines)
-;;; 		  (with-input-from-file "/proc/cpuinfo" read-lines)
-;;; 		  (list "end")))
-;;; 	  (load-rx  (regexp "^([\\d\\.]+)\\s+([\\d\\.]+)\\s+([\\d\\.]+)\\s+.*$"))
-;;; 	  (proc-rx  (regexp "^processor\\s+:\\s+(\\d+)\\s*$"))
-;;; 	  (core-rx  (regexp "^core id\\s+:\\s+(\\d+)\\s*$"))
-;;; 	  (phys-rx  (regexp "^physical id\\s+:\\s+(\\d+)\\s*$"))
-;;; 	  (max-num  (lambda (p n)(max (string->number p) n))))
-;;;       ;; (print "data=" data)
-;;;       (if (null? data) ;; something went wrong
-;;; 	  #f
-;;; 	  (let loop ((hed      (car data))
-;;; 		     (tal      (cdr data))
-;;; 		     (loads    #f)
-;;; 		     (proc-num 0)  ;; processor includes threads
-;;; 		     (phys-num 0)  ;; physical chip on motherboard
-;;; 		     (core-num 0)) ;; core
-;;; 	    ;; (print hed ", " loads ", " proc-num ", " phys-num ", " core-num)
-;;; 	    (if (null? tal) ;; have all our data, calculate normalized load and return result
-;;; 		(let* ((act-proc (+ proc-num 1))
-;;; 		       (act-phys (+ phys-num 1))
-;;; 		       (act-core (+ core-num 1))
-;;; 		       (adj-proc-load (/ (car loads) act-proc))
-;;; 		       (adj-core-load (/ (car loads) act-core))
-;;; 		       (result
-;;; 			(append (list (cons 'adj-proc-load adj-proc-load)
-;;; 				      (cons 'adj-core-load adj-core-load))
-;;; 				(list (cons '1m-load (car loads))
-;;; 				      (cons '5m-load (cadr loads))
-;;; 				      (cons '15m-load (caddr loads)))
-;;; 				(list (cons 'proc act-proc)
-;;; 				      (cons 'core act-core)
-;;; 				      (cons 'phys act-phys)))))
-;;; 		  result)
-;;; 		(regex-case
-;;; 		    hed
-;;; 		  (load-rx  ( x l1 l5 l15 ) (loop (car tal)(cdr tal)(map string->number (list l1 l5 l15)) proc-num phys-num core-num))
-;;; 		  (proc-rx  ( x p         ) (loop (car tal)(cdr tal) loads           (max-num p proc-num) phys-num core-num))
-;;; 		  (phys-rx  ( x p         ) (loop (car tal)(cdr tal) loads           proc-num (max-num p phys-num) core-num))
-;;; 		  (core-rx  ( x c         ) (loop (car tal)(cdr tal) loads           proc-num phys-num (max-num c core-num)))
-;;; 		  (else 
-;;; 		   (begin
-;;; 		     ;; (print "NO MATCH: " hed)
-;;; 		     (loop (car tal)(cdr tal) loads proc-num phys-num core-num))))))))))
-;;; 
-;;; (define (get-host-stats acfg)
-;;;   (let ((stats-hash (area-stats acfg)))
-;;;     ;; use this opportunity to remove references to dbfiles which have not been accessed in a while
-;;;     (for-each
-;;;      (lambda (dbname)
-;;;        (let* ((stats       (hash-table-ref stats-hash dbname))
-;;; 	      (last-access (stat-when stats)))
-;;; 	 (if (and (> last-access 0)                             ;; if zero then there has been no access
-;;; 		  (> (- (current-seconds) last-access) 10))     ;; not used in ten seconds
-;;; 	     (begin
-;;; 	       (print "Removing " dbname " from stats list")
-;;; 	       (hash-table-delete! stats-hash dbname) ;; remove from stats hash
-;;; 	       (stat-dbs-set! stats (hash-table-keys stats))))))
-;;;      (hash-table-keys stats-hash))
-;;;     
-;;;     `(,(hash-table->alist (area-dbs acfg)) ;; dbname => randnum
-;;;       ,(map (lambda (dbname)  ;; dbname is the db name
-;;; 	      (cons dbname (stat-when (hash-table-ref stats-hash dbname))))
-;;; 	    (hash-table-keys stats-hash))
-;;;       (cpuload . ,(get-normalized-cpu-load)))))
-;;;     #;(stats   . ,(map (lambda (k) ;; create an alist from the stats data
-;;; 		       (cons k (stat->alist (hash-table-ref (area-stats acfg) k))))
-;;; 		     (hash-table-keys (area-stats acfg))))
-;;; 
-;;; #;(trace
-;;;  ;; assv
-;;;  ;; cdr
-;;;  ;; caar
-;;;  ;; ;; cdr
-;;;  ;; call
-;;;  ;; finalize-all-db-handles
-;;;  ;; get-all-server-pkts
-;;;  ;; get-normalized-cpu-load
-;;;  ;; get-normalized-cpu-load-raw
-;;;  ;; launch
-;;;  ;; nmsg-send
-;;;  ;; process-db-queries
-;;;  ;; receive-message
-;;;  ;; std-peer-handler
-;;;  ;; update-known-servers
-;;;  ;; work-queue-processor
-;;;  )
-;;; 
-;;; ;;======================================================================
-;;; ;; netutil
-;;; ;;   move this back to ulex-netutil.scm someday?
-;;; ;;======================================================================
-;;; 
-;;; ;; #include <stdio.h>
-;;; ;; #include <netinet/in.h>
-;;; ;; #include <string.h>
-;;; ;; #include <arpa/inet.h>
-;;; 
-;;; (foreign-declare "#include \"sys/types.h\"")
-;;; (foreign-declare "#include \"sys/socket.h\"")
-;;; (foreign-declare "#include \"ifaddrs.h\"")
-;;; (foreign-declare "#include \"arpa/inet.h\"")
-;;; 
-;;; ;; get IP addresses from ALL interfaces
-;;; (define get-all-ips
-;;;   (foreign-safe-lambda* scheme-object ()
-;;;     "
-;;; 
-;;; // from https://stackoverflow.com/questions/17909401/linux-c-get-default-interfaces-ip-address :
-;;; 
-;;; 
-;;;     C_word lst = C_SCHEME_END_OF_LIST, len, str, *a;
-;;; //    struct ifaddrs *ifa, *i;
-;;; //    struct sockaddr *sa;
-;;; 
-;;;     struct ifaddrs * ifAddrStruct = NULL;
-;;;     struct ifaddrs * ifa = NULL;
-;;;     void * tmpAddrPtr = NULL;
-;;; 
-;;;     if ( getifaddrs(&ifAddrStruct) != 0)
-;;;       C_return(C_SCHEME_FALSE);
-;;; 
-;;; //    for (i = ifa; i != NULL; i = i->ifa_next) {
-;;;     for (ifa = ifAddrStruct; ifa != NULL; ifa = ifa->ifa_next) {
-;;;         if (ifa->ifa_addr->sa_family==AF_INET) { // Check it is
-;;;             // a valid IPv4 address
-;;;             tmpAddrPtr = &((struct sockaddr_in *)ifa->ifa_addr)->sin_addr;
-;;;             char addressBuffer[INET_ADDRSTRLEN];
-;;;             inet_ntop(AF_INET, tmpAddrPtr, addressBuffer, INET_ADDRSTRLEN);
-;;; //            printf(\"%s IP Address %s\\n\", ifa->ifa_name, addressBuffer);
-;;;             len = strlen(addressBuffer);
-;;;             a = C_alloc(C_SIZEOF_PAIR + C_SIZEOF_STRING(len));
-;;;             str = C_string(&a, len, addressBuffer);
-;;;             lst = C_a_pair(&a, str, lst);
-;;;         } 
-;;; 
-;;; //        else if (ifa->ifa_addr->sa_family==AF_INET6) { // Check it is
-;;; //            // a valid IPv6 address
-;;; //            tmpAddrPtr = &((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr;
-;;; //            char addressBuffer[INET6_ADDRSTRLEN];
-;;; //            inet_ntop(AF_INET6, tmpAddrPtr, addressBuffer, INET6_ADDRSTRLEN);
-;;; ////            printf(\"%s IP Address %s\\n\", ifa->ifa_name, addressBuffer);
-;;; //            len = strlen(addressBuffer);
-;;; //            a = C_alloc(C_SIZEOF_PAIR + C_SIZEOF_STRING(len));
-;;; //            str = C_string(&a, len, addressBuffer);
-;;; //            lst = C_a_pair(&a, str, lst);
-;;; //       }
-;;; 
-;;; //       else {
-;;; //         printf(\" not an IPv4 address\\n\");
-;;; //       }
-;;; 
-;;;     }
-;;; 
-;;;     freeifaddrs(ifa);
-;;;     C_return(lst);
-;;; 
-;;; "))
-;;; 
-;;; ;; Change this to bias for addresses with a reasonable broadcast value?
-;;; ;;
-;;; (define (ip-pref-less? a b)
-;;;   (let* ((rate (lambda (ipstr)
-;;;                  (regex-case ipstr
-;;;                              ( "^127\\." _ 0 )
-;;;                              ( "^(10\\.0|192\\.168\\.)\\..*" _ 1 )
-;;;                              ( else 2 ) ))))
-;;;     (< (rate a) (rate b))))
-;;;   
-;;; 
-;;; (define (get-my-best-address)
-;;;   (let ((all-my-addresses (get-all-ips))
-;;;         ;;(all-my-addresses-old (vector->list (hostinfo-addresses (hostname->hostinfo (get-host-name)))))
-;;;         )
-;;;     (cond
-;;;      ((null? all-my-addresses)
-;;;       (get-host-name))                                          ;; no interfaces?
-;;;      ((eq? (length all-my-addresses) 1)
-;;;       (car all-my-addresses))                      ;; only one to choose from, just go with it
-;;;      
-;;;      (else
-;;;       (car (sort all-my-addresses ip-pref-less?)))
-;;;      ;; (else 
-;;;      ;;  (ip->string (car (filter (lambda (x)                      ;; take any but 127.
-;;;      ;;    			 (not (eq? (u8vector-ref x 0) 127)))
-;;;      ;;    		       all-my-addresses))))
-;;; 
-;;;      )))
-;;; 
-;;; (define (get-all-ips-sorted)
-;;;   (sort (get-all-ips) ip-pref-less?))
-;;; 
-;;; 
-
+  (map address-info-host
+       (filter (lambda (x)
+		 (equal? (address-info-type x) "tcp"))
+	       (address-infos (get-host-name)))))
+
+)

ADDED   utils/load-the-db.scm
Index: utils/load-the-db.scm
==================================================================
--- /dev/null
+++ utils/load-the-db.scm
@@ -0,0 +1,41 @@
+;; start the repl and then load this file
+
+(define start-time (current-seconds))
+
+(let loop ((last-print 0)
+	   (num-calls  0))
+  (let* ((all-run-ids (rmt:get-all-run-ids))
+	 (do-print    (> (- (current-seconds) last-print) 2))
+	 (max-query   0)
+	 (num-calls   (+ num-calls
+			 1                    ;; account for call above
+			 (length all-run-ids) ;; account for the get-tests-for-run in the for-each below
+			 )))
+    (for-each
+     (lambda (run-id)
+       ;; (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)
+       (let* ((all-run-data (rmt:get-tests-for-run run-id "%" '() '() #f #f #f #f #f #f #f #f)))
+	 (set! num-calls (+ num-calls (length all-run-data)))
+	 (for-each
+	  (lambda (testdat)
+	    (let* ((test-id (vector-ref testdat 0))
+		   (start-at (current-milliseconds))
+		   (testinfo (rmt:get-test-info-by-id run-id test-id))
+		   (query-time (- (current-milliseconds) start-at)))
+	      (if (> query-time max-query)
+		  (set! max-query query-time))))
+	  all-run-data)
+	 (if do-print
+	     (let* ((run-time (- (current-seconds) start-time))
+		    (qry-rate (if (> run-time 0)
+				  (inexact->exact (round (/  num-calls run-time)))
+				  -1)))
+		(print "Running "run-time"s, run "run-id
+		    " has "(length all-run-data)" tests, max query "max-query
+		    "ms with avg query rate "qry-rate" qry/s")))))
+     all-run-ids)
+    (loop (if do-print
+	      (current-seconds)
+	      last-print)
+	  num-calls)))
+

Index: utils/nbfake
==================================================================
--- utils/nbfake
+++ utils/nbfake
@@ -39,10 +39,11 @@
 nbfake behavior can be changed by setting the following env vars:
    NBFAKE_HOST       SSH to \$NBFAKE_HOST and run command
    NBFAKE_LOG        Logfile for nbfake output
    NB_WASH_GROUPS    comma-separated list of groups to wash into
    NB_WASH_ENABLED   must be set in order to enable wash groups
+   NBFAKE_QUIET      set to suppress informational output
 
 __EOF
   exit
 fi
 
@@ -87,19 +88,21 @@
 
 #==============================================================================
 # Run and log
 #==============================================================================
 
+if [[ -z "$NBFAKE_QUIET" ]];then
 cat <<__EOF >&2
 #======================================================================
 # NBFAKE logging command to: $MY_NBFAKE_LOG
 #     $WASHCMD $*
 #======================================================================
 __EOF
+fi
 
 if [[ -z "$MY_NBFAKE_HOST" ]]; then
   # Run locally
   sh -c "cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=\"$PATH\"; nohup $WASHCMD $* >> $MY_NBFAKE_LOG 2>&1 &"
 else
   # run remotely
   ssh -X -n -f $MY_NBFAKE_HOST "sh -c \"cd $CURRWD;export DISPLAY=$DISPLAY; export PATH=\"$PATH\"; nohup $WASHCMD $* >> $MY_NBFAKE_LOG 2>&1 &\""
 fi

Index: utils/plot-code.scm
==================================================================
--- utils/plot-code.scm
+++ utils/plot-code.scm
@@ -23,21 +23,41 @@
 ;;        dot -Tpdf plot.dot > plot.pdf
 ;; first param is comma separated list of files to include in the map, use - to do all
 ;; second param is list of regexs for functions to include in the map
 ;; third param is list of files to scan
 
-(use regex srfi-69 srfi-13)
+;; (use regex srfi-69 srfi-1 srfi-13)
+
+(module plot-code
+	*
+
+(import scheme chicken.base chicken.port chicken.string chicken.io)
+(import chicken.process-context)	
+(import	regex srfi-1 srfi-69 srfi-13 matchable)
 
+(define files #f)
+(define targs #f)
+(define function-patt #f)
 (define targs #f) 
-(define files (cdr (cddddr (argv))))
-
-(let ((targdat (cadddr (argv))))
-  (if (equal? targdat "-")
-      (set! targs files)
-      (set! targs (string-split targdat ","))))
-
-(define function-patt (car (cdr (cdddr (argv)))))
+
+(match (command-line-arguments)
+       ((targfiles fnrx . scanfiles)
+	(set! targs (string-split-fields "," targfiles #:infix))
+	(set! function-patt fnrx)
+	(set! files scanfiles))
+       (else
+	(print "Usage: plot-code file1.scm,file2.scm *.scm > plot.dot
+ dot -Tpdf plot.dot > plot.pdf")
+	(exit)))
+
+;; (define files (cdr (cddddr (argv))))
+;; 
+;; (let ((targdat (cadddr (argv))))
+;;   (if (equal? targdat "-")
+;;       (set! targs files)
+;;       (set! targs (string-split targdat ","))))
+
 (define function-rx   (regexp function-patt))
 (define filedat-defns (make-hash-table))
 (define filedat-usages (make-hash-table))
 
 (define defn-rx (regexp "^\\s*\\(define\\s+\\(([^\\s\\)]+).*"))
@@ -197,5 +217,6 @@
  function-calls)
 
 (print "}")
 
 (exit)
+)

ADDED   utils/plot-uses.scm
Index: utils/plot-uses.scm
==================================================================
--- /dev/null
+++ utils/plot-uses.scm
@@ -0,0 +1,147 @@
+#!/mfs/pkgs/chicken/4.8.0.5/bin/csi -nbq
+
+;;  Copyright 2006-2017, Matthew Welland.
+;; 
+;; This file is part of Megatest.
+;; 
+;;     Megatest is free software: you can redistribute it and/or modify
+;;     it under the terms of the GNU General Public License as published by
+;;     the Free Software Foundation, either version 3 of the License, or
+;;     (at your option) any later version.
+;; 
+;;     Megatest is distributed in the hope that it will be useful,
+;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;     GNU General Public License for more details.
+;; 
+;;     You should have received a copy of the GNU General Public License
+;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+;;
+
+;; Coming soon (right?) Usage: plot-code file1.scm,file2.scm "fun1,fun2,x*" *.scm > plot.dot
+;; Usage: plot-code file1.scm,file2.scm *.scm > plot.dot
+;;        dot -Tpdf plot.dot > plot.pdf
+;; first param is comma separated list of files to include in the map, use - to do all
+;; second param is list of regexs for functions to include in the map
+;; third param is list of files to scan
+
+(module plot-uses
+	*
+
+(import scheme chicken)
+
+(use regex srfi-69 srfi-13)
+(use matchable data-structures ports extras)
+
+(define unituses-rx (regexp "^\\(declare \\((unit|uses) ([^\\(\\)]+)\\).*"))
+
+(define (print-err . data)
+  (with-output-to-port (current-error-port)
+    (lambda ()
+      (apply print data))))
+
+(define (process-file ignores fname)
+  (with-input-from-file fname
+    (lambda ()
+      (let loop ((modname "DUMMYMOD"))
+	(let* ((inl (read-line)))
+	  (if (eof-object? inl)
+	      #t
+	      (match (string-search unituses-rx inl)
+ 	         ((_ dtype unitname)
+		  (if (equal? dtype "unit")
+		      (loop unitname)
+		      (begin
+			(if (equal? dtype "uses")
+			    (if (not (or (member modname '("DUMMYMOD"))
+					 (member modname ignores)
+					 (member unitname ignores)))
+				(print "  \""unitname"\" -> \""modname"\";"))
+			    (print-err "ERROR: bad declare line \""inl"\""))
+			(loop modname))))
+		 (else
+		  (loop modname)))))))))
+
+(define (main)
+  (match (command-line-arguments)
+    (("todot" ignoreunits . files)
+     (let* ((ignores (string-split ignoreunits ",")))
+       (print-err "Making graph for files: " (string-intersperse files ", "))
+       (print "digraph uses_unit {")
+       (for-each
+	(lambda (fname)
+	  (print "// Filename: "fname)
+	  (process-file ignores fname))
+	files)
+       (print "}")))
+    (else
+     (print-err "Usage: plot-uses u1,u2... file1.scm ...")
+     (print-err "    where u1,u2... are units to ignore and file1.scm... are the files to process."))))
+
+(main)
+
+)
+;; 
+;; ;; Gather the usages
+;; (print "digraph G {")
+;; (define curr-cluster-num 0)
+;; (define function-calls '())
+;; 
+;; (for-each
+;;  (lambda (fname)
+;;    (let ((last-func #f))
+;;      (print-err "Processing file " fname)
+;;      (print "subgraph cluster_" curr-cluster-num " {")
+;;      (set! curr-cluster-num (+ curr-cluster-num 1))
+;;      (with-input-from-file fname
+;;        (lambda ()
+;; 	 (with-output-to-port (current-error-port)
+;; 	   (lambda ()
+;; 	     (print "Analyzing file " fname)))
+;; 	 (print "label=\"" fname "\";")
+;; 	 (let loop ((inl    (read-line))
+;; 		    (fnname "toplevel")
+;; 		    (allcalls '()))
+;; 	   (if (eof-object? inl)
+;; 	       (begin
+;; 		 (set! function-calls (cons (list fnname allcalls) function-calls))
+;; 		 (for-each 
+;; 		  (lambda (call-name)
+;; 		    (hash-table-set! breadcrumbs call-name #t))
+;; 		  allcalls)
+;; 		 (print-err "function: " fnname " allcalls: " allcalls))
+;; 	       (let ((match (string-match defn-rx inl)))
+;; 		 (if match
+;; 		     (let ((func-name (cadr match)))
+;; 		       (if last-func
+;; 			   (print "\"" func-name "\" -> \"" last-func "\";")
+;; 			   (print "\"" func-name "\";"))
+;; 		       (set! last-func func-name)
+;; 		       (hash-table-set! breadcrumbs func-name #t)
+;; 		       (loop (read-line)
+;; 			     func-name
+;; 			     allcalls))
+;; 		     (let ((calls (look-for-all-calls inl fnname)))
+;; 		       (loop (read-line) fnname (append allcalls calls)))))))))
+;;      (print "}")))
+;;  targs)
+;; 
+;; (print-err "breadcrumbs: " (hash-table-keys breadcrumbs))
+;; (print-err "function-calls: " function-calls)
+;; 
+;; (for-each 
+;;  (lambda (function-call)
+;;    (print-err "function-call: " function-call)
+;;    (let ((fnname (car function-call))
+;; 	 (calls  (cadr function-call)))
+;;      (for-each
+;;       (lambda (callname)
+;; 	(print (if (hash-table-ref/default breadcrumbs callname #f) "" "// ")
+;; 	       "\"" fnname "\" -> \"" callname "\";"))
+;;       calls)))
+;;  function-calls)
+;; 
+;; (print "}")
+;; 
+;; (exit)
+;;