Index: api.scm
==================================================================
--- api.scm
+++ api.scm
@@ -1,5 +1,7 @@
+
+
 ;;======================================================================
 ;; Copyright 2006-2013, Matthew Welland.
 ;; 
 ;; This file is part of Megatest.
 ;; 
@@ -163,11 +165,11 @@
             (params            (vector-ref dat 1))
             (start-t           (current-milliseconds))
             (readonly-mode     (dbr:dbstruct-read-only dbstruct))
             (readonly-command  (member cmd api:read-only-queries))
             (writecmd-in-readonly-mode (and readonly-mode (not readonly-command)))
-            (foo               (begin
+            #;(foo               (begin
                                  (common:telemetry-log (conc "api-in:"(->string cmd))
                                                        payload: `((params . ,params)))
                                  
                                  #t))
             (res    
@@ -359,16 +361,16 @@
 			 start-t)))
 	 (hash-table-set! *db-api-call-time* cmd
 			  (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '()))))
        (if writecmd-in-readonly-mode
            (begin
-             (common:telemetry-log (conc "api-out:"(->string cmd))
+             #;(common:telemetry-log (conc "api-out:"(->string cmd))
                                    payload: `((params . ,params)
                                               (ok-res . #t)))
 	     (vector #f res))
            (begin
-             (common:telemetry-log (conc "api-out:"(->string cmd))
+             #;(common:telemetry-log (conc "api-out:"(->string cmd))
                                    payload: `((params . ,params)
                                               (ok-res . #f)))
              (vector #t res))))))))
 
 ;; http-server  send-response
@@ -377,26 +379,34 @@
 ;;
 ;; NB// Runs on the server as part of the server loop
 ;;
 (define (api:process-request dbstruct $) ;; the $ is the request vars proc
   (set! *api-process-request-count* (+ *api-process-request-count* 1))
+  (debug:print 4 *default-log-port* "server-id:"  *server-id*)
   (let* ((cmd     ($ 'cmd))
 	 (paramsj ($ 'params))
-	 (params  (db:string->obj paramsj transport: 'http)) ;; incoming data from the POST (or is it a GET?)
-	 (resdat  (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result )
-	 (success (vector-ref resdat 0))
-	 (res     (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?)
-    (if (not success)
-	(debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params))
-    (if (> *api-process-request-count* *max-api-process-requests*)
-	(set! *max-api-process-requests* *api-process-request-count*))
-    (set! *api-process-request-count* (- *api-process-request-count* 1))
-    ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds
-    ;; (rmt:dat->json-str
-    ;;  (if (or (string? res)
-    ;;          (list?   res)
-    ;;          (number? res)
-    ;;          (boolean? res))
-    ;;      res 
-    ;;      (list "ERROR, not string, list, number or boolean" 1 cmd params res)))))
-    (db:obj->string res transport: 'http)))
+         (key     ($ 'key))   ;; key sent with the request; compared against *server-id* below
+	 (params  (db:string->obj paramsj transport: 'http))) ;; incoming data from the POST (or is it a GET?)
+    (debug:print 4 *default-log-port* "cmd:" cmd " with params " params "key " key)
+    (if (equal? key *server-id*) ;; only execute the request when the supplied key matches *server-id*
+	(let* ((resdat  (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result )
+	       (success (vector-ref resdat 0))
+	       (res     (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?)
+	  (debug:print 4 *default-log-port* "res:" res)
+	  (if (not success)
+	      (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params))
+	  (if (> *api-process-request-count* *max-api-process-requests*) ;; track the high-water mark of in-flight requests
+	      (set! *max-api-process-requests* *api-process-request-count*))
+	  (set! *api-process-request-count* (- *api-process-request-count* 1))
+	  ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds
+	  ;; (rmt:dat->json-str
+	  ;;  (if (or (string? res)
+	  ;;          (list?   res)
+	  ;;          (number? res)
+	  ;;          (boolean? res))
+	  ;;      res 
+	  ;;      (list "ERROR, not string, list, number or boolean" 1 cmd params res)))))
+	  (db:obj->string res transport: 'http))
+	(begin (set! *api-process-request-count* (- *api-process-request-count* 1)) ;; a refused request must also undo the increment made on entry
+	  (debug:print 0 *default-log-port*   "Server refused to process request. Server id mismatch. received " key " expected:  " *server-id* ".\nOther arguments received: cmd=" cmd " params = " params)
+	  (db:obj->string (conc "Server refused to process request server-id mismatch: " key ", " *server-id*) transport: 'http)))))
 

Index: archive.scm
==================================================================
--- archive.scm
+++ archive.scm
@@ -391,10 +391,11 @@
 ))  
 
 (define (archive:ls->list  bup-exe archive-dir internal-path)
   (let ((cmd (conc bup-exe " -d " archive-dir  " ls -l " internal-path "| awk '{print $6}' | sort"))
         (res '()))
+    (debug:print-info 0 *default-log-port* cmd)
     (handle-exceptions
      exn
      #f ;; anything goes wrong - assume the process in NOT running.
      (with-input-from-pipe 
       cmd
@@ -412,22 +413,23 @@
    (seconds->local-time sec)
    "%Y-%m-%d-%H%M%S"))
  
 
 (define (archive:get-timestamp-dir bup-exe archive-dir testsuite-name run-id test-partial-path test-last-update)
-    (print (seconds->std-time-str test-last-update)) 
+    (debug:print-info 0 *default-log-port* "Test last update time:" (seconds->std-time-str test-last-update)) 
     (let* ((internal-path (conc testsuite-name "-" run-id))
+           (archive-update-delay (string->number (or (configf:lookup *configdat* "archive" "test-update-delay") "900" )))  
            (ts-list (archive:ls->list  bup-exe archive-dir internal-path))
            (ds-flag (vector-ref (seconds->local-time) 8)))
            (let loop ((hed (car ts-list))
                        (tail (cdr ts-list)))
                    (if (and (null? tail) (equal? hed "latest"))
                         #f
                     (if (and (not (null? tail)) (equal? hed "latest"))
                         (loop (car tail) (cdr tail))
                   (let* ((archive-seconds (time-string->seconds hed ds-flag)))
-                    (if (< (abs (- archive-seconds test-last-update)) 120)
+                    (if (< (abs (- archive-seconds test-last-update)) archive-update-delay)
                     (let* ((test-list (archive:ls->list  bup-exe archive-dir (conc internal-path "/" hed "/" test-partial-path))))
                            (if (> (length test-list) 0)
                                hed
                                (if (not (null? tail)) 
                                  (loop (car tail) (cdr tail))
@@ -481,10 +483,11 @@
 	      (exclude-file            (args:get-arg "-exclude-rx-from")))
 	 (if (not archive-timestamp-dir)
                (debug:print-error 0 *default-log-port* "Archive not found for testsuite" (common:get-testsuite-name) " run/test/itempath" test-partial-path)
          (begin    
 	 ;; some sanity checks, move an existing path out of the way - iif it is not a toplevel with children
+         (debug:print-info 0 *default-log-port* "Archive time: " archive-timestamp-dir)
 	 (if (and (not toplevel/children)  ;; special handling needed for toplevel with children
 		  prev-test-physical-path
 		  (common:file-exists? prev-test-physical-path)) ;; what to do? abort or clean up or link it in?
 	     (let* ((base (pathname-directory prev-test-physical-path))
 		    (dirn (pathname-file      prev-test-physical-path))

Index: client.scm
==================================================================
--- client.scm
+++ client.scm
@@ -88,18 +88,25 @@
       (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath))
 	     (runremote  (or area-dat *runremote*)))
 	(if (not server-dat) ;; no server found
 	    (client:setup-http areapath remaining-tries: (- remaining-tries 1))
 	    (let ((host  (cadr  server-dat))
-		  (port  (caddr server-dat)))
+		  (port  (caddr server-dat))
+                  (server-id (caddr (cddr server-dat))))
 	      (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
 	      (if (and (not area-dat)
 		       (not *runremote*))
-		  (set! *runremote* (make-remote)))
-	      (if (and host port)
+                  (begin       
+		    (set! *runremote* (make-remote))
+                    (let* ((server-info (remote-server-info *runremote*))) 
+                      (if server-info
+                        (begin
+                          (remote-server-url-set! *runremote* (server:record->url server-info))
+                          (remote-server-id-set! *runremote* (server:record->id server-info)))))))
+	      (if (and host port server-id)
 		  (let* ((start-res (case *transport-type*
-				      ((http)(http-transport:client-connect host port))))
+				      ((http)(http-transport:client-connect host port server-id))))
 			 (ping-res  (case *transport-type* 
 				      ((http)(rmt:login-no-auto-client-setup start-res)))))
 		    (if (and start-res
 			     ping-res)
 			(let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago

Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -296,11 +296,13 @@
 (define (common:logpro-exit-code->test-status exit-code)
   (status-sym->string (common:logpro-exit-code->status-sym exit-code)))
 
 (defstruct remote
   (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
-  (server-url        (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f))
+  (server-url        #f) ;; (server:check-if-running *toppath*) #f))
+  (server-id         #f)
+  (server-info       (if *toppath* (server:check-if-running *toppath*)))
   (last-server-check 0)  ;; last time we checked to see if the server was alive
   (conndat           #f)
   (transport         *transport-type*)
   (server-timeout    (server:expiration-timeout))
   (force-server      #f)
@@ -567,11 +569,11 @@
 			     (- num-logs max-allowed))))
 	    (for-each
 	     (lambda (file)
 	       (let* ((fullname (conc "logs/" file)))
 		 (if (directory? fullname)
-		     (debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
+		     (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
 		     (handle-exceptions
 		      exn
 		      (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn)
 		      (delete-file* fullname)))))
 	     files)
@@ -3546,14 +3548,14 @@
                #t) ;; just ignore it, it might have died in the meantime so joining it will throw an exception
              (thread-join! thread))
            )))
    (hash-table-keys *common:thread-punchlist*)))
 
-(define *common:telemetry-log-state* 'startup)
-(define *common:telemetry-log-socket* #f)
+#;(define *common:telemetry-log-state* 'startup)
+#;(define *common:telemetry-log-socket* #f)
 
-(define (common:telemetry-log-open)
+#;(define (common:telemetry-log-open)
   (if (eq? *common:telemetry-log-state* 'startup)
       (let* ((serverhost (configf:lookup *configdat* "telemetry" "host"))
              (serverport (configf:lookup-number *configdat* "telemetry" "port"))
              (user (or (get-environment-variable "USER") "unknown"))
              (host (or (get-environment-variable "HOST") "unknown")))
@@ -3569,11 +3571,11 @@
                      (udp-connect! s serverhost serverport)
                      (set! *common:telemetry-log-socket* s)
                      'open)
                    'not-needed))))))
   
-(define (common:telemetry-log event #!key (payload '()))
+#;(define (common:telemetry-log event #!key (payload '()))
   (if (eq? *common:telemetry-log-state* 'startup)
       (common:telemetry-log-open))
 
   (if (eq? 'open *common:telemetry-log-state*)
       (handle-exceptions
@@ -3596,11 +3598,11 @@
                      (with-output-to-string (lambda () (pp payload))))))
                   (msg     (conc user":"host":"start":"(current-process-id)":"(car (argv))":"
                                  toppath":"payload-serialized)))
              (udp-send *common:telemetry-log-socket* msg))))))
   
-(define (common:telemetry-log-close)
+#;(define (common:telemetry-log-close)
   (when (or (member *common:telemetry-log-state* '(broken-or-no-server-preclose open)) *common:telemetry-log-socket*)
     (handle-exceptions
      exn
      (begin
        (define *common:telemetry-log-state* 'closed-fail)

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -1534,11 +1534,11 @@
                               state        TEXT DEFAULT 'new',
                               status       TEXT DEFAULT 'n/a',
                               archive_type TEXT DEFAULT 'bup',
                               du           INTEGER,
                               archive_path TEXT);")))
-         (print "creating trigges from init") 
+         ;; (print "creating trigges from init") 
         (db:create-triggers db)    
      db)) ;; )
 
 ;;======================================================================
 ;; A R C H I V E S
@@ -3968,17 +3968,133 @@
 ;;    (db:set-state-status-and-roll-up-items dbstruct run-id test-id #f state status msg)
 ;;    ;; process the test_data table
 ;;    (if (and test-id state status (equal? status "AUTO")) 
 ;; 	(db:test-data-rollup dbstruct run-id test-id status))
 ;;    (mt:process-triggers dbstruct run-id test-id state status)))
+
+;; NOT FINISHED - placeholder: returns the list (state status) unchanged; tl-state and tl-status are not used yet
+(define (db:calc-state-status-toplevel state status tl-state tl-status)
+  `(,state ,status))
+
+;;   (match state
+;;     (("COMPLETED")
+;;      (match `(,tl-state ,tl-status)
+;;        (("COMPLETED" "PASS") `(,state ,status))
+;;        (("COMPLETED" thestatus)
+;; 	(case (string->symbol thestatus)
+;; 	  ((ABORT CHECK DEAD)
+;; 	   (if `("COMPLETED" ,thestatus))
+;; 	(match `(,thestatus ,status)
+;; 	  (("FAIL" "ABORT") '("COMPLETED" "ABORT"))
+;; 	  (("FAIL" "CHECK") '("COMPLETED" "CHECK"))
+;; 	  (("FAIL" "DEAD")  '("COMPLETED" "DEAD"))
+;; 	  (("WARN" "FAIL")  '("COMPLETED" "FAIL"))
+;; 	  (("WARN" "CHECK") '("COMPLETED" "CHECK"))
+;; 	  (("WARN" "DEAD")
+       
+(define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment)
+  ;; Set state/status on the incoming test, then - only when it is an item (item-path not "") - roll the item results up to the toplevel test.
+  ;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met
+  ;; (mutex-lock! *db-transaction-mutex*) ;; why do we need a mutex?
+  (let* ((testdat      (if (number? test-name)
+			   (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id
+			   (db:get-test-info       dbstruct run-id test-name item-path)))
+	 (test-id      (db:test-get-id testdat))
+	 (test-name    (if (number? test-name)
+			   (db:test-get-testname testdat)
+			   test-name))
+	 (no-sync-db   (db:no-sync-db #f))
+	 (rollup-flag  #f) ;; set to #t when this call is the one that must perform the rollup
+	 (wait-flag    #f) ;; set to #t when a rollup is already in progress and this call registered as the waiter
+	 (rollup-lock-key  (conc run-id "-rollup-" test-name))
+	 (waiting-lock-key (conc run-id "-waiting-" test-name)))
+    (db:test-set-state-status dbstruct run-id test-id state status #f) ;; NOTE(review): the comment argument is not forwarded (#f is passed) - confirm this is intended
+    (if (and test-id state status (equal? status "AUTO")) 
+	(db:test-data-rollup dbstruct run-id test-id status))
+    (if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
+	(db:general-call dbstruct 'set-test-start-time (list test-id)))
+
+    (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item
+	(begin
+	  ;; is there a rollup lock? If not, take it; if there is and no one is waiting yet, register as the waiter
+	  (sqlite3:with-transaction
+	   no-sync-db
+	   (lambda ()
+	     ;; (debug:print 0 *default-log-port* "EXCEPTION: exn="exn)
+	     (let* ((rollup-lock-time  (db:no-sync-get/default no-sync-db rollup-lock-key #f))
+		    (waiting-lock-time (db:no-sync-get/default no-sync-db waiting-lock-key #f)))
+	       (if rollup-lock-time ;; someone is doing a rollup
+		   (if (not waiting-lock-time) ;; no one is waiting
+		       (begin
+			 (set! wait-flag #t)
+			 (set! rollup-flag #t)
+			 (db:no-sync-set no-sync-db waiting-lock-key (current-seconds)))) ;; we are going to wait
+		   (begin
+		     (set! rollup-flag #t)
+		     (db:no-sync-set no-sync-db rollup-lock-key (current-seconds)))))))
+	  (if wait-flag
+	      (let loop ((count 10)) ;; about 20 seconds
+		(thread-sleep! 2)
+		(if (and (not (db:no-sync-get/default no-sync-db waiting-lock-key #f)) ;; NOTE(review): this polls the waiting lock set above, not the rollup lock - confirm which key was intended
+			 (> count 0))
+		    (loop (- count 1)) ;; count down so the wait is bounded; it was (+ count 1), which can never reach 0
+		    (sqlite3:with-transaction
+		     no-sync-db
+		     (lambda ()
+		       (db:no-sync-set no-sync-db rollup-lock-key (current-seconds))
+		       (db:no-sync-del! no-sync-db waiting-lock-key))))))
+	  ;; now the rollup
+	  (if rollup-flag ;; put this into a thread
+	      (begin
+		;; (thread-start! (make-thread
+		;; 	      (lambda ()
+		(db:roll-up-test-state-status dbstruct run-id test-name state status)
+		(db:no-sync-del! no-sync-db rollup-lock-key)) ;; release the rollup lock once the rollup is done
+	      ;; (conc "thread for run-id: " run-id " test-name: " test-name))))))))
+	      )))))
+	      
+;; Roll up the item state/status counts for test-name in run-id (via db:roll-up-rules) and store the result on the toplevel test. Returns #t.
+;; I'd like to remove the need for item-path - it is logically not needed here
+;; for now we pass in state and status - NOTE: There is a possible race if a test
+;; is rapidly re-run while an earlier run is waiting to rollup.
+(define (db:roll-up-test-state-status dbstruct run-id test-name state status)
+  (let* ((testdat      (if (number? test-name)
+			   (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id
+			   (db:get-test-info       dbstruct run-id test-name "")))
+	 (test-id      (db:test-get-id testdat))
+	 (test-name    (if (number? test-name)
+			   (db:test-get-testname testdat)
+			   test-name))
+	 (tl-testdat   (db:get-test-info dbstruct run-id test-name "")) ;; toplevel record: same testname, empty item-path
+	 (tl-test-id   (db:test-get-id tl-testdat)))
+    (db:with-db
+     dbstruct #f #f
+     (lambda (db)
+       ;; NB// the db handle from db:with-db is passed to db:test-set-state-status below; the counts query is given dbstruct
+       ;; item-path is used in get-all-state-status counts to exclude current state/status of THIS test
+       ;; but with the state/status being set earlier this is not needed any longer
+       (let* ((state-status-counts  (db:get-all-state-status-counts-for-testname dbstruct run-id test-name))
+	      (state-statuses       (if (null? state-status-counts) ;; no counts: skip the rules and fall back to the incoming state/status
+					'()
+					(db:roll-up-rules state-status-counts state status)))
+	      (newstate             (if (null? state-statuses)
+					state
+					(car state-statuses)))
+	      (newstatus            (if (null? state-statuses)
+					status
+					(cadr state-statuses))))
+	 (if tl-test-id
+	     (db:test-set-state-status db run-id tl-test-id newstate newstatus #f))
+	 )))
+    #t))
 
 ;; state is the priority rollup of all states
 ;; status is the priority rollup of all completed statesfu
 ;;
 ;; if test-name is an integer work off that instead of test-name test-path
 ;;
-(define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment)
+(define (db:set-state-status-and-roll-up-items-orig dbstruct run-id test-name item-path state status comment)
   ;; establish info on incoming test followed by info on top level test
   ;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met
   (let* ((testdat      (if (number? test-name)
 			   (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id
 			   (db:get-test-info       dbstruct run-id test-name item-path)))
@@ -3999,17 +4115,17 @@
      (lambda (db)
        (let ((tr-res
               (sqlite3:with-transaction
                db
                (lambda ()
-                 ;; NB// Pass the db so it is part fo the transaction
+                 ;; NB// Pass the db so it is part of the transaction
                  (db:test-set-state-status db run-id test-id state status comment) ;; this call sets the item state/status
                  (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item
                      (let* ((state-status-counts  (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test
-			      						  (state-stauses (db:roll-up-rules state-status-counts state status))
-                          (newstate (car state-stauses))
-                          (newstatus (cadr state-stauses)))
+			    (state-statuses        (db:roll-up-rules state-status-counts state status))
+                          (newstate  (car state-statuses))
+                          (newstatus (cadr state-statuses)))
                        (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts)  " state-status-counts: "
 							(apply conc
                   (map (lambda (x)
                      (conc
                      		(with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | "))
@@ -4022,81 +4138,84 @@
          (if (and test-id state status (equal? status "AUTO")) 
              (db:test-data-rollup dbstruct run-id test-id status))
          tr-res)))))
 
 (define (db:roll-up-rules state-status-counts state status)
-		(let* ((running     (length (filter (lambda (x)
-                          (member (dbr:counts-state x) *common:running-states*))
-                                 state-status-counts)))
-           (bad-not-started      (length (filter (lambda (x)
-                                      (and (equal? (dbr:counts-state x) "NOT_STARTED") 
-                                        (not (member (dbr:counts-status x)  *common:not-started-ok-statuses*))))
-																	state-status-counts)))
-           (all-curr-states      (common:special-sort  ;; worst -> best (sort of)
-                                    (delete-duplicates
-                                      (if (and state (not (member state *common:dont-roll-up-states*)))
-                                          (cons state (map dbr:counts-state state-status-counts))
-                                          (map dbr:counts-state state-status-counts)))
-                                                  *common:std-states* >))
-           (all-curr-statuses    (common:special-sort  ;; worst -> best
-                                    (delete-duplicates
-                                      (if (and state status (not (member state *common:dont-roll-up-states*)))
-                                          (cons status (map dbr:counts-status state-status-counts))
-                                          (map dbr:counts-status state-status-counts)))
-                                                   *common:std-statuses* >))
-           (non-completes        (filter (lambda (x)
-							 										 (not (member x (cons "COMPLETED" *common:dont-roll-up-states*))))
-						       									all-curr-states))
-			     (preq-fails        (filter (lambda (x)
-							 								(equal? x "PREQ_FAIL"))
-						       							all-curr-statuses))
-           (num-non-completes (length non-completes))
- 					 (newstate          (cond
-															((> running 0)           "RUNNING")            ;; anything running, call the situation running
-                              ((> (length preq-fails) 0) "NOT_STARTED")
-															((> bad-not-started 0)   "COMPLETED")          ;; we have an ugly situation, it is completed in the sense we cannot do more.
-															((> num-non-completes 0) (car non-completes))  ;;  (remove (lambda (x)(equal? "COMPLETED" x)) all-curr-states))) ;; only rollup DELETED if all DELETED
-															(else                    (car all-curr-states))))
-           (newstatus         (cond
-                              ((> (length preq-fails) 0)  "PREQ_FAIL")
-                              ((or (> bad-not-started 0)
-                                   (and (equal? newstate "NOT_STARTED")
-                                      (> num-non-completes 0)))
-                                            "STARTED")
-                              (else (car all-curr-statuses)))))
- 					(debug:print-info 2 *default-log-port*
-                                         "\n--> probe db:set-state-status-and-roll-up-items: "
-                                         "\n--> state-status-counts: "(map dbr:counts->alist state-status-counts)
-                                         "\n--> running:             "running
-                                         "\n--> bad-not-started:     "bad-not-started
-                                         "\n--> non-non-completes:   "num-non-completes
-                                         "\n--> non-completes:       "non-completes
-                                         "\n--> all-curr-states:     "all-curr-states
+  (let* ((running              (length (filter (lambda (x)
+						 (member (dbr:counts-state x) *common:running-states*))
+					       state-status-counts)))
+	 (bad-not-started      (length (filter (lambda (x)
+						 (and (equal? (dbr:counts-state x) "NOT_STARTED") 
+						      (not (member (dbr:counts-status x)  *common:not-started-ok-statuses*))))
+					       state-status-counts)))
+	 (all-curr-states      (common:special-sort  ;; worst -> best (sort of)
+				(delete-duplicates
+				 (if (and state (not (member state *common:dont-roll-up-states*)))
+				     (cons state (map dbr:counts-state state-status-counts))
+				     (map dbr:counts-state state-status-counts)))
+				*common:std-states* >))
+	 (all-curr-statuses    (common:special-sort  ;; worst -> best
+				(delete-duplicates
+				 (if (and state status (not (member state *common:dont-roll-up-states*)))
+				     (cons status (map dbr:counts-status state-status-counts))
+				     (map dbr:counts-status state-status-counts)))
+				*common:std-statuses* >))
+	 (non-completes        (filter (lambda (x)
+					 (not (member x (cons "COMPLETED" *common:dont-roll-up-states*))))
+				       all-curr-states))
+	 (preq-fails        (filter (lambda (x)
+				      (equal? x "PREQ_FAIL"))
+				    all-curr-statuses))
+	 (num-non-completes (length non-completes))
+	 (newstate          (cond
+			     ((> running 0)             "RUNNING")            ;; anything running, call the situation running
+			     ((> (length preq-fails) 0) "NOT_STARTED")
+			     ((> bad-not-started 0)     "COMPLETED")          ;; we have an ugly situation, it is completed in the sense we cannot do more.
+			     ((> num-non-completes 0)   (car non-completes))  ;;  (remove (lambda (x)(equal? "COMPLETED" x)) all-curr-states))) ;; only rollup DELETED if all DELETED
+			     (else                      (car all-curr-states))))
+	 (newstatus         (cond
+			     ((> (length preq-fails) 0)  "PREQ_FAIL")
+			     ((or (> bad-not-started 0)
+				  (and (equal? newstate "NOT_STARTED")
+				       (> num-non-completes 0)))
+			      "STARTED")
+			     (else (car all-curr-statuses)))))
+    (debug:print-info 2 *default-log-port*
+		      "\n--> probe db:set-state-status-and-roll-up-items: "
+		      "\n--> state-status-counts: "(map dbr:counts->alist state-status-counts)
+		      "\n--> running:             "running
+		      "\n--> bad-not-started:     "bad-not-started
+		      "\n--> non-non-completes:   "num-non-completes
+		      "\n--> non-completes:       "non-completes
+		      "\n--> all-curr-states:     "all-curr-states
                                          "\n--> all-curr-statuses:     "all-curr-statuses
                                          "\n--> newstate              "newstate
                                          "\n--> newstatus            "newstatus
                                          "\n\n")
-
-                        ;; NB// Pass the db so it is part of the transaction
-         (list newstate newstatus)))
+    ;; Return the rolled-up result as the list (newstate newstatus)
+    (list newstate newstatus)))
 
 (define (db:set-state-status-and-roll-up-run dbstruct run-id curr-state curr-status)
-    (mutex-lock! *db-transaction-mutex*)
+    ;; (mutex-lock! *db-transaction-mutex*)
     (db:with-db
      dbstruct #f #f
      (lambda (db)
        (let ((tr-res
               (sqlite3:with-transaction
                db
                (lambda ()
                    (let* ((state-status-counts  (db:get-all-state-status-counts-for-run dbstruct run-id))
-													(state-stauses (db:roll-up-rules state-status-counts #f #f ))
-                          (newstate (car state-stauses))
-                          (newstatus (cadr state-stauses))) 
+			  (state-statuses        (if (null? state-status-counts) '() (db:roll-up-rules state-status-counts #f #f))) ;; guard first: db:roll-up-rules takes the car of its state list, so it must not be called with no counts
+                          (newstate             (if (null? state-statuses) ;; nothing to roll up: keep the caller's current state
+						    curr-state
+						    (car state-statuses)))
+			  (newstatus            (if (null? state-statuses) ;; nothing to roll up: keep the caller's current status
+						    curr-status
+						    (cadr state-statuses))))
                     (if (or (not (eq? newstate curr-state)) (not (eq?  newstatus curr-status)))
                   (db:set-run-state-status dbstruct run-id newstate newstatus )))))))
-         (mutex-unlock! *db-transaction-mutex*)
+         ;; (mutex-unlock! *db-transaction-mutex*)
          tr-res))))
 
 
 (define (db:get-all-state-status-counts-for-run dbstruct run-id)
  (let* ((test-count-recs (db:with-db
@@ -4148,10 +4267,25 @@
          
          (unrelated-rec-list   
           (filter nonmatch-countrec-lambda other-items-count-recs)))
     
     (cons updated-count-rec unrelated-rec-list)))
+
+;; full count not including toplevel: returns a list of dbr:counts records, one per (state, status) group,
+;; counting only the rows for test-name in run-id whose item_path is not empty
+(define (db:get-all-state-status-counts-for-testname dbstruct run-id test-name)
+  (let* ((test-count-recs (db:with-db
+			   dbstruct #f #f
+			   (lambda (db)
+			     (sqlite3:map-row
+			      (lambda (state status count)
+				(make-dbr:counts state: state status: status count: count))
+			      db
+			      "SELECT state,status,count(id) FROM tests WHERE run_id=? AND testname=? AND item_path != '' GROUP BY state,status;"
+                                     run-id test-name)))))
+    test-count-recs))
+
 
 ;; (define (db:get-all-item-states db run-id test-name)
 ;;   (sqlite3:map-row 
 ;;    (lambda (a) a)
 ;;    db
@@ -4342,18 +4476,19 @@
 			       set-verbosity
 			       killserver
 			       ))
 
 (define (db:login dbstruct calling-path calling-version client-signature)
-  (cond 
+   (cond 
    ((not (equal? calling-path *toppath*))
     (list #f "Login failed due to mismatch paths: " calling-path ", " *toppath*))
    ;; ((not (equal? *run-id* run-id))
    ;;  (list #f "Login failed due to mismatch run-id: " run-id ", " *run-id*))
    ((not (equal? megatest-version calling-version))
     (list #t (conc "Login warning due to mismatch megatest version: " calling-version ", " megatest-version)))
-   (else
+   
+    (else
     (hash-table-set! *logged-in-clients* client-signature (current-seconds))
     '(#t "successful login"))))
 
 (define (db:general-call dbstruct stmtname params)
   (let ((query (let ((q (alist-ref (if (string? stmtname)

Index: dcommon.scm
==================================================================
--- dcommon.scm
+++ dcommon.scm
@@ -713,11 +713,11 @@
 				 ;;           colnames)
 				 (set! rownum 1)
 				 (for-each 
 				  (lambda (server)
 				    (set! colnum 0)
-				    (match-let (((mod-time host port start-time pid)
+				    (match-let (((mod-time host port start-time server-id pid)
 						 server))
 				      (let* ((uptime  (- (current-seconds) mod-time))
 					     (runtime (if start-time
 							  (- mod-time start-time)
 							  0))

Index: docs/manual/megatest_manual.html
==================================================================
--- docs/manual/megatest_manual.html
+++ docs/manual/megatest_manual.html
@@ -773,13 +773,13 @@
 <h1>The Megatest Users Manual</h1>
 <span id="author">Matt Welland</span><br>
 <span id="email" class="monospaced">&lt;<a href="mailto:matt@kiatoa.com">matt@kiatoa.com</a>&gt;</span><br>
 <span id="revnumber">version 1.5,</span>
 <span id="revdate">June 2020</span>
-<div id="toc">
-  <div id="toctitle">Table of Contents</div>
-  <noscript><p><b>JavaScript must be enabled in your browser to display the table of contents.</b></p></noscript>
+<div id="toc">
+  <div id="toctitle">Table of Contents</div>
+  <noscript><p><b>JavaScript must be enabled in your browser to display the table of contents.</b></p></noscript>
 </div>
 </div>
 <div id="content">
 <div class="sect1">
 <h2 id="_preface">Preface</h2>
@@ -959,10 +959,15 @@
 <img src="megatest-system-architecture.png" alt="Static">
 </div>
 </div>
 </div>
 </div>
+</div>
+<div class="sect1">
+<h2 id="_road_map">Road Map</h2>
+<div class="sectionbody">
+</div>
 </div>
 <div class="sect1">
 <h2 id="_todo_road_map">TODO / Road Map</h2>
 <div class="sectionbody">
 <div class="paragraph"><p>Note: This road-map is a wish list and not a formal plan. Items are in
@@ -1447,11 +1452,14 @@
 <h2 id="_installation">Installation</h2>
 <div class="sectionbody">
 <div class="sect2">
 <h3 id="_dependencies">Dependencies</h3>
 <div class="paragraph"><p>Chicken scheme and a number of "eggs" are required for building
-Megatest. In the v1.66 and beyond assistance to create the build
+Megatest. See the script installall.sh in the utils directory of the
+source distribution for an automated way to install everything
+needed for building Megatest on Linux.</p></div>
+<div class="paragraph"><p>In v1.66 and beyond, assistance to create the build
 system is built into the Makefile.</p></div>
 <div class="listingblock">
 <div class="title">Installation steps (overview)</div>
 <div class="content monospaced">
 <pre>./configure
@@ -3417,10 +3425,48 @@
 <td class="tableblock halign-left valign-top" ><p class="tableblock monospaced"></p></td>
 </tr>
 </tbody>
 </table>
 </div>
+</div>
+<div class="sect1">
+<h2 id="_test_plan">Test Plan</h2>
+<div class="sectionbody">
+<div class="sect2">
+<h3 id="_tests">Tests</h3>
+<div class="paragraph"><p>itemwait|33</p></div>
+<div class="paragraph"><p>rerun-downstream-item|20</p></div>
+<div class="paragraph"><p>rerunclean|20</p></div>
+<div class="paragraph"><p>fullrun|18</p></div>
+<div class="paragraph"><p>goodtests|18</p></div>
+<div class="paragraph"><p>kill-rerun|17</p></div>
+<div class="paragraph"><p>items-runconfigvars|16</p></div>
+<div class="paragraph"><p>ro_test|16</p></div>
+<div class="paragraph"><p>runconfig-tests|16</p></div>
+<div class="paragraph"><p>env-pollution|13</p></div>
+<div class="paragraph"><p>itemmap|11</p></div>
+<div class="paragraph"><p>testpatt_envvar|10</p></div>
+<div class="paragraph"><p>toprun|10</p></div>
+<div class="paragraph"><p>chained-waiton|8</p></div>
+<div class="paragraph"><p>skip-on-fileexists|8</p></div>
+<div class="paragraph"><p>killrun_preqfail|7</p></div>
+<div class="paragraph"><p>subrun|6</p></div>
+<div class="paragraph"><p>dependencies|5</p></div>
+<div class="paragraph"><p>itemwait-simple|4</p></div>
+<div class="paragraph"><p>rollup|4</p></div>
+<div class="paragraph"><p>end-of-run|3</p></div>
+<div class="paragraph"><p>killrun|3</p></div>
+<div class="paragraph"><p>listener|3</p></div>
+<div class="paragraph"><p>test2|3</p></div>
+<div class="paragraph"><p>testpatt|3</p></div>
+<div class="paragraph"><p>env-pollution-usecacheno|2</p></div>
+<div class="paragraph"><p>set-values|2
+envvars|1
+listruns-tests|1
+subrun-usecases|1</p></div>
+</div>
+</div>
 </div>
 <div class="sect1">
 <h2 id="_megatest_internals">Megatest Internals</h2>
 <div class="sectionbody">
 <div class="imageblock graphviz">
@@ -3438,10 +3484,10 @@
 </div>
 <div id="footnotes"><hr></div>
 <div id="footer">
 <div id="footer-text">
 Version 1.5<br>
-Last updated 2020-06-29 08:09:27 PDT
+Last updated 2020-09-08 08:39:29 PDT
 </div>
 </div>
 </body>
 </html>

Index: docs/manual/megatest_manual.txt
==================================================================
--- docs/manual/megatest_manual.txt
+++ docs/manual/megatest_manual.txt
@@ -116,10 +116,12 @@
 include::writing_tests.txt[]
 
 include::howto.txt[]
 
 include::reference.txt[]
+
+include::testplan.txt[]
 
 Megatest Internals
 ------------------
 
 ["graphviz", "server.png"]

ADDED   docs/manual/testplan.txt
Index: docs/manual/testplan.txt
==================================================================
--- /dev/null
+++ docs/manual/testplan.txt
@@ -0,0 +1,81 @@
+// This file is part of Megatest.
+// 
+//     Megatest is free software: you can redistribute it and/or modify
+//     it under the terms of the GNU General Public License as published by
+//     the Free Software Foundation, either version 3 of the License, or
+//     (at your option) any later version.
+// 
+//     Megatest is distributed in the hope that it will be useful,
+//     but WITHOUT ANY WARRANTY; without even the implied warranty of
+//     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+//     GNU General Public License for more details.
+// 
+//     You should have received a copy of the GNU General Public License
+//     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+
+// Copyright 2006-2020, Matthew Welland.
+
+Test Plan
+---------
+
+Tests
+~~~~~
+
+itemwait|33
+
+rerun-downstream-item|20
+
+rerunclean|20
+
+fullrun|18
+
+goodtests|18
+
+kill-rerun|17
+
+items-runconfigvars|16
+
+ro_test|16
+
+runconfig-tests|16
+
+env-pollution|13
+
+itemmap|11
+
+testpatt_envvar|10
+
+toprun|10
+
+chained-waiton|8
+
+skip-on-fileexists|8
+
+killrun_preqfail|7
+
+subrun|6
+
+dependencies|5
+
+itemwait-simple|4
+
+rollup|4
+
+end-of-run|3
+
+killrun|3
+
+listener|3
+
+test2|3
+
+testpatt|3
+
+env-pollution-usecacheno|2
+
+set-values|2
+envvars|1
+listruns-tests|1
+subrun-usecases|1
+
+

Index: http-transport.scm
==================================================================
--- http-transport.scm
+++ http-transport.scm
@@ -246,12 +246,18 @@
 			   (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info")
 			   (exit 1))))
 	 (res        (vector #f "uninitialized"))
 	 (success    #t)
 	 (sparams    (db:obj->string params transport: 'http))
-	 (runremote  (or area-dat *runremote*)))
-       (debug:print-info 11 *default-log-port* "fullurl=" fullurl ", cmd=" cmd ", params=" params ", run-id=" run-id "\n")
+	 (runremote  (or area-dat *runremote*))
+         (server-id   (if (vector? serverdat) 
+                           (http-transport:server-dat-get-server-id serverdat)
+                           (begin
+			     (debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info")
+			     (exit 1)))))
+       (debug:print-info 11 *default-log-port* "cmd=" cmd " fullurl=" fullurl " server-id=" server-id " current time:" (current-seconds)) 
+
        ;; set up the http-client here
        (max-retry-attempts 1)
        ;; consider all requests indempotent
        (retry-request? (lambda (request)
 			 #f))
@@ -273,11 +279,11 @@
                                                 (if (debug:debug-mode 1)
                                                     (debug:print-info 0 *default-log-port* "couldn't talk to server, trying again ...")
                                                     (begin
                                                       (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".")
                                                       (debug:print 0 *default-log-port* " message: " msg ", exn=" exn)
-                                                      (debug:print 0 *default-log-port* " cmd: " cmd " params: " params)
+                                                      (debug:print 0 *default-log-port* " cmd: " cmd " params: " params " key:" (or server-id "thekey"))
                                                       (debug:print 0 *default-log-port* " call-chain: " call-chain)))
                                                 (if runremote
 						    (remote-conndat-set! runremote #f))
 						;; Killing associated server to allow clean retry.")
 						;; (tasks:kill-server-run-id run-id)  ;; better to kill the server in the logic that called this routine?
@@ -286,11 +292,11 @@
 					     ;;;          (make-property-condition 'commfail 'message "failed to connect to server")))
 					     ;;; "communications failed"
 						(db:obj->string #f))
 					    (with-input-from-request ;; was dat
 					     fullurl 
-					     (list (cons 'key (or *server-id* "thekey"))
+					     (list (cons 'key (or server-id "thekey"))
 						   (cons 'cmd cmd)
 						   (cons 'params sparams))
 					     read-string))
 					  transport: 'http)
                                          0)) ;; added this speculatively
@@ -305,12 +311,12 @@
 	      (th1 (make-thread send-recieve "with-input-from-request"))
 	      (th2 (make-thread time-out     "time out")))
 	 (thread-start! th1)
 	 (thread-start! th2)
 	 (thread-join! th1)
+          (vector-set! res 0 success)
 	 (thread-terminate! th2)
-	 (debug:print-info 11 *default-log-port* "got res=" res)
 	 (if (vector? res)
 	     (if (vector-ref res 0) ;; this is the first flag or the second flag?
 		 res ;; this is the *inner* vector? seriously? why?
                  (if (debug:debug-mode 11)
                      (let ((call-chain (get-call-chain))) ;; note: this code also called in nmsg-transport - consider consolidating it
@@ -350,11 +356,12 @@
 (define (http-transport:server-dat-get-port          vec)    (vector-ref  vec 1))
 (define (http-transport:server-dat-get-api-uri       vec)    (vector-ref  vec 2))
 (define (http-transport:server-dat-get-api-url       vec)    (vector-ref  vec 3))
 (define (http-transport:server-dat-get-api-req       vec)    (vector-ref  vec 4))
 (define (http-transport:server-dat-get-last-access   vec)    (vector-ref  vec 5))
-(define (http-transport:server-dat-get-socket        vec)    (vector-ref  vec 6))
+;(define (http-transport:server-dat-get-socket        vec)    (vector-ref  vec 6))
+(define (http-transport:server-dat-get-server-id     vec)    (vector-ref  vec 6))
 
 (define (http-transport:server-dat-make-url vec)
   (if (and (http-transport:server-dat-get-iface vec)
 	   (http-transport:server-dat-get-port  vec))
       (conc "http://" 
@@ -371,15 +378,15 @@
 	(debug:print-error 0 *default-log-port* "call to http-transport:server-dat-update-last-access with non-vector!!"))))
 
 ;;
 ;; connect
 ;;
-(define (http-transport:client-connect iface port)
+(define (http-transport:client-connect iface port server-id) 
   (let* ((api-url      (conc "http://" iface ":" port "/api"))
 	 (api-uri      (uri-reference (conc "http://" iface ":" port "/api")))
 	 (api-req      (make-request method: 'POST uri: api-uri))
-	 (server-dat   (vector iface port api-uri api-url api-req (current-seconds))))
+	 (server-dat   (vector iface port api-uri api-url api-req (current-seconds) server-id)))
     server-dat))
 
 ;; run http-transport:keep-running in a parallel thread to monitor that the db is being 
 ;; used and to shutdown after sometime if it is not.
 ;;
@@ -475,21 +482,27 @@
 	  (let ((new-iface (car sdat))
 		(new-port  (cadr sdat)))
 	    (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info")
 	    (set! iface new-iface)
 	    (set! port  new-port)
-	    (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds))
+             (if (not *server-id*)
+              (set! *server-id* (server:mk-signature)))
+            (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv))
+	    (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*)
 	    (flush-output *default-log-port*)))
       
       ;; Transfer *db-last-access* to last-access to use in checking that we are still alive
       (mutex-lock! *heartbeat-mutex*)
       (set! last-access *db-last-access*)
       (mutex-unlock! *heartbeat-mutex*)
       
       (if (common:low-noise-print 120 (conc "server running on " iface ":" port))
 	  (begin
-	    (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds))
+             (if (not *server-id*)
+              (set! *server-id* (server:mk-signature)))
+            (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv))   
+	    (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*)
 	    (flush-output *default-log-port*)))
       (if (common:low-noise-print 60 "dbstats")
 	  (begin
 	    (debug:print 0 *default-log-port* "Server stats:")
 	    (db:print-current-query-stats)))

Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -211,11 +211,11 @@
 
     (let loop ((minutes   (calc-minutes))
 	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
 	       (disk-free (get-df (current-directory)))
                (last-sync (current-seconds)))
-      (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
+      ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
       (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
              (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                    (delta (abs (- load cpu-load))))
                               (if (> delta 0.1) ;; don't bother updating with small changes
                                   load
@@ -233,11 +233,11 @@
              (test-info   (rmt:get-test-info-by-id run-id test-id))
              (state       (db:test-get-state test-info))
              (status      (db:test-get-status test-info))
              (kill-reason  "no kill reason specified")
              (kill-job?    #f))
-        (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
+        ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
         (cond
          ((test-get-kill-request run-id test-id)
           (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
           (set! kill-job? #t))
          ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
@@ -252,13 +252,13 @@
         (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
         (launch:handle-zombie-tests run-id)
         (when do-sync
           ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
           ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
-          (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
+          ;; (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
           (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
-          (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
+          #;(common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
         
 	(if kill-job? 
 	    (begin
               (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
 	      (mutex-lock! m)

Index: megatest-version.scm
==================================================================
--- megatest-version.scm
+++ megatest-version.scm
@@ -18,6 +18,6 @@
 ;; Always use two or four digit decimal
 ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..
 
 ;; (declare (unit megatest-version))
 
-(define megatest-version 1.6566)
+(define megatest-version 1.6581)

Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -852,11 +852,11 @@
       ))
 
 (if (args:get-arg "-ping")
     (let* ((server-id     (string->number (args:get-arg "-ping"))) ;; extract run-id (i.e. no ":"
 	   (host:port     (args:get-arg "-ping")))
-      (server:ping (or server-id host:port) do-exit: #t)))
+      (server:ping (or server-id host:port) #f do-exit: #t)))
 
 ;;======================================================================
 ;; Capture, save and manipulate environments
 ;;======================================================================
 

Index: mt.scm
==================================================================
--- mt.scm
+++ mt.scm
@@ -102,11 +102,11 @@
 		       #f))))
     (if useres
 	(let ((result (vector-ref res 1)))
 	  (debug:print 4 *default-log-port* "Using lazy value res: " result)
 	  result)
-	(let ((newres (rmt:get-prereqs-not-met run-id waitons ref-item-path mode: mode itemmaps: itemmaps)))
+	(let ((newres (rmt:get-prereqs-not-met run-id waitons ref-item-path mode itemmaps)))
 	  (hash-table-set! *pre-reqs-met-cache* key (vector (current-seconds) newres))
 	  newres))))
 
 (define (mt:get-run-stats dbstruct run-id)
 ;;  Get run stats from local access, move this ... but where?

Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -100,10 +100,15 @@
     ;; DOT INIT_RUNREMOTE -> MUTEXLOCK;
     ;; ensure we have a record for our connection for given area
     (if (not runremote)                   ;; can remove this one. should never get here.         
 	(begin
 	  (set! *runremote* (make-remote))
+          (let* ((server-info (remote-server-info *runremote*))) 
+            (if server-info
+		(begin
+			(remote-server-url-set! *runremote* (server:record->url server-info))
+			(remote-server-id-set! *runremote* (server:record->id server-info)))))  
 	  (set! runremote   *runremote*))) ;; new runremote will come from this on next iteration
     
     ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity
     ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"];
     ;; DOT SET_HOMEHOST -> MUTEXLOCK;
@@ -174,12 +179,17 @@
      ;;DOT CASE6 -> "rmt:send-receive";
      ;; on homehost and this is a write, we already have a server, but server has died
      ((and (cdr (remote-hh-dat runremote))           ;; on homehost
            (not (member cmd api:read-only-queries))  ;; this is a write
            (remote-server-url runremote)             ;; have a server
-           (not (server:ping (remote-server-url runremote))))  ;; server has died. NOTE: this is not a cheap call! Need better approach.
+           (not (server:ping (remote-server-url runremote) (remote-server-id runremote))))  ;; server has died. NOTE: this is not a cheap call! Need better approach.
       (set! *runremote* (make-remote))
+      (let* ((server-info (remote-server-info *runremote*))) 
+            (if server-info
+		(begin
+		  (remote-server-url-set! *runremote* (server:record->url server-info))
+                  (remote-server-id-set! *runremote* (server:record->id server-info)))))
       (remote-force-server-set! runremote (common:force-server?))
       (mutex-unlock! *rmt-mutex*)
       (debug:print-info 12 *default-log-port* "rmt:send-receive, case  6")
       (rmt:send-receive cmd rid params attemptnum: attemptnum))
 
@@ -202,20 +212,22 @@
      ((and (not (remote-force-server runremote))     ;; honor forced use of server, i.e. server NOT required
 	   (cdr (remote-hh-dat runremote))           ;; have homehost
            (not (remote-server-url runremote))       ;; no connection yet
 	   (not (member cmd api:read-only-queries))) ;; not a read-only query
       (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8")
-      (let ((server-url  (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
-	(if server-url
-	    (remote-server-url-set! runremote server-url) ;; the string can be consumed by the client setup if needed
+      (let ((server-info  (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
+	(if server-info
+	    (begin
+              (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed
+              (remote-server-id-set! runremote (server:record->id server-info)))  
 	    (if (common:force-server?)
 		(server:start-and-wait *toppath*)
-		(server:kind-run *toppath*))))
+		(server:kind-run *toppath*)))
       (remote-force-server-set! runremote (common:force-server?))
       (mutex-unlock! *rmt-mutex*)
       (debug:print-info 12 *default-log-port* "rmt:send-receive, case  8.1")
-      (rmt:open-qry-close-locally cmd 0 params))
+      (rmt:open-qry-close-locally cmd 0 params)))
 
      ;;DOT CASE9 [label="force server\nnot on homehost"];
      ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9};
      ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive";
      ((or (and (remote-force-server runremote)              ;; we are forcing a server and don't yet have a connection to one
@@ -658,11 +670,12 @@
     (apply append 
 	   (map (lambda (run-id)
 		  (rmt:send-receive 'test-get-paths-matching-keynames-target-new run-id (list run-id keynames target res testpatt statepatt statuspatt runname)))
 	   run-ids))))
 
-(define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path #!key (mode '(normal))(itemmaps #f))
+;; NOTE: rmt functions can NEVER have key params as they might be called as local
+(define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path mode itemmaps) ;; #!key (mode '(normal))(itemmaps #f))
   (rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps)))
 
 (define (rmt:get-count-tests-running-for-run-id run-id fastmode)
   (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id fastmode)))
 

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -833,15 +833,15 @@
 ;; tal - list of never visited tests
 ;;   prefer next hed to be from reg than tal.
 
 (define runs:nothing-left-in-queue-count 0)
 
-(define (runs:lazy-get-prereqs-not-met  testdat run-id waitons hed item-path #!optional (mode '(normal))(itemmaps #f)) ;; mode: testmode itemmaps: itemmaps)
+(define (runs:lazy-get-prereqs-not-met  testdat run-id waitons hed item-path #!key (mode '(normal))(itemmaps #f)) ;; mode: testmode itemmaps: itemmaps)
   (if (and (runs:testdat-prereqs-not-met testdat)
 	   (< (- (current-seconds) (runs:testdat-last-update testdat)) 10)) ;; only refresh for this test if it has been at least 10 seconds
       (runs:testdat-prereqs-not-met testdat)
-      (let* ((res (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: mode itemmaps: itemmaps)))
+      (let* ((res (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode itemmaps)))
 		    (if (list? res)
 			res
 			(begin
 			  (debug:print 0 *default-log-port*
 				       "ERROR: rmt:get-prereqs-not-met returned non-list!\n"
@@ -862,20 +862,11 @@
 ;;	         (reruns      '()))
 (define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record
 			   can-run-more items runname tconfig reglen test-registry test-records itemmaps testdat)
   (let* ((loop-list       (list hed tal reg reruns))
 	 (prereqs-not-met (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
-	  #;(let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)))
-			    (if (list? res)
-				res
-				(begin
-				  (debug:print 0 *default-log-port*
-					       "ERROR: rmt:get-prereqs-not-met returned non-list!\n"
-					       "  res=" res " run-id=" run-id " waitons=" waitons " hed=" hed " item-path=" item-path " testmode=" testmode " itemmaps=" itemmaps)
-				  '())))
 	 (have-itemized   (not (null? (lset-intersection eq? testmode '(itemmatch itemwait)))))
-	 ;; (prereqs-not-met (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))
 	 (fails           (runs:calc-fails prereqs-not-met))
 	 (prereq-fails    (runs:calc-prereq-fail prereqs-not-met))
 	 (non-completed   (runs:calc-not-completed prereqs-not-met))
 	 (runnables       (runs:calc-runnable prereqs-not-met))
          (unexpanded-prereqs
@@ -1117,12 +1108,10 @@
 	 (have-resources         (car run-limits-info))
 	 (num-running            (list-ref run-limits-info 1))
 	 (num-running-in-jobgroup(list-ref run-limits-info 2)) 
 	 (max-concurrent-jobs    (list-ref run-limits-info 3))
 	 (job-group-limit        (list-ref run-limits-info 4))
-	 ;; (prereqs-not-met        (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
-	 ;; (prereqs-not-met         (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))
 	 (fails                  (if (list? prereqs-not-met) ;; TODO: rename fails to failed-prereqs
 				      (runs:calc-fails prereqs-not-met)
 				      (begin
 					(debug:print-error 0 *default-log-port* "prereqs-not-met is not a list! " prereqs-not-met)
 					'())))
@@ -1518,11 +1507,10 @@
                    keyvals: keyvals
                    run-info: run-info
                    ;; newtal: newtal
                    all-tests-registry: all-tests-registry
                    ;; itemmaps: itemmaps
-                   ;; prereqs-not-met: (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)
                    ;; can-run-more-tests: (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs) ;; look at the test jobgroup and tot jobs running
                    )))
 
     ;; Initialize the test-registery hash with tests that already have a record
     ;; convert state to symbol and use that as the hash value

Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -93,10 +93,11 @@
 (define (server:mk-signature)
   (message-digest-string (md5-primitive) 
 			 (with-output-to-string
 			   (lambda ()
 			     (write (list (current-directory)
+                                          (current-process-id)
 					  (argv)))))))
 
 ;; When using zmq this would send the message back (two step process)
 ;; with spiffy or rpc this simply returns the return data to be returned
 ;; 
@@ -163,18 +164,19 @@
     (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
     (thread-join! log-rotate)
     (pop-directory)))
 
 ;; given a path to a server log return: host port startseconds
-;;
+;; any change to the number of elements returned by this function will directly affect server:record->url, server:record->id, server:kill and server:get-num-alive, which destructure the record with match-let
+
 (define (server:logf-get-start-info logf)
-  (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs
+  (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)"))) ;; SERVER STARTED: host:port AT timesecs server id
     (handle-exceptions
 	exn
       (begin
 	(print "failed to get server info from " logf ", exn=" exn)
-	(list #f #f #f)) ;; no idea what went wrong, call it a bad server
+	(list #f #f #f #f)) ;; no idea what went wrong, call it a bad server
       (with-input-from-file
 	  logf
 	(lambda ()
 	  (let loop ((inl  (read-line))
 		     (lnum 0))
@@ -181,16 +183,17 @@
 	    (if (not (eof-object? inl))
 		(let ((mlst (string-match rx inl)))
 		  (if (not mlst)
 		      (if (< lnum 500) ;; give up if more than 500 lines of server log read
 			  (loop (read-line)(+ lnum 1))
-			  (list #f #f #f))
+			  (list #f #f #f #f))
 		      (let ((dat  (cdr mlst)))
 			(list (car dat) ;; host
 			      (string->number (cadr dat)) ;; port
-			      (string->number (caddr dat))))))
-		(list #f #f #f))))))))
+			      (string->number (caddr dat))
+                              (cadr (cddr dat))))))
+		(list #f #f #f #f))))))))
 
 ;; get a list of servers with all relevant data
 ;; ( mod-time host port start-time pid )
 ;;
 (define (server:get-list areapath #!key (limit #f))
@@ -217,11 +220,11 @@
 			 (tal  (cdr server-logs))
 			 (res '()))
 		(let* ((mod-time  (handle-exceptions
 				   exn
 				   (begin
-				     (print "failed to get modification time on " hed ", exn=" exn)
+				     (debug:print 0 *default-log-port* "failed to get modification time on " hed ", exn=" exn)
 				     (current-seconds)) ;; 0
 				   (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
 		       (down-time (- (current-seconds) mod-time))
 		       (serv-dat  (if (or (< num-serv-logs 10)
 				  	  (< down-time 900)) ;; day-seconds))
@@ -230,11 +233,11 @@
 		       (serv-rec (cons mod-time serv-dat))
 		       (fmatch   (string-match fname-rx hed))
 		       (pid      (if fmatch (string->number (list-ref fmatch 2)) #f))
 		       (new-res  (if (null? serv-dat)
 				     res
-				     (cons (append serv-rec (list pid)) res))))
+				     (cons (append serv-rec (list pid)) res)))) ;; any change to the number of elements in new-res will directly affect server:record->url, server:record->id, server:kill and server:get-num-alive, which destructure the record with match-let
 		  (if (null? tal)
 		      (if (and limit
 			       (> (length new-res) limit))
 			  new-res ;; (take new-res limit)  <= need intelligent sorting before this will work
 			  new-res)
@@ -242,11 +245,11 @@
 
 (define (server:get-num-alive srvlst)
   (let ((num-alive 0))
     (for-each
      (lambda (server)
-       (match-let (((mod-time host port start-time pid)
+       (match-let (((mod-time host port start-time server-id pid)
 		    server))
 	 (let* ((uptime  (- (current-seconds) mod-time))
 		(runtime (if start-time
 			     (- mod-time start-time)
 			     0)))
@@ -305,13 +308,19 @@
 	(let* ((len (length srvrs))
 	       (idx (random len)))
 	  (list-ref srvrs idx))
 	#f)))
 
+(define (server:record->id servr)
+  (match-let (((mod-time host port start-time server-id pid)
+	       servr))
+    (if server-id
+	server-id
+	#f)))
 
 (define (server:record->url servr)
-  (match-let (((mod-time host port start-time pid)
+  (match-let (((mod-time host port start-time server-id pid)
 	       servr))
     (if (and host port)
 	(conc host ":" port)
 	#f)))
 
@@ -378,15 +387,15 @@
 
 ;; this one seems to be the general entry point
 ;;
 (define (server:start-and-wait areapath #!key (timeout 60))
   (let ((give-up-time (+ (current-seconds) timeout)))
-    (let loop ((server-url (server:check-if-running areapath))
+    (let loop ((server-info (server:check-if-running areapath))
 	       (try-num    0))
-      (if (or server-url
+      (if (or server-info
 	      (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available.
-	  server-url
+	  (server:record->url server-info)
 	  (let ((num-ok (length (server:get-best (server:get-list areapath)))))
 	    (if (and (> try-num 0)  ;; first time through simply wait a little while then try again
 		     (< num-ok 1))  ;; if there are no decent candidates for servers then try starting a new one
 		(server:kind-run areapath))
 	    (thread-sleep! 5)
@@ -403,11 +412,10 @@
 ;; no longer care if multiple servers are started by accident. older servers will drop off in time.
 ;;
 (define (server:check-if-running areapath) ;;  #!key (numservers "2"))
   (let* ((ns            (server:get-num-servers))
 	 (servers       (server:get-best (server:get-list areapath))))
-    ;; (print "servers: " servers " ns: " ns)
     (if (or (and servers
 		 (null? servers))
 	    (not servers)
 	    (and (list? servers)
 		 (< (length servers) (random ns)))) ;; somewhere between 0 and numservers
@@ -414,21 +422,22 @@
         #f
         (let loop ((hed (car servers))
                    (tal (cdr servers)))
           (let ((res (server:check-server hed)))
             (if res
-                res
+                hed
                 (if (null? tal)
                     #f
                     (loop (car tal)(cdr tal)))))))))
 
 ;; ping the given server
 ;;
 (define (server:check-server server-record)
   (let* ((server-url (server:record->url server-record))
+         (server-id (server:record->id server-record)) 
          (res        (case *transport-type*
-                       ((http)(server:ping server-url))
+                       ((http)(server:ping server-url server-id))
                        ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
                        )))
     (if res
         server-url
 	#f)))
@@ -441,11 +450,11 @@
 ;; called in megatest.scm, host-port is string hostname:port
 ;;
 ;; NOTE: This is NOT called directly from clients as not all transports support a client running
 ;;       in the same process as the server.
 ;;
-(define (server:ping host-port-in #!key (do-exit #f))
+(define (server:ping host-port-in server-id #!key (do-exit #f))
   (let ((host:port (if (not host-port-in) ;; use read-dotserver to find
 		       #f ;; (server:check-if-running *toppath*)
 		;; (if (number? host-port-in) ;; we were handed a server-id
 		;; 	   (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in)))
 		;; 	     ;; (print "srec: " srec " host-port-in: " host-port-in)
@@ -467,11 +476,11 @@
 		(debug:print 0 *default-log-port*  "ERROR: bad host:port"))
 	    (if do-exit (exit 1))
 	    #f)
 	  (let* ((iface      (car host-port))
 		 (port       (cadr host-port))
-		 (server-dat (http-transport:client-connect iface port))
+		 (server-dat (http-transport:client-connect iface port server-id))
 		 (login-res  (rmt:login-no-auto-client-setup server-dat)))
 	    (if (and (list? login-res)
 		     (car login-res))
 		(begin
 		  ;; (print "LOGIN_OK")
@@ -585,16 +594,22 @@
                                (if (not (configf:lookup *configdat* "server" "disable-db-snapshot"))
                                    (common:snapshot-file mtdbfile subdir: ".db-snapshot"))
 		               (delete-file* staging-file)
 		               (let* ((start-time (current-milliseconds))
                                       (res (system sync-cmd))
+                                      (dbbackupfile (conc mtdbfile ".backup"))
                                       (res2 
                                        (cond
-                                        ((eq? 0 res)
-		                         (delete-file* (conc mtdbfile ".backup"))
+                                        ((eq? 0 res )
+                                         (handle-exceptions
+                                            exn
+                                            #f
+                                         (if (file-exists? dbbackupfile)
+		                           (delete-file* dbbackupfile)
+                                         )
                                          (if (eq? 0 (file-size sync-log))
-                                             (delete-file sync-log))
+                                             (delete-file* sync-log))
 		                         (system (conc "/bin/mv " staging-file " " mtdbfile))
                                          
                                          (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000))
                                          (set! off-time (calculate-off-time
                                                          last-sync-seconds
@@ -605,11 +620,11 @@
                                                            (debug:print 0 *default-log-port* "WARNING: ["(common:human-time)"] server.sync-duty-cycle is invalid.  Should be a number between 0 and 1, but "duty-cycle" was specified.  Using default value: "default-duty-cycle)
                                                            default-duty-cycle))))
                                          
                                          (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec")
                                          (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec ; with duty-cycle of "duty-cycle" off time is now "off-time)
-                                         'sync-completed)
+                                         'sync-completed))
                                         (else
                                          (system (conc "/bin/cp "sync-log" "sync-log".fail"))
                                          (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail")
                                          (if (file-exists? (conc mtdbfile ".backup"))
                                              (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile)))

Index: sretrieve.scm
==================================================================
--- sretrieve.scm
+++ sretrieve.scm
@@ -637,21 +637,21 @@
                                       (conc(pathname-file target-path) "." (pathname-extension target-path))
                                       (pathname-file target-path)))
                   (curr-dir (current-directory))   
                   (start-dir (conc (current-directory) "/" last-dir-name))
                   (execlude (make-exclude-pattern (string-split restrictions ",")))
-                   (tmpfile (conc "/tmp/" (current-user-name) "/my-pipe-" (current-process-id))))
+                   (tmpfile (conc "/tmp/my-pipe-" (current-process-id))))
                     (if  (file-exists? start-dir)
                     (begin
                          (sauth:print-error (conclast-dir-name " already exist in your work dir."))
                          (sauth:print-error  "Nothing has been retrieved!!  "))
                      (begin
                    ;    (sretrieve:do-as-calling-user
                    ; (lambda ()
-                    
-                  (if (not (file-exists?  (conc "/tmp/" (current-user-name)))) 
-		      (create-directory (conc "/tmp/" (current-user-name)) #t))
+                  ; (print tmpfile) 
+                  ;(if (not (file-exists?  (conc "/tmp/" (current-user-name)))) 
+		  ;    (create-directory (conc "/tmp/" (current-user-name)) #t))
                           (change-directory parent-dir)
                             (create-fifo  tmpfile)
                                   (process-fork 
     				   (lambda()
                                        (sleep 1) 

Index: tasks.scm
==================================================================
--- tasks.scm
+++ tasks.scm
@@ -755,30 +755,31 @@
 	       (owner      (db:get-value-by-header row header "owner"))
 	       (event-time (db:get-value-by-header row header "event_time"))
 	       (comment    (db:get-value-by-header row header "comment"))
 	       (fail-count (db:get-value-by-header row header "fail_count"))
 	       (pass-count (db:get-value-by-header row header "pass_count"))
-         (db-contour (db:get-value-by-header row header "contour"))
+               (db-contour (db:get-value-by-header row header "contour"))
 	       (contour    (if (args:get-arg "-prepend-contour") 
                                  (if (and db-contour (not (equal? db-contour ""))  (string? db-contour )) 
                                            (begin 
-                                            (debug:print-info 1 *default-log-port*  "db-contour") 
+                                            (debug:print-info 10 *default-log-port*  "db-contour" db-contour) 
  						db-contour)
 					    (args:get-arg "-contour"))))
-         (run-tag (if (args:get-arg "-run-tag")
+               (run-tag (if (args:get-arg "-run-tag")
                             (args:get-arg "-run-tag")
 									""))
-         (last-update (db:get-value-by-header row header "last_update"))
+               (last-update (db:get-value-by-header row header "last_update"))
 	       (keytarg    (if (or (args:get-arg "-prepend-contour") (args:get-arg "-prefix-target"))
 	       			(conc "MT_CONTOUR/MT_AREA/" (string-intersperse (rmt:get-keys) "/")) (string-intersperse (rmt:get-keys) "/"))) ;; e.g. version/iteration/platform
+               (base-target      (rmt:get-target run-id))
 	       (target     (if (or (args:get-arg "-prepend-contour") (args:get-arg "-prefix-target")) 
-	       			(conc (or (args:get-arg "-prefix-target") (conc contour "/" (common:get-area-name) "/")) (rmt:get-target run-id)) (rmt:get-target run-id)))                 ;; e.g. v1.63/a3e1/ubuntu
+	       			(conc (or (args:get-arg "-prefix-target") (conc contour "/" (common:get-area-name) "/")) base-target) base-target))                 ;; e.g. v1.63/a3e1/ubuntu
 	       (spec-id    (pgdb:get-ttype dbh keytarg))
 	       (publish-time (if (args:get-arg "-cp-eventtime-to-publishtime")
                             event-time
                            (current-seconds))) 
-	       (new-run-id (pgdb:get-run-id dbh spec-id target run-name area-id)))
+	       (new-run-id (if (and run-name base-target) (pgdb:get-run-id dbh spec-id target run-name area-id) #f)))
          (if new-run-id
 	         (begin ;; let ((run-record (pgdb:get-run-info dbh new-run-id))
 		        (hash-table-set! runs-ht run-id new-run-id)
 		;; ensure key fields are up to date
      ;; if last_update == pgdb_last_update do not update smallest-last-update-time  
@@ -788,16 +789,16 @@
         (hash-table-set! smallest-last-update-time "smallest-time" last-update)))
 		(pgdb:refresh-run-info
 		 dbh
 		 new-run-id
 		 state status owner event-time comment fail-count pass-count area-id last-update publish-time)
-     (debug:print-info 0 *default-log-port* "Working on run-id " run-id " pgdb-id "  new-run-id )
+     (debug:print-info 4 *default-log-port* "Working on run-id " run-id " pgdb-id "  new-run-id )
      (if (not (equal? run-tag ""))
       (task:add-run-tag dbh new-run-id run-tag))
 		new-run-id) 
       
-	      (if (equal? state "deleted")
+	      (if (or (not state) (equal? state "deleted"))
           (begin 
           (debug:print-info 1 *default-log-port*  "Warning: Run with id " run-id " was created after previous sync and deleted before the sync") #f)
           (if (handle-exceptions
 		        exn
 		        (begin (print-call-chain)
@@ -860,17 +861,17 @@
       (begin  
         (if pgdb-test-id
            (begin 
                 (if  pgdb-step-id
                    (begin
-                    (debug:print-info 1 *default-log-port*  "Updating existing test-step with test-id: " test-id " and step-id " step-id " pgdb test id: " pgdb-test-id " pgdb step id " pgdb-step-id )
+                    (debug:print-info 4 *default-log-port*  "Updating existing test-step with test-id: " test-id " and step-id " step-id " pgdb test id: " pgdb-test-id " pgdb step id " pgdb-step-id )
 										(let* ((pgdb-last-update (pgdb:get-test-step-last-update dbh pgdb-step-id)))
          (if (and  (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time)))
         (hash-table-set! smallest-last-update-time "smallest-time" last-update))) 
                     (pgdb:update-test-step dbh pgdb-step-id pgdb-test-id stepname state status event_time comment logfile last-update))
                     (begin
- 		      (debug:print-info 1 *default-log-port*  "Inserting test-step with test-id: " test-id " and step-id " step-id  " pgdb test id: " pgdb-test-id)
+ 		      (debug:print-info 4 *default-log-port*  "Inserting test-step with test-id: " test-id " and step-id " step-id  " pgdb test id: " pgdb-test-id)
                      (if (or (not smallest-time) (< last-update smallest-time))
         				      (hash-table-set! smallest-last-update-time "smallest-time" last-update))
                       (pgdb:insert-test-step dbh pgdb-test-id stepname state status event_time comment logfile last-update )
                       (set! pgdb-step-id  (pgdb:get-test-step-id dbh pgdb-test-id stepname state))))
                 (hash-table-set! step-ht step-id pgdb-step-id ))
@@ -906,17 +907,17 @@
       (begin
         (if pgdb-test-id
            (begin 
                 (if  pgdb-data-id
                    (begin
-                    (debug:print-info 1 *default-log-port*  "Updating existing test-data with test-id: " test-id " and  data-id " data-id " pgdb test id: " pgdb-test-id " pgdb data id " pgdb-data-id)
+                    (debug:print-info 4 *default-log-port*  "Updating existing test-data with test-id: " test-id " and  data-id " data-id " pgdb test id: " pgdb-test-id " pgdb data id " pgdb-data-id)
                     (let* ((pgdb-last-update (pgdb:get-test-data-last-update dbh pgdb-data-id)))
          (if (and  (>  last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time)))
         (hash-table-set! smallest-last-update-time "smallest-time" last-update))) 
                     (pgdb:update-test-data dbh pgdb-data-id pgdb-test-id  category variable value expected tol units comment status type last-update))
                     (begin
- 		      (debug:print-info 1 *default-log-port*  "Inserting test-data with test-id: " test-id " and data-id " data-id " pgdb test id: " pgdb-test-id)
+ 		      (debug:print-info 4 *default-log-port*  "Inserting test-data with test-id: " test-id " and data-id " data-id " pgdb test id: " pgdb-test-id)
                        (if (handle-exceptions
 		      exn
 		      (begin (print-call-chain)
                               (print ((condition-property-accessor 'exn 'message) exn))     
 			#f)
@@ -970,21 +971,23 @@
                                  (pgdb:get-test-id dbh pgdb-run-id test-name item-path))
                                  #f)))
 	 ;; "id"           "run_id"        "testname"  "state"      "status"      "event_time"
 	 ;; "host"         "cpuload"       "diskfree"  "uname"      "rundir"      "item_path"
 	 ;; "run_duration" "final_logf"    "comment"   "shortdir"   "attemptnum"  "archived"
+         (if (or (not item-path) (string-null? item-path))
+             (debug:print-info 0 *default-log-port* "Working on Run id : " run-id "and test name : " test-name)) 
          (if pgdb-run-id
            (begin
 	   (if pgdb-test-id ;; have a record
 	     (begin ;; let ((key-name (conc run-id "/" test-name "/" item-path)))
-	       (debug:print-info 0 *default-log-port*  "Updating existing test with run-id: " run-id " and test-id: " test-id " pgdb run id: " pgdb-run-id "  pgdb-test-id "  pgdb-test-id)
+	       (debug:print-info 4 *default-log-port*  "Updating existing test with run-id: " run-id " and test-id: " test-id " pgdb run id: " pgdb-run-id "  pgdb-test-id "  pgdb-test-id)
          (let* ((pgdb-last-update (pgdb:get-test-last-update dbh pgdb-test-id)))
          (if (and  (>  last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) ;;if last-update is same as pgdb-last-update then it is safe to assume the records are identical and we can use a larger last update time.
         (hash-table-set! smallest-last-update-time "smallest-time" last-update))) 
 	       (pgdb:update-test dbh pgdb-test-id pgdb-run-id test-name item-path state status host cpuload diskfree uname run-dir log-file run-duration comment event-time archived last-update pid))
 	     (begin 
-           (debug:print-info 0 *default-log-port*  "Inserting test with run-id: " run-id " and test-id: " test-id  " pgdb run id: " pgdb-run-id)
+           (debug:print-info 4 *default-log-port*  "Inserting test with run-id: " run-id " and test-id: " test-id  " pgdb run id: " pgdb-run-id)
            (pgdb:insert-test dbh pgdb-run-id test-name item-path state status host cpuload diskfree uname run-dir log-file run-duration comment event-time archived last-update pid)
             (if (or (not smallest-time) (< last-update smallest-time))
         				(hash-table-set! smallest-last-update-time "smallest-time" last-update))
            (set! pgdb-test-id (pgdb:get-test-id dbh pgdb-run-id test-name item-path))))
            (hash-table-set! test-ht test-id pgdb-test-id))
@@ -1013,11 +1016,11 @@
 	   (pgdb:insert-area-tag  dbh   (vector-ref tag-info 0)  (vector-ref area-info 0))))))
 
 (define (tasks:sync-run-data dbh cached-info run-ids area-info smallest-last-update-time) 
   (for-each
      (lambda (run-id)
-      (debug:print-info 1 *default-log-port*   "Check if run with " run-id " needs to be synced" )
+      (debug:print-info 4 *default-log-port*   "Check if run with " run-id " needs to be synced" )
        (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time))
 run-ids))
 
 
 ;; get runs changed since last sync

Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -21,11 +21,11 @@
 ;;======================================================================
 ;; Tests
 ;;======================================================================
 
 (declare (unit tests))
-(declare (uses lock-queue))
+;;(declare (uses lock-queue))
 (declare (uses db))
 (declare (uses tdb))
 (declare (uses common))
 ;; (declare (uses dcommon)) ;; needed for the steps processing
 (declare (uses items))