Index: TODO
==================================================================
--- TODO
+++ TODO
@@ -16,10 +16,17 @@
 #     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
 
 TODO
 ====
 
+WW38
+. Add test_rundat to no-sync ==> correction, put in <testdir>/.meta/test-run.dat
+. Add STATE/STATUS transitions to .meta/test-run.dat or similar
+. Swizzle update-test-rundat to operate on no-sync
+. Swizzle update-run-duration, -uname-host and cpuload-diskfree to no-sync
+. On state/status change update tests table with duration
+
 WW15
 . fill newview matrix with data, filter pipeline gui elements
 . improve [script], especially indent handling
 
 WW16
@@ -35,11 +42,10 @@
 . break command line into sections; all, run control, queries, utilities etc.
 . pull in ftfplan (not integrated, just code pulled in)
 
 WW20
 . ./configure => ubuntu, sles11, sles12, rh7
-. Jenkins junit XML support
 . Add output flushing in teamcity support
 . Switch to using simple runs query everywhere
 . Add end_time to runs and add a rollup call that sets state, status and end_time
 
 Future

Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -570,11 +570,11 @@
 	       (let* ((fullname (conc "logs/" file)))
 		 (if (directory? fullname)
 		     (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
 		     (handle-exceptions
 		      exn
-		      (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn)
+		      (debug:print-info 0 *default-log-port* "failed to remove " fullname ", exn=" exn)
 		      (delete-file* fullname)))))
 	     files)
 	    (debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files."))))))
   
 ;; Force a megatest cleanup-db if version is changed and skip-version-check not specified
@@ -1315,25 +1315,32 @@
       (debug:print-info 0 *default-log-port* "using testpatt " args-testpatt " rtestpatt:" rtestpatt)
       args-testpatt))))
 
 
 
-(define (common:false-on-exception thunk #!key (message #f))
-  (handle-exceptions exn
+;; 
+(define (common:false-on-exception thunk #!key (message #f)(tries 1))
+  (handle-exceptions
+      exn
                      (begin
                        (if message
-                           (debug:print-info 0 *default-log-port* message))
-                       #f) (thunk) ))
-
-(define (common:file-exists? path-string #!key (silent #f))
-  ;; this avoids stack dumps in the case where 
-
-  ;;;; TODO: catch permission denied exceptions and emit appropriate warnings, eg:  system error while trying to access file: "/nfs/pdx/disks/icf_env_disk001/bjbarcla/gwa/issues/mtdev/randy-slow/reproduce/q...
-  (common:false-on-exception (lambda () (file-exists? path-string))
+          (debug:print-info 0 *default-log-port* message " exn=" exn))
+      (if (> tries 1)
+	  (begin
+	    (thread-sleep! 1)
+	    (common:false-on-exception thunk message: message tries: (- tries 1)))
+	  #f))
+    (thunk)))
+
+(define (common:file-exists? path-string #!key (silent #f)(tries 1))
+  ;; this avoids stack dumps in the case where NFS is slow or flakey
+  (common:false-on-exception
+   (lambda ()(file-exists? path-string))
                              message: (if (not silent)
                                           (conc "Unable to access path: " path-string)
                                           #f)
+   tries: tries
                              ))
 
 (define (common:directory-exists? path-string)
   ;;;; TODO: catch permission denied exceptions and emit appropriate warnings, eg:  system error while trying to access file: "/nfs/pdx/disks/icf_env_disk001/bjbarcla/gwa/issues/mtdev/randy-slow/reproduce/q...
   (common:false-on-exception (lambda () (directory-exists? path-string))

Index: dashboard.scm
==================================================================
--- dashboard.scm
+++ dashboard.scm
@@ -212,10 +212,12 @@
   (hash-table-set!
    (dboard:commondat-tabdats commondat)
    tabnum
    tabdat))
 
+(define *updater-running* #f) ;; move this into one of the stucts
+
 ;; gets and calls updater list based on curr-tab-num
 ;;
 (define (dboard:common-run-curr-updaters commondat #!key (tab-num #f))
   (if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat
       (let* ((tnum     (or tab-num (dboard:commondat-curr-tab-num commondat)))
@@ -240,11 +242,13 @@
 		     tnum
 		     (cons updater curr-updaters))))
 
 ;; data for each specific tab goes here
 ;;
-(defstruct dboard:tabdat 
+(defstruct dboard:tabdat
+  ;; flags
+  ((already-running #f)                  : boolean)     ;; the updater is already running. skip
   ;; runs
   ((allruns         '())                 : list)        ;; list of dboard:rundat records
   ((allruns-by-id    (make-hash-table))  : hash-table)  ;; hash of run-id -> dboard:rundat records
   ((done-runs       '())                 : list)        ;; list of runs already drawn
   ((not-done-runs   '())                 : list)        ;; list of runs not yet drawn
@@ -645,11 +649,11 @@
 ;;
 (define (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals)
   (let* ((start-time   (current-seconds))
 	 (access-mode  (dboard:tabdat-access-mode tabdat))
          (num-to-get   (string->number (or (configf:lookup *configdat* "setup" "num-tests-to-get")
-                                           "200")))
+                                           "50"))) ;; was 200, which is fine in a normal run area.
 	 (states       (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat)))
 	 (statuses     (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat)))
          (do-not-use-db-file-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab
          (do-not-use-query-timestamps   #t) ;; (configf:lookup *configdat* "setup" "do-not-use-query-timestamps")) ;; this no longer troubles runs-summary-tab
 	 (sort-info    (get-curr-sort))
@@ -716,10 +720,11 @@
     (if got-all
 	(begin
 	  (dboard:rundat-last-update-set!     run-dat (- start-time 2))
 	  (dboard:rundat-run-data-offset-set! run-dat 0))
 	(begin
+	  ;;; (thread-sleep! 0.25) ;; give the rest of the gui some time to update. <-- this did NOT help
 	  (dboard:rundat-run-data-offset-set! run-dat
 					      (+ num-to-get (dboard:rundat-run-data-offset run-dat)))))
 
     (for-each 
      (lambda (tdat)
@@ -833,11 +838,13 @@
 ;;
 ;; create a virtual table of all the tests
 ;; keypatts: ( (KEY1 "abc%def")(KEY2 "%") )
 ;;
 (define (dboard:update-rundat tabdat runnamepatt numruns testnamepatt keypatts)
-  (let* ((access-mode      (dboard:tabdat-access-mode tabdat))
+  (dboard:tabdat-already-running-set! tabdat #t)
+  (let* (;; (already-running  (dboard:tabdat-already-running tabdat))
+	 (access-mode      (dboard:tabdat-access-mode tabdat))
          (keys             (dboard:tabdat-keys tabdat)) ;; (db:dispatch-query access-mode rmt:get-keys db:get-keys)))
 	 (last-runs-update (- (dboard:tabdat-last-runs-update tabdat) 2))
          (allruns          (rmt:get-runs runnamepatt numruns (dboard:tabdat-start-run-offset tabdat) keypatts))
          ;;(allruns-tree (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f))
          (allruns-tree    (rmt:get-runs-by-patt keys "%" #f #f #f #f 0)) ;; last-runs-update));;'("id" "runname")
@@ -901,27 +908,26 @@
 		  (hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id)
 		  (hash-table-set!    (dboard:tabdat-allruns-by-id tabdat) run-id run-struct))
 	      (if (or (null? tal)
 		      (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update
 		  (begin
-		    (when (> elapsed-time 2)   
+		    #;(when (> elapsed-time 2)   
                       (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")
                       (let* ((old-val (iup:attribute *tim* "TIME"))
                              (new-val (number->string (inexact->exact (floor (* 2  (string->number old-val)))))))
                         (if (< (string->number new-val) 5000)
-                            ((debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
-                            (iup:attribute-set! *tim* "TIME" new-val))))
-
-
-                      )
+                            (begin
+			      (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
+			      (iup:attribute-set! *tim* "TIME" new-val)))))
 		    (dboard:tabdat-allruns-set! tabdat new-res)
 		    maxtests)
 		  (if (> (dboard:rundat-run-data-offset run-struct) 0)
 		      (loop run tal new-res newmaxtests) ;; not done getting data for this run
 		      (loop (car tal)(cdr tal) new-res newmaxtests)))))))
     (dboard:tabdat-filters-changed-set! tabdat #f)
-    (dboard:update-tree tabdat runs-hash header tb)))
+    (dboard:update-tree tabdat runs-hash header tb)
+    (dboard:tabdat-already-running-set! tabdat #f)))
 
 (define *collapsed* (make-hash-table))
 
 (define (toggle-hide lnum uidat) ; fulltestname)
   (let* ((btn (vector-ref (dboard:uidat-get-lftcol uidat) lnum))
@@ -2505,14 +2511,10 @@
 
         ;; insert extra widget here
         (if extra-widget
             extra-widget
             (iup:hbox)) ;; empty widget
-
-        
-
-        
         )))
 
      (let* ((status-toggles (map (lambda (status)
 				   (iup:toggle (conc status)
 					       #:fontsize 8 ;; btn-fontsz ;; "10"
@@ -3725,28 +3727,36 @@
 ;; (define (tabdat-values tabdat)
 
 ;; runs update-rundat using the various filters from the gui
 ;;
 (define (dashboard:do-update-rundat tabdat)
+  ;; this seems like a good place to check for already running and skip if so
+  ;;
+  ;; (set! *updater-running* #t)
+;;(if (dboard:tabdat-already-running tabdat)
+;;    (begin
+;;	(debug:print-info 0 *default-log-port* "Dashboard overloaded - updates will be slow, skipping update.")
+;;	(dboard:tabdat-target tabdat))
   (dboard:update-rundat
    tabdat
    (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "runname" "%")
    (dboard:tabdat-numruns tabdat)
    (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%")
    ;; generate key patterns from the target stored in tabdat
    (let* ((dbkeys (dboard:tabdat-dbkeys tabdat)))
      (let ((fres   (if (dboard:tabdat-target tabdat)
-                       (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%"))))
-                         (map (lambda (k v)(list k v)) dbkeys ptparts))
-                       (let ((res '()))
-                         (for-each (lambda (key)
-                                     (if (not (equal? key "runname"))
-                                         (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f)))
-                                           (if val (set! res (cons (list key val) res))))))
-                                   dbkeys)
-                         res))))
-       fres))))
+		       (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%"))))
+			 (map (lambda (k v)(list k v)) dbkeys ptparts))
+		       (let ((res '()))
+			 (for-each (lambda (key)
+				     (if (not (equal? key "runname"))
+					 (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f)))
+					   (if val (set! res (cons (list key val) res))))))
+				   dbkeys)
+			 res))))
+       fres)))
+  #;(set! *updater-running* #f))
 
 (define (dashboard:runs-tab-updater commondat tab-num)
   (debug:catch-and-dump 
    (lambda ()
      (let* ((tabdat (dboard:common-get-tabdat commondat tab-num: tab-num))
@@ -3801,22 +3811,25 @@
 	   (dashboard:runs-tab-updater commondat 1))
 	 tab-num: 2)
 	(iup:callback-set! *tim*
 			   "ACTION_CB"
 			   (lambda (time-obj)
-			     (let ((update-is-running #f))
-			     (mutex-lock! (dboard:commondat-update-mutex commondat))
-			     (set! update-is-running (dboard:commondat-updating commondat))
-			     (if (not update-is-running)
-			     (dboard:commondat-updating-set! commondat #t))
-			     (mutex-unlock! (dboard:commondat-update-mutex commondat))
-			     (if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
-			     (begin
-			     (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)
-			     (mutex-lock! (dboard:commondat-update-mutex commondat))
-			     (dboard:commondat-updating-set! commondat #f)
-			     (mutex-unlock! (dboard:commondat-update-mutex commondat)))
+			     (if (not *updater-running*)
+				 (begin
+				   ;; (mutex-lock! (dboard:commondat-update-mutex commondat))
+				   ;; (set! update-is-running (dboard:commondat-updating commondat))
+				   ;;(if (not update-is-running)
+				   ;;	   (dboard:commondat-updating-set! commondat #t))
+				   ;;(mutex-unlock! (dboard:commondat-update-mutex commondat))
+				   ;;(if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
+				   ;;	   (begin
+				   (set! *updater-running* #t)
+				   (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)
+				   (set! *updater-running* #f)
+				   ;; (mutex-lock! (dboard:commondat-update-mutex commondat))
+				   ;;  (dboard:commondat-updating-set! commondat #f)
+				   ;;   (mutex-unlock! (dboard:commondat-update-mutex commondat)))
 				   ))
 			     1))))
       
       (let ((th1 (make-thread (lambda ()
 				(thread-sleep! 1)

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -1776,15 +1776,33 @@
           #f
           )
         (with-input-from-file infile read-lines)
 	)))
 
-;;  select end_time-now from
-;;      (select testname,item_path,event_time+run_duration as
-;;                          end_time,strftime('%s','now') as now from tests where state in
-;;      ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));
-
+;; check duration against test-run.dat file if it exists and update the value in
+;; the db if necessary
+;;
+(define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)
+  (let* ((datf             (conc run-dir ".mt_data/test-run.dat"))
+	 (modt             (if (and (file-exists? datf)
+				    (file-read-access? datf))
+			       (file-modification-time datf)
+			       #f)) ;; (+ event-time run-duration))))
+	 (alt-run-duration (if modt
+			       (- modt event-time)
+			       #f)))
+    (if (and alt-run-duration
+	     (> alt-run-duration run-duration))
+	(begin
+	  (debug:print 0 *default-log-port* "Test " test-id " run duration mismatch. Setting to " alt-run-duration)
+	  (db:with-db
+	   dbstruct #f #f
+	   (lambda (db)
+	     (sqlite3:execute db "UPDATE tests SET run_duration=? WHERE id=?;" alt-run-duration test-id)
+	     #t)))
+	#f))) ;; #f = we did NOT adjust the time
+	      
 (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
   (let* ((incompleted '())
 	 (oldlaunched '())
 	 (toplevels   '())
           ;; The default running-deadtime is 720 seconds = 12 minutes.
@@ -1828,37 +1846,39 @@
 	 ;;                     (db:test-get-run_duration testdat)))
 	 ;;                    600) 
 	 ;; (db:delay-if-busy dbdat)
 	 (sqlite3:for-each-row 
 	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
-	    (if (and (equal? uname "n/a")
-		     (equal? item-path "")) ;; this is a toplevel test
-		;; what to do with toplevel? call rollup?
-		(begin
-		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
-		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
-		(begin
-		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
-		  (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
-				    test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
-				    " event-time="event-time" run-duration="run-duration))))
+	    (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration))
+		(if (and (equal? uname "n/a")
+			 (equal? item-path "")) ;; this is a toplevel test
+		    ;; what to do with toplevel? call rollup?
+		    (begin
+		      (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+		      (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+		    (begin
+		      (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
+		      (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
+					test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
+					" event-time="event-time" run-duration="run-duration)))))
 	  stmth1
 	  run-id running-deadtime) ;; default time 720 seconds
-       
+	    
 	 (sqlite3:for-each-row 
 	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
-	    (if (and (equal? uname "n/a")
-		     (equal? item-path "")) ;; this is a toplevel test
-		;; what to do with toplevel? call rollup?
-		(begin
-		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
-		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
-		(begin
-		  (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
-				    " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
-				    " run-duration="run-duration)
-		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
+	    (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration))
+		(if (and (equal? uname "n/a")
+			 (equal? item-path "")) ;; this is a toplevel test
+		    ;; what to do with toplevel? call rollup?
+		    (begin
+		      (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+		      (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+		    (begin
+		      (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
+					" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
+					" run-duration="run-duration)
+		      (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))))
 	  stmth2
 	  run-id remotehoststart-deadtime) ;; default time 230 seconds
 	 
 	 ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
 	 ;;
@@ -3047,11 +3067,11 @@
 ;;
 ;; 1. cache tests-match-qry
 ;; 2. compile qry and store in hash
 ;; 3. convert for-each-row to fold
 ;;
-(define (db:get-tests-for-run-state-status dbstruct run-id testpatt)
+#;(define (db:get-tests-for-run-state-status dbstruct run-id testpatt)
   (db:with-db
    dbstruct run-id #f
    (lambda (db)
      (let* ((res            '())
 	    (stmt-cache      (dbr:dbstruct-stmt-cache dbstruct))
@@ -3467,11 +3487,11 @@
      (lambda (run-id)
        (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)))
 	 (db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs)))
      run-ids)))
 
-;; Get test data using test_id, run-id is not used
+;; Get test data using test_id, run-id is not used - but it will be!
 ;; 
 (define (db:get-test-info-by-id dbstruct run-id test-id)
   (db:with-db
    dbstruct
    #f ;; run-id

Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -205,17 +205,19 @@
 			     (current-seconds) 
 			     start-seconds)))))
 	 (kill-tries 0))
     ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
     ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
-    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
+    (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t)
 
     (let loop ((minutes   (calc-minutes))
 	       (cpu-load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
 	       (disk-free (get-df (current-directory)))
                (last-sync (current-seconds)))
-      (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
+      ;; (common:telemetry-log "zombie" (conc "launch:monitor-job -
+      ;; top of loop encountered at "(current-seconds)" with
+      ;; last-sync="last-sync))
       (let* ((over-time     (> (current-seconds) (+ last-sync update-period)))
              (new-cpu-load  (let* ((load  (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
                                    (delta (abs (- load cpu-load))))
                               (if (> delta 0.1) ;; don't bother updating with small changes
                                   load
@@ -233,33 +235,28 @@
              (test-info   (rmt:get-test-info-by-id run-id test-id))
              (state       (db:test-get-state test-info))
              (status      (db:test-get-status test-info))
              (kill-reason  "no kill reason specified")
              (kill-job?    #f))
-        (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
+        #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
         (cond
          ((test-get-kill-request run-id test-id)
           (set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
           (set! kill-job? #t))
          ((and runtlim (> (- (current-seconds) start-seconds) runtlim))
           (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
           (set! kill-job? #t))
          ((equal? status "DEAD")
-          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
+          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t)
           (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
           ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
           (set! kill-job? #f)))
 
         (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
         (launch:handle-zombie-tests run-id)
-        (when do-sync
-          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
-          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
-          (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
-          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
-          (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
-        
+        (if do-sync ;; save meta data about the running of this test
+	    (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
 	(if kill-job? 
 	    (begin
               (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
 	      (mutex-lock! m)
 	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
@@ -312,11 +309,11 @@
 	      (if (hash-table-ref/default misc-flags 'keep-going #f)  ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
 		  (loop (calc-minutes)
                         (or new-cpu-load cpu-load)
                         (or new-disk-free disk-free)
                         (if do-sync (current-seconds) last-sync)))))))
-    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
+    (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional
 
 
 (define (launch:execute encoded-cmd)
   (let* ((cmdinfo    (common:read-encoded-string encoded-cmd))
 	 (tconfigreg #f))
@@ -465,11 +462,13 @@
 				  (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.")
 				  (exit))))
 		 (test-pid  (db:test-get-process_id  test-info)))
 	    (cond
              ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag.
-	     ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
+	     ((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
+		  (and (equal? (db:test-get-state test-info) "COMPLETED")                           ;; completed/abort => rerun if asked
+		       (member (db:test-get-status test-info) '("ABORT"))))
 	      (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request")
 	      ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
 
               (rmt:general-call 'set-test-start-time #f test-id)
               (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
@@ -771,11 +770,13 @@
               (if (not (null? tal))
 				  (loop (car tal) (cdr tal)))))))))))        
  
 (define (launch:is-test-alive host pid)
   (if (and host pid (not (equal? host "n/a")))
-      (let* ((cmd (conc "ssh " host " pstree -A " pid))
+      (let* ((is-local (equal? host (get-host-name)))
+	     (ssh-cmd   (if is-local " " (conc "ssh " host " ")))
+	     (cmd (conc ssh-cmd "pstree -A " pid))
 	     (output (with-input-from-pipe cmd read-lines)))
 	(debug:print 2 *default-log-port* "Running " cmd " received " output)
 	(if (eq? (length output) 0)
 	   #f
 	   #t))
@@ -1306,12 +1307,12 @@
 	  (debug:print 2 *default-log-port* " - creating run area in " test-path)
 	  (handle-exceptions
 	   exn
 	   (begin
 	     (debug:print-error 0 *default-log-port* " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn)
-				", exiting, exn=" exn)
-	     (exit 1))
+				", continuing (might cause downstream issues?), exn=" exn)
+	     #f)
 	   (create-directory test-path #t))
 	  (debug:print 2 *default-log-port* 
 		       " - creating link from: " test-path "\n"
 		       "                   to: " lnktarget)
 

Index: megatest-version.scm
==================================================================
--- megatest-version.scm
+++ megatest-version.scm
@@ -18,6 +18,6 @@
 ;; Always use two or four digit decimal
 ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..
 
 ;; (declare (unit megatest-version))
 
-(define megatest-version 1.6569)
+(define megatest-version 1.6572)

Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -22,10 +22,12 @@
 
 (declare (unit rmt))
 (declare (uses api))
 (declare (uses http-transport))
 (include "common_records.scm")
+(include "db_records.scm")
+
 ;; (declare (uses rmtmod))
 
 ;; (import rmtmod)
 
 ;;
@@ -525,15 +527,24 @@
   (rmt:general-call 'register-test run-id run-id test-name item-path))
 
 (define (rmt:get-test-id run-id testname item-path)
   (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))
 
-;; run-id is NOT used
+;; run-id is NOT used - but it will be! 
 ;;
 (define (rmt:get-test-info-by-id run-id test-id)
   (if (number? test-id)
-      (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
+      (let* ((testdat  (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)))
+	     (trundatf (conc (db:test-get-rundir testdat) "/.mt_data/test-run.dat")))
+	;; now we can update a couple fields from the filesystem
+	(if (and (db:test-get-rundir testdat)
+		 (file-exists? trundatf))
+	    (let* ((duration   (db:test-get-run_duration testdat))
+		   (event-time (db:test-get-event_time   testdat))
+		   (last-touch (file-modification-time trundatf)))
+	      (db:test-set-run_duration! testdat (max duration (- last-touch event-time)))))
+	testdat)
       (begin
 	(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
 	(print-call-chain (current-error-port))
 	#f)))
 

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -60,11 +60,15 @@
   )
 
 (defstruct runs:testdat
   hed tal reg reruns  test-record
   test-name item-path jobgroup
-  waitons testmode  newtal itemmaps prereqs-not-met)
+  waitons testmode  newtal
+  itemmaps
+  (prereqs-not-met #f)
+  (last-update 0) ;; 
+  )
   
 ;; look in the $MT_RUN_AREA_HOME/.softlocks directory for key-host-pid.softlock files
 ;;  - remove any that are over 3600 seconds old
 ;;  - if there are any that are younger than 10 seconds
 ;;      * sleep 10 seconds
@@ -828,10 +832,24 @@
 ;; reg - list of previously visited tests
 ;; tal - list of never visited tests
 ;;   prefer next hed to be from reg than tal.
 
 (define runs:nothing-left-in-queue-count 0)
+
+;; cache the result of get-prereqs-not-met and don't call it if called in past 10 seconds
+;; NOTE: This is assuming that testdat is highly specific to this test
+;;
+(define (runs:lazy-get-prereqs-not-met  testdat run-id waitons hed item-path #!key (mode '(normal))(itemmaps #f))
+  ;; mode: testmode itemmaps: itemmaps)
+  (if (and (runs:testdat-prereqs-not-met testdat)
+	   (< (- (current-seconds) (runs:testdat-last-update testdat)) 10)) ;;; only refresh for this test if
+					                                    ;;; it has been at least 10 seconds
+      (runs:testdat-prereqs-not-met testdat)  ;; return the cached result
+      (let* ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: mode itemmaps: itemmaps)))
+	(runs:testdat-prereqs-not-met-set! testdat res)
+	(runs:testdat-last-update-set! testdat (current-seconds))
+	res)))
 
 ;;======================================================================
 ;; runs:expand-items is called by runs:run-tests-queue
 ;;======================================================================
 ;;
@@ -838,22 +856,18 @@
 ;; return value of runs:expand-items is passed back to runs-tests-queue and is fed to named loop with this signature:
 ;;    (let loop ((hed         (car sorted-test-names))
 ;;	         (tal         (cdr sorted-test-names))
 ;;	         (reg         '()) ;; registered, put these at the head of tal 
 ;;	         (reruns      '()))
-(define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps)
+(define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs
+			   run-id waitons item-path testmode test-record
+			   can-run-more items runname tconfig reglen test-registry
+			   test-records itemmaps testdat)
   (let* ((loop-list       (list hed tal reg reruns))
-	 (prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)))
-			    (if (list? res)
-				res
-				(begin
-				  (debug:print 0 *default-log-port*
-					       "ERROR: rmt:get-prereqs-not-met returned non-list!\n"
-					       "  res=" res " run-id=" run-id " waitons=" waitons " hed=" hed " item-path=" item-path " testmode=" testmode " itemmaps=" itemmaps)
-				  '()))))
-         (have-itemized (not (null? (lset-intersection eq? testmode '(itemmatch itemwait)))))
-	 ;; (prereqs-not-met (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))
+	 (prereqs-not-met (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path
+							 mode: testmode itemmaps: itemmaps))
+	 (have-itemized   (not (null? (lset-intersection eq? testmode '(itemmatch itemwait)))))
 	 (fails           (runs:calc-fails prereqs-not-met))
 	 (prereq-fails    (runs:calc-prereq-fail prereqs-not-met))
 	 (non-completed   (runs:calc-not-completed prereqs-not-met))
 	 (runnables       (runs:calc-runnable prereqs-not-met))
          (unexpanded-prereqs
@@ -1715,11 +1729,13 @@
 				  (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)
 						     (- remtries 1)))))))
 		       )))))
 
 	  ;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed
-	  (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
+	  (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path
+					 mode: testmode
+					 itemmaps: itemmaps)
 
 	  ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed
 	  (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))
 
 	  (let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running
@@ -1784,18 +1800,19 @@
 	 ;;    - but only do that if resources exist to kick off the job
 	 ;; EXPAND ITEMS
 	 ((or (procedure? items)(eq? items 'have-procedure))
           (debug:print-info 4 *default-log-port* "cond branch - "  "rtq-4")
 	  (let ((can-run-more    #f)) ;; (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)))
-	    (if (not can-run-more) #;(and (list? can-run-more)
+	    (if (not can-run-more) #;(and (list? can-run-more) ;; IDEA, this mechanism may have had some value, make it configurable to test pros/cons TODO
 		(car can-run-more))
-		(let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) ;; itemized test expanded here
+		(let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs
+						    run-id waitons item-path testmode test-record
+						    can-run-more items runname tconfig reglen test-registry
+						    test-records itemmaps testdat)))
 		  (if loop-list
 		      (apply loop loop-list)
-                      (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)
-                      )
-                  )
+                      (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)))
 		;; if can't run more just loop with next possible test
 		(loop (car newtal)(cdr newtal) reg reruns))))
          
 	 ;; this case should not happen, added to help catch any bugs
 	 ((and (list? items) itemdat)

Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -1944,52 +1944,53 @@
 	 tdb
 	 "SELECT count(id) FROM test_rundat;")
 	res))
   0)
 
-(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)
-  (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))
-  (if (and cpuload diskfree)
-      (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
-  (if minutes 
-      (rmt:general-call 'update-run-duration run-id minutes test-id))
-  (if (and uname hostname)
-      (rmt:general-call 'update-uname-host run-id uname hostname test-id)))
+;; 
+(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname #!key (update-db #f)(tmpfree #f))
+  (if (get-environment-variable "MT_TEST_RUN_DIR")
+      (let* ((dest-dir (conc (get-environment-variable "MT_TEST_RUN_DIR") "/.mt_data"))
+	     (or-dash  (lambda (instr)
+			 (cond
+			  ((not instr) "") ;; #f -> blank, indicates value unchanged since last measurement taken
+			  ((string? instr)(if (string-search " " instr) (conc "\"" instr "\"") instr))
+			  (else instr))))
+	     (file-new (not (directory-exists? dest-dir))))
+	(if file-new (create-directory dest-dir #t))
+	(let* ((outp (open-output-file (conc dest-dir "/test-run.dat") #:append)))
+	  (with-output-to-port outp
+	    (lambda ()
+	      (if file-new
+		  (print "epoch_time,run_id,test_id,cpuload,diskfree,tmpfree,run_minutes,hostname,uname"))
+	      (print (current-seconds) "," (or-dash run-id)   "," (or-dash test-id)  ","
+		     (or-dash cpuload) "," (or-dash diskfree) "," (or-dash tmpfree)  ","
+		     (or-dash minutes) "," (or-dash hostname) ","
+		     (or-dash uname)))) ;; put uname last as it has spaces in it
+	  (close-output-port outp)))
+      (begin
+	(rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))))
+  (if update-db
+      (begin
+	(if (and cpuload diskfree)
+	    (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
+	(if minutes 
+	    (rmt:general-call 'update-run-duration run-id minutes test-id))
+	(if (and uname hostname)
+	    (rmt:general-call 'update-uname-host run-id uname hostname test-id)))))
   
 ;; This one is for running with no db access (i.e. via rmt: internally)
-(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries)
+(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries #!key (update-db #f))
 ;; (define (tests:set-full-meta-info test-id run-id minutes work-area)
 ;;  (let ((remtries 10))
   (let* ((cpuload  (get-cpu-load))
 	 (diskfree (get-df (current-directory)))
+	 (tmpfree  (get-df "/tmp"))
 	 (uname    (get-uname "-srvpio"))
 	 (hostname (get-host-name)))
-    (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)))
-    
-;; (define (tests:set-partial-meta-info test-id run-id minutes work-area)
-#;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries)
-  (let* ((cpuload  (get-cpu-load))
-	 (diskfree (get-df (current-directory)))
-	 (remtries 10))
-    (handle-exceptions
-     exn
-     (if (> remtries 0)
-	 (begin
-	   (print-call-chain (current-error-port))
-	   (debug:print-info 0 *default-log-port* "WARNING: failed to set meta info. Will try " remtries " more times")
-	   (set! remtries (- remtries 1))
-	   (thread-sleep! 10)
-	   (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1)))
-	 (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn)))
-	   (debug:print-error 0 *default-log-port* "tried for over a minute to update meta info and failed. Giving up")
-	   (debug:print 0 *default-log-port* "EXCEPTION: database probably overloaded or unreadable.")
-	   (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
-	   (debug:print 5 *default-log-port* "exn=" (condition->list exn))
-	   (debug:print 0 *default-log-port* " status:  " ((condition-property-accessor 'sqlite3 'status) exn))
-	   (print-call-chain (current-error-port))))
-     (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes)
-  )))
+    (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname update-db: update-db tmpfree: tmpfree)))
+    
 	 
 ;;======================================================================
 ;; A R C H I V I N G
 ;;======================================================================