Index: api.scm
==================================================================
--- api.scm
+++ api.scm
@@ -126,11 +126,11 @@
    (cond
     ((not (vector? dat))                    ;; it is an error to not receive a vector
      (vector #f (vector #f "remote must be called with a vector")))
     ((> *api-process-request-count* 20) ;; 20)
      (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.")
-     (vector #f (vector 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
+     (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
     (else  
      (let* ((cmd-in            (vector-ref dat 0))
             (cmd               (if (symbol? cmd-in)
 				   cmd-in
 				   (string->symbol cmd-in)))

Index: http-transport.scm
==================================================================
--- http-transport.scm
+++ http-transport.scm
@@ -225,19 +225,21 @@
        ;; process and return it.
        (let* ((send-recieve (lambda ()
 			      (mutex-lock! *http-mutex*)
 			      ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines)
 			      ;;					       ((exn http client-error) e (print e)))
-			      (set! res (vector
+			      (set! res (vector                ;;; DON'T FORGET - THIS IS THE CLIENT SIDE! NOTE: consider moving this to client.scm since we are only supporting http transport at this time.
 					 success
 					 (db:string->obj 
 					  (handle-exceptions
 					      exn
-					      (begin
+					      (let ((call-chain (get-call-chain))
+						    (msg        ((condition-property-accessor 'exn 'message) exn)))
 						(set! success #f)
 						(debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".")
-						(debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
+						(debug:print 0 *default-log-port* " message: " msg)
+						(debug:print 0 *default-log-port* " cmd: " cmd " params: " params)
 						(if runremote
 						    (remote-conndat-set! runremote #f))
 						;; Killing associated server to allow clean retry.")
 						;; (tasks:kill-server-run-id run-id)  ;; better to kill the server in the logic that called this routine?
 						(mutex-unlock! *http-mutex*)

Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -796,11 +796,11 @@
 		       (server:kill server)))))
 	     (sort servers (lambda (a b)
 			     (let ((ma (or (any->number (car a)) 9e9))
 				   (mb (or (any->number (car b)) 9e9)))
 			       (> ma mb)))))
-	    (debug:print-info 1 *default-log-port* "Done with listservers")
+	    ;; (debug:print-info 1 *default-log-port* "Done with listservers")
 	    (set! *didsomething* #t)
 	    (exit))
 	  (exit))))
       ;; must do, would have to add checks to many/all calls below
 

Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -196,14 +196,18 @@
 	(if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time
 	;; (mutex-unlock! *rmt-mutex*)
         (debug:print-info 13 *default-log-port* "rmt:send-receive, case  9. conninfo=" conninfo " dat=" dat " runremote = " runremote)
 	(mutex-unlock! *rmt-mutex*)
 	(if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end
-	    (if (and (symbol? res)
-		     (eq? res 'overloaded))
+	    (if (and (vector? res)
+		     (= (vector-length res) 2)
+		     (eq? (vector-ref res 1) 'overloaded)) ;; overload reply is a 2-vector carrying 'overloaded in its last slot (index 1); the odd shape minimises the chance of colliding with real data
 		(let ((wait-delay (+ attemptnum (* attemptnum 10))))
 		  (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.")
+		  (mutex-lock! *rmt-mutex*)
+		  (set! *runremote* #f) ;; force starting over
+		  (mutex-unlock! *rmt-mutex*)
 		  (thread-sleep! wait-delay)
 		  (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
 		res) ;; All good, return res
 	    (begin
 	      (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)

Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -253,10 +253,14 @@
 		       (thread-start! th1)
 		       (thread-join! th2)))))
       (set-signal-handler! signal/int sighand)
       (set-signal-handler! signal/term sighand))
 
+    ;; force the starting of a server
+    (debug:print 0 *default-log-port* "waiting on server...")
+    (server:start-and-wait *toppath*)
+    
     (runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process
     (set! runconf (if (file-exists? runconfigf)
 		      (setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target)
 		      (begin
 			(debug:print 0 *default-log-port* "WARNING: You do not have a run config file: " runconfigf)
@@ -1176,14 +1180,15 @@
 			   newtal:      newtal
 			   itemmaps:    itemmaps
 			   ;; prereqs-not-met: prereqs-not-met
 			   )))
 	(runs:dat-regfull-set! runsdat regfull)
-	;; every couple minutes verify the server is there for this run
-	;; (if (and (common:low-noise-print 60 "try start server"  run-id)
-	;; 	 (tasks:need-server run-id))
-	;;     (tasks:start-and-wait-for-server tdbdat run-id 10)) ;; NOTE: delay and wait is done under the hood
+
+	;; periodically (throttle value is 240, presumably seconds, i.e. about 4 minutes) verify the server is there for this run
+	(if (and (common:low-noise-print 240 "try start server"  run-id)
+		 (not (server:check-if-running *toppath*)))
+	    (server:kind-run *toppath*))
 	
 	(if (> num-running 0)
 	  (set! last-time-some-running (current-seconds)))
 
       (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))

Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -218,31 +218,35 @@
 ;;
 ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off
 ;; and servers should stick around for about two hours or so.
 ;;
 (define (server:get-best srvlst)
-  (let ((now (current-seconds)))
-    (sort
-     (filter (lambda (rec)
-	       (if (and (list? rec)
-			(> (length rec) 2))
-		   (let ((start-time (list-ref rec 3))
-			 (mod-time   (list-ref rec 0)))
-		     ;; (print "start-time: " start-time " mod-time: " mod-time)
-		     (and start-time mod-time
-			  (> (- now start-time) 0)    ;; been running at least 0 seconds
-			  (< (- now mod-time)   16)   ;; still alive - file touched in last 16 seconds
-			  (< (- now start-time) 
-			     (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600"))
-				   180)
-				(random 360))) ;; under one hour running time +/- 180
-			  ))
-		   #f))
-	     srvlst)
-     (lambda (a b)
-       (< (list-ref a 3)
-	  (list-ref b 3))))))
+  ;; Returns the records of srvlst that pass the time checks, oldest start-time first, capped at (server:get-num-servers) entries.
+  (let* ((nums (server:get-num-servers))
+	 (now  (current-seconds))
+	 (slst (sort
+		(filter (lambda (rec)
+			  (if (and (list? rec)
+				   (> (length rec) 3)) ;; index 3 is read below, so a record needs at least 4 fields
+			      (let ((start-time (list-ref rec 3))
+				    (mod-time   (list-ref rec 0)))
+				;; keep a record only when both times are present and all three checks below pass
+				(and start-time mod-time
+				     (> (- now start-time) 0)    ;; been running at least 0 seconds
+				     (< (- now mod-time)   16)   ;; still alive - file touched in last 16 seconds
+				     (< (- now start-time)
+					(+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600"))
+					      180)
+					   (random 360))))) ;; younger than configured runtime (default 3600) - 180 + (random 360)
+			      #f))
+			srvlst)
+		(lambda (a b)
+		  (< (list-ref a 3)
+		     (list-ref b 3))))))
+    (if (> (length slst) nums)
+	(take slst nums)
+	slst)))
 
 (define (server:get-first-best areapath)
   (let ((srvrs (server:get-best (server:get-list areapath))))
     (if (and srvrs
 	     (not (null? srvrs)))
@@ -305,16 +309,20 @@
 		(server:kind-run areapath))
 	    (thread-sleep! 5)
 	    (loop (server:check-if-running areapath)))))))
 
 (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG.
+
+(define (server:get-num-servers #!key (numservers 2))
+  ;; use the [server] numservers config value when it parses as a number, otherwise the numservers keyword default
+  (let ((cfg (configf:lookup *configdat* "server" "numservers")))
+    (or (and cfg (string->number cfg)) numservers)))
 
 ;; no longer care if multiple servers are started by accident. older servers will drop off in time.
 ;;
-(define (server:check-if-running areapath #!key (numservers "2"))
-  (let* ((ns            (string->number
-			 (or (configf:lookup *configdat* "server" "numservers") numservers)))
+(define (server:check-if-running areapath) ;;  #!key (numservers "2"))
+  (let* ((ns            (server:get-num-servers))
 	 (servers       (server:get-best (server:get-list areapath))))
     ;; (print "servers: " servers " ns: " ns)
     (if (or (and servers
 		 (null? servers))
 	    (not servers)