Index: api.scm
==================================================================
--- api.scm
+++ api.scm
@@ -429,8 +429,8 @@
 	  ;;          (boolean? res))
 	  ;;      res 
 	  ;;      (list "ERROR, not string, list, number or boolean" 1 cmd params res)))))
 	  (db:obj->string res transport: 'http)))
 	(begin
-	  (debug:print 0 *default-log-port*   "Server refused to process request. Sever id mismatch. recived " key " expected:  " *server-id* ".\nOther arguments recived: cmd=" cmd " params = " params) 
+	  (debug:print 0 *default-log-port*   "Server refused to process request. Server id mismatch. recived " key " expected:  " *server-id* ".\nOther arguments recived: cmd=" cmd " params = " params) 
 	  (db:obj->string (conc "Server refused to process request server-id mismatch: " key ", " *server-id*) transport: 'http)))))
 

Index: archive.scm
==================================================================
--- archive.scm
+++ archive.scm
@@ -346,11 +346,11 @@
         (rsync-exe (or (configf:lookup *configdat* "archive" "rsync") "rsync"))   
         (print-prefix      "Running: ") 
         (archive-info (archive:allocate-new-archive-block blockid-cache *toppath* tsname min-space target-patt run-patt "megatest-db"))
 	(archive-dir  (if archive-info (cdr archive-info) #f))
 	(archive-id   (if archive-info (car archive-info) -1))
-        (home-host (common:get-homehost))
+        (home-host    (server:choose-server *toppath* 'homehost))
         (archive-time (seconds->std-time-str (current-seconds)))
         (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time))
         (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db"))
         (dbfile             (conc  archive-staging-db "/megatest.db"))) 
         (create-directory archive-staging-db #t)

Index: client.scm
==================================================================
--- client.scm
+++ client.scm
@@ -89,53 +89,52 @@
 	(exit 1))
       ;;
       ;; Alternatively here, we can get the list of candidate servers and work our way
       ;; through them searching for a good one.
       ;;
-      (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath))
+      (let* ((server-dat (server:choose-server areapath 'best))
 	     (runremote  (or area-dat *runremote*)))
 	(if (not server-dat) ;; no server found
 	    (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))
-	    (let ((host  (cadr  server-dat))
-		  (port  (caddr server-dat))
-                  (server-id (caddr (cddr server-dat))))
-	      (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
-	      (if (and (not area-dat)
-		       (not *runremote*))
-                  (begin       
-		    (set! *runremote* (make-remote))
-                    (let* ((server-info (remote-server-info *runremote*))) 
-                      (if server-info
-                        (begin
-                          (remote-server-url-set! *runremote* (server:record->url server-info))
-                          (remote-server-id-set! *runremote* (server:record->id server-info)))))))
-	      (if (and host port server-id)
-		  (let* ((start-res (case *transport-type*
-				      ((http)(http-transport:client-connect host port server-id))))
-			 (ping-res  (case *transport-type* 
-				      ((http)(rmt:login-no-auto-client-setup start-res)))))
-		    (if (and start-res
-			     ping-res)
-			(let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago
-			  (if runremote
-			      (begin
-				(remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res)
-				(debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res))
-				start-res)
-			      (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))))
-			(begin    ;; login failed but have a server record, clean out the record and try again
-			  (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid.  Fixes part of Randy;s ticket 1405717332
-			  (case *transport-type* 
-			    ((http)(http-transport:close-connections)))
-                          (if *runremote* 
-			    (remote-conndat-set! runremote #f)  ;; (hash-table-delete! runremote run-id)
-                          )
-			  (thread-sleep! 1)
-			  (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))
-			  )))
-		  (begin    ;; no server registered
-		    ;; (server:kind-run areapath)
-		    (server:start-and-wait areapath)
-		    (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries)
-		    (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
-		    (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)))))))))
+	    (match server-dat
+	      ((host port start-time server-id)
+	       (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
+	       (if (and (not area-dat)
+			(not *runremote*))
+                   (begin       
+		     (set! *runremote* (make-remote))
+                     (let* ((server-info (remote-server-info *runremote*))) 
+                       (if server-info
+                           (begin
+                             (remote-server-url-set! *runremote* (server:record->url server-info))
+                             (remote-server-id-set! *runremote* (server:record->id server-info)))))))
+	       (if (and host port server-id)
+		   (let* ((start-res (http-transport:client-connect host port server-id))
+			  (ping-res  (rmt:login-no-auto-client-setup start-res)))
+		     (if (and start-res
+			      ping-res)
+			 (let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago
+			   (if runremote
+			       (begin
+				 (remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res)
+				 (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res))
+				 start-res)
+			       (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))))
+			 (begin    ;; login failed but have a server record, clean out the record and try again
+			   (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid.  Fixes part of Randy;s ticket 1405717332
+			   (case *transport-type* 
+			     ((http)(http-transport:close-connections)))
+                           (if *runremote* 
+			       (remote-conndat-set! runremote #f)  ;; (hash-table-delete! runremote run-id)
+                               )
+			   (thread-sleep! 1)
+			   (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1))
+			   )))
+		   (begin    ;; no server registered
+		     ;; (server:kind-run areapath)
+		     (server:start-and-wait areapath)
+		     (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries)
+		     (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries))))  ;; give server a little time to start up, randomize a little to avoid start storms.
+		     (client:setup-http-baby areapath remaining-tries: (- remaining-tries 1)))))
+	      (else
+	       (debug:print 0 *default-log-port* "ERROR: malformed server-dat="server-dat)))))))
 

Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -142,10 +142,11 @@
 (define *passnum*           0) ;; when running track calls to run-tests or similar
 ;; (define *alt-log-file* #f)  ;; used by -log
 ;; (define *common:denoise*    (make-hash-table)) ;; for low noise printing
 (define *default-log-port*  (current-error-port))
 (define *time-zero* (current-seconds)) ;; for the watchdog
+(define *on-exit-procs* '()) ;; add procs to this list to be executed on exit
 (define *default-area-tag* "local")
 
 ;; DATABASE
 ;; (define *dbstruct-dbs*         #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this.
 ;; db stats
@@ -315,11 +316,12 @@
 
 (define (common:logpro-exit-code->test-status exit-code)
   (status-sym->string (common:logpro-exit-code->status-sym exit-code)))
 
 (defstruct remote
-  (hh-dat            (common:get-homehost)) ;; homehost record ( addr . hhflag )
+  (hh-dat            (or (server:choose-server *toppath* 'homehost)
+			 (cons #f #f)))
   (server-url        #f) ;; (server:check-if-running *toppath*) #f))
   (server-id         #f)
   (server-info       (if *toppath* (server:check-if-running *toppath*) #f))
   (last-server-check 0)  ;; last time we checked to see if the server was alive
   (connect-time      (current-seconds))
@@ -1305,72 +1307,10 @@
 ;;======================================================================
 ;; logic for getting homehost. Returns (host . at-home)
 ;; IF *toppath* is not set, wait up to five seconds trying every two seconds
 ;; (this is to accomodate the watchdog)
 ;;
-(define (common:get-homehost #!key (trynum 5))
-  ;; called often especially at start up. use mutex to eliminate collisions
-  (mutex-lock! *homehost-mutex*)
-  (cond
-   (*home-host*
-    (mutex-unlock! *homehost-mutex*)
-    *home-host*)
-   ((not *toppath*)
-    (mutex-unlock! *homehost-mutex*)
-    (launch:setup) ;; safely mutexed now
-    (if (> trynum 0)
-	(begin
-	  (thread-sleep! 2)
-	  (common:get-homehost trynum: (- trynum 1)))
-	#f))
-   (else
-    (let* ((currhost (get-host-name))
-	   (bestadrs (server:get-best-guess-address currhost))
-	   ;; first look in config, then look in file .homehost, create it if not found
-	   (homehost (or (configf:lookup *configdat* "server" "homehost" )
-			 (handle-exceptions
-			     exn
-			     (if (> trynum 0)
-				 (let ((delay-time (* (- 5 trynum) 5)))
-				   (mutex-unlock! *homehost-mutex*)
-				   (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying "
-						delay-time " seconds and trying again, message: "  ((condition-property-accessor 'exn 'message) exn)
-						", exn=" exn)
-				   (thread-sleep! delay-time)
-				   (common:get-homehost trynum: (- trynum 1)))
-				 (begin
-				   (mutex-unlock! *homehost-mutex*)
-				   (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)
-						"] Failed to read .homehost file after trying five times. Giving up and exiting, message: "
-						((condition-property-accessor 'exn 'message) exn))
-				   (exit 1)))
-			   (let ((hhf (conc *toppath* "/.homehost")))
-			     (if (common:file-exists? hhf)
-				 (with-input-from-file hhf read-line)
-				 (if (file-write-access? *toppath*)
-				     (begin
-				       (with-output-to-file hhf
-					 (lambda ()
-					   (print bestadrs)))
-				       (begin
-					 (mutex-unlock! *homehost-mutex*)
-					 (car (common:get-homehost))))
-				     #f))))))
-	   (at-home  (or (equal? homehost currhost)
-			 (equal? homehost bestadrs))))
-      (set! *home-host* (cons homehost at-home))
-      (mutex-unlock! *homehost-mutex*)
-      *home-host*))))
-
-;;======================================================================
-;; am I on the homehost?
-;;
-(define (common:on-homehost?)
-  (let ((hh (common:get-homehost)))
-    (if hh
-	(cdr hh)
-	#f)))
 
 ;;======================================================================
 ;; do we honor the caches of the config files?
 ;;
 (define (common:use-cache?)
@@ -2049,11 +1989,11 @@
 		  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))
 
 (define (common:wait-for-homehost-load maxnormload msg)
   (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                      #f
-                     (common:get-homehost)))
+                     (server:choose-server *toppath* 'homehost)))
          (hh     (if hh-dat (car hh-dat) #f)))
     (common:wait-for-normalized-load maxnormload msg hh)))
 
 (define (common:get-num-cpus remote-host)
   (let* ((actual-host (or remote-host (get-host-name))))
@@ -3344,11 +3284,11 @@
     pktsdirs))
 
 ;;======================================================================
 ;; use-lt is use linktree "lt" link to find pkts dir
 (define (common:save-pkt pktalist-in mtconf use-lt #!key (add-only #f)) ;; add-only saves the pkt only if there is a parent already
-  (if (or add-only
+  (if (or (not add-only)
 	  (hash-table-exists? *pkts-info* 'last-parent))
       (let* ((parent   (hash-table-ref/default *pkts-info* 'last-parent #f))
 	     (pktalist (if parent
 			   (cons `(parent . ,parent)
 				 pktalist-in)
@@ -3359,10 +3299,11 @@
 	  (let ((pktsdir (or (hash-table-ref/default *pkts-info* 'pkts-dir #f)
 			     (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt))
 				    (pktsdir   (car pktsdirs))) ;; assume it is there
 			       (hash-table-set! *pkts-info* 'pkts-dir pktsdir)
 			       pktsdir))))
+	    (debug:print 0 *default-log-port* "pktsdir: "pktsdir)
             (handle-exceptions
              exn
              (debug:print-info 0 "failed to write out packet to " pktsdir ", exn=" exn) ;; don't care if this failed for now but MUST FIX - BUG!!
              (if (not (file-exists? pktsdir))
                  (create-directory pktsdir #t))

Index: dashboard.scm
==================================================================
--- dashboard.scm
+++ dashboard.scm
@@ -3809,13 +3809,13 @@
           (debug:print 0 *default-log-port* "Failed to find megatest.config, exiting") 
           (exit 1)
         )
     )
 
-    (if (not (common:on-homehost?))
+    #;(if (not (common:on-homehost?))
     (begin
-      (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (common:get-homehost))
+      (debug:print 0 *default-log-port* "WARNING: You are starting the dashboard on a machine that is not the homehost:" (server:get-homehost))
       (debug:print 0 *default-log-port* "It will be slower.")
       ))
 
 
     (if (and (common:file-exists? mtdb-path)

Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -4586,22 +4586,28 @@
 ))
 
 
 (define (std-exit-procedure)
   ;;(common:telemetry-log-close)
-  (on-exit (lambda () 0))
+  (on-exit (lambda () 0)) ;; why is this here?
   ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
   (let ((no-hurry  (if *time-to-exit* ;; hurry up
 		       #f
 		       (begin
 			 (set! *time-to-exit* #t)
 			 #t))))
     (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.")
-    (if (and no-hurry (debug:debug-mode 18))
+    (if (and no-hurry
+	     (debug:debug-mode 18))
 	(rmt:print-db-stats))
     (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds
                               (if *dbstruct-dbs* (db:close-all *dbstruct-dbs*)) ;; one second allocated
+			      (if (list? *on-exit-procs*)
+				  (for-each
+				   (lambda (proc)
+				     (proc))
+				   *on-exit-procs*))
 			      (if *task-db*    
 				  (let ((db (cdr *task-db*)))
 				    (if (sqlite3:database? db)
 					(begin
 					  (sqlite3:interrupt! db)

Index: http-transport.scm
==================================================================
--- http-transport.scm
+++ http-transport.scm
@@ -426,13 +426,29 @@
                           (set! sdat *server-info*)
                           (mutex-unlock! *heartbeat-mutex*)
                           (if (and sdat
 				   (not changed)
 				   (> (- (current-seconds) start-time) 2))
-			      (begin
+			      (let* ((servinfodir (conc *toppath*"/.servinfo"))
+				     (ipaddr      (car sdat))
+				     (port        (cadr sdat))
+				     (servinf     (conc servinfodir"/"ipaddr":"port)))
+				(if (not (file-exists? servinfodir))
+				    (create-directory servinfodir #t))
+				(with-output-to-file servinf
+				  (lambda ()
+				    (let* ((serv-id (server:mk-signature)))
+				      (set! *server-id* serv-id)
+				      (print "SERVER STARTED: "ipaddr":"port" AT "(current-seconds)" server-id: "serv-id)
+				      (print "started: "(seconds->year-week/day-time (current-seconds))))))
+				(set! *on-exit-procs* (cons
+						       (lambda ()
+							 (delete-file* servinf))
+						       *on-exit-procs*))
+				;; put data about this server into a simple flat file host.port
 				(debug:print-info 0 *default-log-port* "Received server alive signature")
-                                (common:save-pkt `((action . alive)
+                                #;(common:save-pkt `((action . alive)
                                                    (T      . server)
                                                    (pid    . ,(current-process-id))
                                                    (ipaddr . ,(car sdat))
                                                    (port   . ,(cadr sdat)))
                                                  *configdat* #t)
@@ -439,13 +455,16 @@
 				sdat)
                               (begin
 				(debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat)
                                 (sleep 4)
 				(if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
-				    (begin
+				    (let* ((ipaddr  (car sdat))
+					   (port    (cadr sdat))
+					   (servinf (conc *toppath*"/.servinfo/"ipaddr":"port)))
 				      (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server")
-                                      (common:save-pkt `((action . died)
+				      ;; (delete-file* servinf) ;; handled by on-exit, can be removed
+                                      #;(common:save-pkt `((action . died)
                                                          (T      . server)
                                                          (pid    . ,(current-process-id))
                                                          (ipaddr . ,(car sdat))
                                                          (port   . ,(cadr sdat))
                                                          (msg    . "Transport died?"))
@@ -518,11 +537,11 @@
 		(new-port  (cadr sdat)))
 	    (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info")
 	    (set! iface new-iface)
 	    (set! port  new-port)
              (if (not *server-id*)
-              (set! *server-id* (server:mk-signature)))
+		 (set! *server-id* (server:mk-signature)))
 	    (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*)
 	    (flush-output *default-log-port*)))
       
       ;; Transfer *db-last-access* to last-access to use in checking that we are still alive
       (mutex-lock! *heartbeat-mutex*)
@@ -600,14 +619,17 @@
     ;; 			  (/ *total-non-write-delay* 
     ;; 			     *number-non-write-queries*))
     ;; 		      " ms")
     
     (db:print-current-query-stats)
-    (common:save-pkt `((action . exit)
+    #;(common:save-pkt `((action . exit)
                        (T      . server)
                        (pid    . ,(current-process-id)))
-                     *configdat* #t)
+    *configdat* #t)
+
+    ;; remove .servinfo file(s) here
+    
     (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting")
     (exit)))
 
 ;; all routes though here end in exit ...
 ;;
@@ -640,14 +662,14 @@
     #;(let* ((num-alive   (server:get-num-alive (server:get-list *toppath*))))
       (if (> num-alive 3)
           (begin
             (cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up"))
             (exit))))
-  (common:save-pkt `((action . start)
-		     (T      . server)
-		     (pid    . ,(current-process-id)))
-		   *configdat* #t)
+    #;(common:save-pkt `((action . start)
+		       (T      . server)
+		       (pid    . ,(current-process-id)))
+		     *configdat* #t)
     (let* ((th2 (make-thread (lambda ()
                                (debug:print-info 0 *default-log-port* "Server run thread started")
                                (http-transport:run 
                                 (if (args:get-arg "-server")
                                     (args:get-arg "-server")

Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -1563,11 +1563,11 @@
 		       (with-output-to-string
 			 (lambda () ;; (list 'hosts     hosts)
 			   (write (list (list 'testpath  test-path)
 					;; (list 'transport (conc *transport-type*))
 					;; (list 'serverinf *server-info*)
-					(list 'homehost  (let* ((hhdat (common:get-homehost)))
+					#;(list 'homehost  (let* ((hhdat (server:get-homehost)))
 							   (if hhdat
 							       (car hhdat)
 							       #f)))
 					(list 'serverurl (if *runremote*
 							     (remote-server-url *runremote*)

Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -656,13 +656,13 @@
 (if (args:any? "-run" "-runall" "-remove-runs" "-set-state-status" "-kill-runs" "-kill-rerun")
     (debug:print 0 *default-log-port* (string-intersperse (argv) " ")))
 
 ;; some switches imply homehost. Exit here if not on homehost
 ;;
-(let ((homehost-required  (list "-cleanup-db" "-server")))
+(let ((homehost-required  (list "-cleanup-db")))
   (if (apply args:any? homehost-required)
-      (if (not (common:on-homehost?))
+      (if (not (server:choose-server *toppath* 'home?))
 	  (for-each
 	   (lambda (switch)
 	     (if (args:get-arg switch)
 		 (begin
 		   (debug:print 0 *default-log-port* "ERROR: you must be on the homehost to run with " switch
@@ -2379,11 +2379,11 @@
 (if (or (getenv "MT_RUNSCRIPT")
 	(args:get-arg "-repl")
 	(args:get-arg "-load"))
     (let* ((toppath (launch:setup))
 	   (dbstructs (if (and toppath
-                               (common:on-homehost?))
+                               (server:choose-server toppath 'home?))
                           (db:setup #t)
                           #f))) ;; make-dbr:dbstruct path: toppath local: (args:get-arg "-local")) #f)))
       (if *toppath*
 	  (cond
 	   ((getenv "MT_RUNSCRIPT")

Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -116,11 +116,12 @@
     ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"];
     ;; DOT SET_HOMEHOST -> MUTEXLOCK;
     ;; ensure we have a homehost record
     (if (not (pair? (remote-hh-dat runremote)))  ;; not on homehost
 	(thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
-	(remote-hh-dat-set! runremote (common:get-homehost)))
+	(let ((hh-data (server:choose-server areapath 'homehost)))
+	  (remote-hh-dat-set! runremote (or hh-data (cons #f #f)))))
     
     ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
     (cond
      #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds
       (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.")

Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -99,10 +99,22 @@
 			 (with-output-to-string
 			   (lambda ()
 			     (write (list (current-directory)
                                           (current-process-id)
 					  (argv)))))))
+
+(define (server:get-client-signature)
+  (if *my-client-signature* *my-client-signature*
+      (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic
+        (set! *my-client-signature* sig)
+        *my-client-signature*)))
+
+(define (server:get-server-id)
+  (if *server-id* *server-id*
+      (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic
+        (set! *server-id* sig)
+        *server-id*)))
 
 ;; When using zmq this would send the message back (two step process)
 ;; with spiffy or rpc this simply returns the return data to be returned
 ;; 
 (define (server:reply return-addr query-sig success/fail result)
@@ -121,25 +133,26 @@
 ;; if the target-host is set 
 ;; try running on that host
 ;;   incidental: rotate logs in logs/ dir.
 ;;
 (define  (server:run areapath) ;; areapath is *toppath* for a given testsuite area
-  (let* ((curr-host   (get-host-name))
+  (let* (;; (curr-host   (get-host-name))
          ;; (attempt-in-progress (server:start-attempted? areapath))
          ;; (dot-server-url (server:check-if-running areapath))
-	 (curr-ip     (server:get-best-guess-address curr-host))
-	 (curr-pid    (current-process-id))
-	 (homehost    (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
-	 (target-host (car homehost))
+	 ;; (curr-ip     (server:get-best-guess-address curr-host))
+	 ;; (curr-pid    (current-process-id))
+	 ;; (homehost    (server:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
+	 ;; (target-host (car homehost))
 	 (testsuite   (common:get-testsuite-name))
 	 (logfile     (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
 	 (profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
 			   ""))
 	 (cmdln (conc (common:get-megatest-exe)
-		      " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
-							   " -daemonize "
-							   "")
+		      " -server - ";; (or target-host "-")
+		      (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
+			  " -daemonize "
+			  "")
 		      ;; " -log " logfile
 		      " -m testsuite:" testsuite
 		      " " profile-mode
 		      )) ;; (conc " >> " logfile " 2>&1 &")))))
 	 (log-rotate  (make-thread common:rotate-logs  "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!?
@@ -148,25 +161,25 @@
     (push-directory areapath)
     (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
     (thread-start! log-rotate)
     
     ;; host.domain.tld match host?
-    (if (and target-host 
-	     ;; look at target host, is it host.domain.tld or ip address and does it 
-	     ;; match current ip or hostname
-	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
-	     (not (equal? curr-ip target-host)))
-	(begin
-	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
-	  (setenv "TARGETHOST" target-host)))
-      
+    ;; (if (and target-host 
+    ;; 	     ;; look at target host, is it host.domain.tld or ip address and does it 
+    ;; 	     ;; match current ip or hostname
+    ;; 	     (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
+    ;; 	     (not (equal? curr-ip target-host)))
+    ;; 	(begin
+    ;; 	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
+    ;; 	  (setenv "TARGETHOST" target-host)))
+    ;;   
     (setenv "TARGETHOST_LOGF" logfile)
     (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time
     (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
     (system (conc "nbfake " cmdln))
     (unsetenv "TARGETHOST_LOGF")
-    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
+    ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
     (thread-join! log-rotate)
     (pop-directory)))
 
 ;; given a path to a server log return: host port startseconds server-id
 ;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let 
@@ -271,11 +284,11 @@
 			       (> (length new-res) limit))
 			  new-res ;; (take new-res limit)  <= need intelligent sorting before this will work
 			  new-res)
 		      (loop (string-chomp (car tal)) (cdr tal) new-res)))))))))
 
-(define (server:get-num-alive srvlst)
+#;(define (server:get-num-alive srvlst)
   (let ((num-alive 0))
     (for-each
      (lambda (server)
         (handle-exceptions
           exn
@@ -326,33 +339,35 @@
 		     (list-ref b 3))))))
     (if (> (length slst) nums)
 	(take slst nums)
 	slst)))
 
-(define (server:get-first-best areapath)
-  (let ((srvrs (server:get-best (server:get-list areapath))))
-    (if (and srvrs
-	     (not (null? srvrs)))
-	(car srvrs)
-	#f)))
-
-(define (server:get-rand-best areapath)
-  (let ((srvrs (server:get-best (server:get-list areapath))))
-    (if (and (list? srvrs)
-	     (not (null? srvrs)))
-	(let* ((len (length srvrs))
-	       (idx (random len)))
-	  (list-ref srvrs idx))
-	#f)))
+;; ;; switch from server:get-list to server:get-servers-info
+;; ;;
+;; (define (server:get-first-best areapath)
+;;   (let ((srvrs (server:get-best (server:get-list areapath))))
+;;     (if (and srvrs
+;; 	     (not (null? srvrs)))
+;; 	(car srvrs)
+;; 	#f)))
+;; 
+;; (define (server:get-rand-best areapath)
+;;   (let ((srvrs (server:get-best (server:get-list areapath))))
+;;     (if (and (list? srvrs)
+;; 	     (not (null? srvrs)))
+;; 	(let* ((len (length srvrs))
+;; 	       (idx (random len)))
+;; 	  (list-ref srvrs idx))
+;; 	#f)))
 
 (define (server:record->id servr)
   (handle-exceptions
    exn
    (begin 
      (debug:print-info 0 *default-log-port*  "Unable to get server id from " servr ", exn=" exn)     
    #f)
-  (match-let (((mod-time host port start-time server-id pid)
+  (match-let (((host port start-time server-id)
 	       servr))
     (if server-id
 	server-id
 	#f))))
 
@@ -360,28 +375,22 @@
   (handle-exceptions
    exn
    (begin 
      (debug:print-info 0 *default-log-port*  "Unable to get server url from " servr ", exn=" exn)     
    #f)
-  (match-let (((mod-time host port start-time server-id pid)
+  (match-let (((host port start-time server-id)
 	       servr))
     (if (and host port)
 	(conc host ":" port)
 	#f))))
 
-(define (server:get-client-signature) ;; BB> why is this proc named "get-"?  it returns nothing -- set! has not return value.
-  (if *my-client-signature* *my-client-signature*
-      (let ((sig (server:mk-signature)))
-        (set! *my-client-signature* sig)
-        *my-client-signature*)))
-
 
 ;; if server-start-last exists, and wasn't old enough, wait <idle time> + 1, then call this function recursively until it is old enough.
 ;; if it is old enough, overwrite it and wait 0.25 seconds.
 ;; if it then has the wrong server key, wait <idle time> + 1 and call this function recursively.
 ;;
-(define (server:wait-for-server-start-last-flag areapath)
+#;(define (server:wait-for-server-start-last-flag areapath)
   (let* ((start-flag (conc areapath "/logs/server-start-last"))
 	 ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
 	 (idletime    (configf:lookup-number *configdat* "server" "idletime" default: 4))
 	 (server-key (conc (get-host-name) "-" (current-process-id))))
     (if (file-exists? start-flag)
@@ -405,20 +414,99 @@
 				   (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server"))
 		 
 		 (thread-sleep! ( + 1 idletime))
 		 (server:wait-for-server-start-last-flag areapath)))))))
 
+;; oldest server alive determines host then choose random of youngest
+;; five servers on that host
+;;
+(define (server:get-servers-info areapath)
+  (let* ((servinfodir (conc *toppath*"/.servinfo"))
+	 (allfiles    (glob (conc servinfodir"/*")))
+	 (res         (make-hash-table)))
+    (for-each
+     (lambda (f)
+       (let* ((hostport  (pathname-strip-directory f))
+	      (serverdat (server:logf-get-start-info f)))
+	 (hash-table-set! res hostport serverdat)))
+     allfiles)
+    res))
+
+;; oldest server alive determines host then choose random of youngest
+;; five servers on that host
+;;
+;; mode:
+;;   best - get best server (random of newest five)
+;;   home - get home host based on oldest server
+;;   info - print info
+(define (server:choose-server areapath #!optional (mode 'best))
+  ;; age is current-starttime
+  ;; find oldest alive
+  ;;   1. sort by age ascending and ping until good
+  ;; find alive rand from youngest
+  ;;   1. sort by age descending
+  ;;   2. take five
+  ;;   3. check alive, discard if not and repeat
+  (let* ((serversdat  (server:get-servers-info areapath))
+	 (servkeys    (hash-table-keys serversdat))
+	 (by-time-asc (if (not (null? servkeys))
+			  (sort servkeys ;; list of "host:port"
+				(lambda (a b)
+				  (>= (list-ref (hash-table-ref serversdat a) 2)
+				      (list-ref (hash-table-ref serversdat b) 2))))
+			  '())))
+    (if (not (null? by-time-asc))
+	(let* ((oldest     (last by-time-asc))
+	       (oldest-dat (hash-table-ref serversdat oldest))
+	       (host       (list-ref oldest-dat 0))
+	       (all-valid  (filter (lambda (x)
+				     (equal? host (list-ref (hash-table-ref serversdat x) 0)))
+				   by-time-asc))
+	       (best-five  (lambda ()
+			     (if (> (length all-valid) 5)
+				 (map (lambda (x)
+					(hash-table-ref serversdat x))
+				      (take all-valid 5))
+				 all-valid)))
+	       (names->dats (lambda (names)
+			      (map (lambda (x)(hash-table-ref serversdat x)) names)))
+	       (am-home?    (lambda ()
+			      (let* ((currhost (get-host-name))
+				     (bestadrs (server:get-best-guess-address currhost)))
+				(or (equal? host currhost)
+				    (equal? host bestadrs))))))
+	  (case mode
+	    ((info)
+	     (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid)
+	     (print "youngest: "(hash-table-ref serversdat (car all-valid))))
+	    ((home)     host)
+	    ((homehost) (cons host (am-home?))) ;; shut up old code
+	    ((home?)    (am-home?))
+	    ((best-five)(names->dats (best-five)))
+	    ((all-valid)(names->dats all-valid))
+	    ((best)     (let* ((best-five (best-five))
+			       (len       (length best-five)))
+			  (list-ref best-five (random len))))
+			  
+	    (else
+	     (debug:print 0 *default-log-port* "ERROR: invalid command "mode)
+	     #f)))
+	(begin
+	  (server:run areapath)
+	  (thread-sleep! 3)
+	  #f))))
 
-        
+	  
 ;; kind start up of server, wait before allowing another server for a given
 ;; area to be launched
 ;;
 (define (server:kind-run areapath)
   ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
   ;; and wait for it to be at least <server idletime> seconds old
-  (server:wait-for-server-start-last-flag areapath)
-  (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
+  ;; (server:wait-for-server-start-last-flag areapath)
+  (server:run areapath)
+  #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
       (let* ((lock-file    (conc areapath "/logs/server-start.lock")))
 	(let* ((start-flag (conc areapath "/logs/server-start-last")))
 	  (common:simple-file-lock-and-wait lock-file expire-time: 25)
 	  (debug:print-info  2 *default-log-port* "server:kind-run: touching " start-flag)
 	  (system (conc "touch " start-flag)) ;; lazy but safe
@@ -434,35 +522,33 @@
     (let loop ((server-info (server:check-if-running areapath))
 	       (try-num    0))
       (if (or server-info
 	      (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available.
 	  (server:record->url server-info)
-	  (let ((num-ok (length (server:get-best (server:get-list areapath)))))
+	  (let ((num-ok (length (server:choose-server areapath 'all-valid))))
 	    (if (and (> try-num 0)  ;; first time through simply wait a little while then try again
 		     (< num-ok 1))  ;; if there are no decent candidates for servers then try starting a new one
-		(server:kind-run areapath))
+		(server:run areapath))
 	    (thread-sleep! 5)
 	    (loop (server:check-if-running areapath)
 		  (+ try-num 1)))))))
 
-(define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG.
-
 (define (server:get-num-servers #!key (numservers 2))
   (let ((ns (string->number
 	     (or (configf:lookup *configdat* "server" "numservers") "notanumber"))))
     (or ns numservers)))
 
 ;; no longer care if multiple servers are started by accident. older servers will drop off in time.
 ;;
 (define (server:check-if-running areapath) ;;  #!key (numservers "2"))
   (let* ((ns            (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed
-	 (servers       (server:get-best (server:get-list areapath))))
+	 (servers       (server:choose-server areapath 'best-five))) ;; (server:get-best (server:get-list areapath))))
     (if (or (and servers
 		 (null? servers))
-	    (not servers)
-	    (and (list? servers)
-		 (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers
+	    (not servers))
+	    ;; (and (list? servers)
+	    ;;	 (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers
         #f
         (let loop ((hed (car servers))
                    (tal (cdr servers)))
           (let ((res (server:check-server hed)))
             (if res
@@ -473,15 +559,12 @@
 
 ;; ping the given server
 ;;
 (define (server:check-server server-record)
   (let* ((server-url (server:record->url server-record))
-         (server-id (server:record->id server-record)) 
-         (res        (case *transport-type*
-                       ((http)(server:ping server-url server-id))
-                       ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
-                       )))
+         (server-id  (server:record->id server-record)) 
+         (res        (server:ping server-url server-id)))
     (if res
         server-url
 	#f)))
 
 (define (server:kill servr)