Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -98,11 +98,23 @@ ;; (debug:print-info 0 *default-log-port* "starting server for dbfname: "dbfname) (rmt:start-server ;; tt:server-process-run areapath testsuite ;; (dbfile:testsuite-name) mtexe - run-id))))) + run-id)))) + ;; current method does not take advantage of simply getting the list of + ;; servers from no-sync db. srv-get-proc would be a first step but is not used yet + (srv-get-proc (lambda () + (let* ((candidates (rmt:get-process-options "server" dbfname)) + (ccount (length candidates))) + (case ccount + ((0) #f) ;; need to call rmt:start-server + ((1) (car candidates)) + (else + (if (> (random 100) 50) + (car candidates) + (cadr candidates)))))))) ;; here we look at ttdat, if dbfname is NOT main.db we check that a conn exists for it ;; and if there is no conn we first send a request to the main.db server to start a ;; server for the dbfname. #;(if (and (not is-main)(not conn)) ;; no existing connection to non-main server, call in a start up request (begin Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -151,14 +151,15 @@ run-id))))) (if conn (begin (debug:print-info 2 *default-log-port* "already connected to a server") conn) ;; we are already connected to the server - (let* ((sdats (tt:get-server-info-sorted ttdat dbfname)) - (sdat (if (null? sdats) - #f - (car sdats)))) + (let* (;; (sdats (tt:get-server-info-sorted ttdat dbfname)) + ;; (sdat (if (null? sdats) + ;; #f + ;; (car sdats)))) + (sdat (tt:get-valid-server-random ttdat dbfname))) (match sdat ((host port start-time server-id pid dbfname2 servinffile) (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.") (debug:print-info 2 *default-log-port* "no conn - in match servinffile:" servinffile) (let* ((host-port (conc host":"port)) @@ -349,11 +350,30 @@ (common:low-noise-print 120 "server info sorted")) (debug:print 2 *default-log-port* "SERVER #"count": "(string-intersperse (map conc sorted) ", "))) (set! count (+ count 1))) sorted) sorted)) - + +(define (tt:get-valid-server-random ttdat dbfname) + (let* ((candidates (tt:get-server-info-sorted ttdat dbfname)) + (numc (length candidates))) + (case numc + ((0) #f) + ((1) (car candidates)) + (else + (let* ((firsthost (caar candidates)) + (valid-candidates (filter (lambda (x)(equal? (car x) firsthost)) candidates)) + (numvalid (length valid-candidates))) + (case numvalid + ((0) (debug:print 0 *default-log-port* "ERROR: code issue, filter broke?") #f) + ((1) (car valid-candidates)) + (else + ;; expand logic here to support more than two servers + (if (> (random 100) 50) + (car valid-candidates) + (cadr valid-candidates))))))))) + (define (tt:send-receive ttdat conn cmd run-id params) (let* ((host-port (tt-conn-host-port conn)) ;; (conc (tt-conn-host conn)":"(tt-conn-port conn))) (host (tt-conn-host conn)) (port (tt-conn-port conn)) (dat (list cmd run-id params #f))) ;; no meta data yet @@ -502,12 +522,16 @@ (prime-host #f) (result '())) (if (null? servrs) (reverse result) (let* ((servdat (car servrs))) + ;;; INFO: (0) 23:08:10 ERROR: bad servinfo record + ;;; "(127.0.1.1 36797 1701662813.0 88fff570fa3996d6082df8a1875e6cb1 15462 6.db /home/matt/data/megatest/ext-tests/sixtyfivek/.servinfo/127.0.1.1:36797-15462:6.db)" + (match servdat - ((host port startseconds server-id servinfofile) + ;; host port startt server-id pid dbfname servinffilr + ((host port startseconds server-id pid dbfname servinfofile) (let* ((ping-res (tt:timed-ping host port server-id)) (good-ping (match ping-res ((result . ping-time) (not result)) ;; we couldn't reach the server or it was not a megatest server (else #f))) ;; the ping failed completely?