Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -244,11 +244,20 @@ (ttdat *server-info*) (server-state (tt-state ttdat)) (maxthreads 20) ;; make this a parameter? (status (cond ((> numthreads maxthreads) - 'busy) + (let* ((testsuite (common:get-testsuite-name)) + (mtexe (common:find-local-megatest)) + (proc (lambda () + ;; we are overloaded, try to start another server + (debug:print 0 *default-log-port* "Too many threads running, starting another server") + (tt:server-process-run *toppath* testsuite mtexe run-id)))) + (set! *server-start-requests* (cons proc *server-start-requests*))) + ;; 'busy + 'loaded ;; not ideal since the client will not backoff + ) (else 'ok))) (errmsg (case status ((busy) (conc "Server overloaded, "numthreads" threads in flight, current cmd: " cmd "\n current threads: " (api:get-threads))) ((loaded) (conc "Server loaded, "numthreads" threads in flight")) (else #f))) @@ -277,12 +286,10 @@ ;; (set! *api-process-request-count* (- *api-process-request-count* 1)) ;; (serialize payload) (api:unregister-thread (current-thread)) result))) - - (define *api-halt-writes* #f) (define (api:dispatch-request dbstruct cmd run-id params) (if (not *no-sync-db*) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -401,20 +401,18 @@ (not (equal? (common:get-last-run-version) (common:version-signature)))) ;; From 1.70 to 1.80, db's are compatible. - +;; +;; BUG: This logic is almost certainly not quite correct. +;; (define (common:api-changed?) - (let* ( - (megatest-major-version (substring (->string megatest-version) 0 4)) - (run-major-version (substring (conc (common:get-last-run-version)) 0 4)) - ) - (and (not (equal? megatest-major-version "1.80")) - (not (equal? megatest-major-version megatest-run-version))) - ) -) + (let* ((megatest-major-version (substring (->string megatest-version) 0 4)) + (run-major-version (substring (conc (common:get-last-run-version)) 0 4))) + (and (not (member megatest-major-version '("1.81" "1.80"))) + (not (equal? megatest-major-version run-major-version))))) ;;====================================================================== ;; Move me elsewhere ... ;; RADT => Why do we meed the version check here, this is called only if version misma ;; Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -208,11 +208,12 @@ ;; no conn (let* ((sdats (tt:get-server-info-sorted ttdat dbfname)) (sdat (if (null? sdats) #f - (car sdats)))) + (let ((indx (max (random (- (length sdats) 1)) 0))) + (list-ref sdats indx))))) (debug:print-info 2 *default-log-port* "found sdat " sdat) (match sdat ((host port start-time server-id pid dbfname2 servinffile) (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.") (debug:print-info 2 *default-log-port* "no conn - in match servinffile:" servinffile) @@ -332,10 +333,13 @@ (thread-sleep! dly) (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))) ((loaded) (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.") (tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn)) + + ;; this would be a good place to force reconnection and connect to a different server + result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe)) (else result))) (else ;; did not receive properly formated result (if (not res) ;; tt:send-receive telling us that communication failed @@ -382,11 +386,11 @@ (begin (thread-sleep! 1) ;; no conn yet set up, give it a rest and try again (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc))))) ;; gets server info and appends path to server file -;; sorts by age, oldest first +;; sorts by age, --oldest-- now newest first ;; ;; returns list of (host port startseconds server-id servinfofile) ;; (define (tt:get-server-info-sorted ttdat dbfname) (let* ((areapath (tt-areapath ttdat)) @@ -395,11 +399,11 @@ (sorted (sort sdats (lambda (a b) (let* ((starta (list-ref a 2)) (startb (list-ref b 2))) (if (eq? starta startb) (string>? (list-ref a 3)(list-ref b 3)) ;; if servers started at same time look at server-id - (< starta startb)))))) + (> starta startb)))))) (count 0)) (for-each (lambda (rec) (if (or (> (length sorted) 1) (common:low-noise-print 120 "server info sorted")) @@ -518,10 +522,12 @@ ;; server ;;====================================================================== (define (tt:sync-dbs ttdat) #f) + +(define *server-start-requests* '()) ;; start the listener and start responding to requests ;; ;; NOTE: organise by dbfname, not run-id so we don't need ;; to pull in more modules @@ -573,20 +579,23 @@ (keep-srv (and good-ping same-host))) (if keep-srv (loop (cdr servrs) host (cons servdat result)) - (begin - ;; (debug:print-info 0 *default-log-port* "good-ping: " good-ping " same-host: " same-host "keep-srv: " keep-srv) - (handle-exceptions - exn - (debug:print-info 0 *default-log-port* "Error removing server info file: "servinfofile", " - (condition->list exn)) - (delete-file* servinfofile)) - (loop (cdr servrs) prime-host result))))) + (let* ((modtime (file-modification-time servinfofile))) + ;; if the .servinfo hasn't been touched in five min + ;; we can be pretty sure the server is truly dead + (if (> (- (current-seconds) modtime) 360) + (handle-exceptions + exn + (debug:print-info 0 *default-log-port* + "Error removing server info file: "servinfofile", " + (condition->list exn)) + (delete-file* servinfofile)) + (loop (cdr servrs) prime-host result)))))) (else - ;; can't delete it as we don't have a filename. NOTE: Should really never get here. + ;; can't delete it as we don't have a filename. NOTE: Should never get here. (debug:print-info 0 *default-log-port* "ERROR: bad servinfo record \""servdat"\"") (loop (cdr servrs) prime-host result)) ;; drop ))))) (home-host (if (null? good-srvrs) #f @@ -650,10 +659,12 @@ (dbfile:insert-or-update-process nsdb *procinf*))) (debug:print 0 *default-log-port* "Exiting now.") (exit)))))) (define (tt:keep-running ttdat dbfname dbstruct) + + (thread-sleep! 1) ;; at this point the server is running and responding to calls, we just monitor ;; for db calls and exit if there are none. ;; if I am not in the first 3 servers, exit @@ -668,11 +679,11 @@ (tt-servinf-file ttdat))) servers)) (ok (cond ((not my-index) (debug:print 0 *default-log-port* "WARNING: Apparently I don't exist.") - 'not-yet) ;; keep trying ? + #f) ;; keep trying or give up? ((not *server-run*) (debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.") #f) ((null? servers) (debug:print 0 *default-log-port* "WARNING: no servinfo files found, this cannot be.") @@ -701,11 +712,19 @@ (set! (file-modification-time sinfo-file) (current-seconds)) ((dbr:dbstruct-sync-proc dbstruct) last-update) (dbr:dbstruct-last-update-set! dbstruct curr-secs)))) (if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param)) - (begin + ;; process any requests to start a new server due to load on this one + (let* ((requests *server-start-requests*)) + (set! *server-start-requests* '()) + (if (> (length requests) 0) + (debug:print-info 0 *default-log-port* "Processing "(length requests)" server start requests")) + (for-each (lambda (proc) + (proc) + (thread-sleep! 1)) + requests) (thread-sleep! 5) (loop))))) (tt:shutdown-server ttdat) (debug:print 0 *default-log-port* "INFO: Server timed out, exiting from tt:keep-running."))) @@ -814,10 +833,12 @@ (define (tt:too-recent-server-start dbfname) (let* ((last-run-time (hash-table-ref/default *last-server-start* dbfname #f))) (and last-run-time (< (- (current-seconds) last-run-time) 5)))) + +(define *last-server-start-request-time* 0) ;; Given an area path, start a server process ### NOTE ### > file 2>&1 ;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. @@ -826,20 +847,22 @@ (assert areapath "FATAL: tt:server-process-run called without areapath defined.") (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.") (assert mtexe "FATAL: tt:server-process-run called without mtexe defined.") ;; mtest -server - -m testsuite:ext-tests -db 6.db (let* ((dbfname (dbmod:run-id->dbfname run-id))) - (if (tt:too-recent-server-start dbfname) + (if (or (< (- (current-seconds) *last-server-start-request-time*) 5) ;; attempted start less than 5 sec ago + (tt:too-recent-server-start dbfname)) #f (let* ((load (get-normalized-cpu-load)) (srvrs (tt:find-server areapath dbfname)) (trying (length srvrs)) (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname)))) + (set! *last-server-start-request-time* (current-seconds)) (cond - ((> load 2.0) - (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes") - (thread-sleep! 1) + ((> load 4.0) + (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 4.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes") + (thread-sleep! 1) ;; I'm not convinced that a delay here is helpful. -mrw- #f) ((> nrun 100) (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.") (thread-sleep! 1) #f)