Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -324,11 +324,11 @@ ;;=============================================== ((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl ;; SERVERS - ((start-server) (apply server:kind-run params)) + ((start-server) (apply tt:server-process-run params)) ((kill-server) (set! *server-run* #f)) ;; TESTS ;;((test-set-state-status-by-id) (apply mt:test-set-state-status-by-id dbstruct params)) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -182,11 +182,10 @@ ;; (define *max-cache-size* 0) (define *logged-in-clients* (make-hash-table)) (define *server-id* #f) (define *server-info* #f) ;; good candidate for easily convert to non-global (define *time-to-exit* #f) -(define *server-run* #t) (define *run-id* #f) (define *server-kind-run* (make-hash-table)) (define *home-host* #f) ;; (define *total-non-write-delay* 0) (define *heartbeat-mutex* (make-mutex)) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -70,31 +70,47 @@ (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id ;; how to make area-dat (define (rmt:set-ttdat areapath ttdat) - (if (not ttdat) + (if ttdat + ttdat (let* ((newremote (make-and-init-remote areapath))) (set! *ttdat* newremote) - ttdat))) + newremote))) ;; NB// area-dat replaced by ttdat ;; (define (rmt:send-receive cmd run-id params #!key (attemptnum 1)(ttdat #f)) (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.") - (assert ttdat "FATAL: rmt:send-receive must receive initialized area-dat") (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas (attemptnum (+ 1 attemptnum)) (readonly-mode (rmtmod:calc-ro-mode ttdat *toppath*)) (testsuite (common:get-testsuite-name)) (mtexe (common:find-local-megatest)) - (dbfname (conc (dbfile:run-id->dbnum run-id)".db"))) - (rmt:set-ttdat areapath ttdat) - (tt:handler (rmt:set-ttdat ttdat) cmd run-id params - attemptnum readonly-mode dbfname - testsuite mtexe))) + (dbfname (conc (dbfile:run-id->dbnum run-id)".db")) + (ttdat (rmt:set-ttdat areapath ttdat)) + (conn (tt:get-conn ttdat dbfname)) + (is-main (equal? dbfname "main.db")) ;; why not (not run-id) ? + (server-start-proc (if is-main + #f + (lambda () + ;; (debug:print-info 0 *default-log-port* "starting server for dbfname: "dbfname) + (rmt:start-server ;; tt:server-process-run + areapath + testsuite ;; (dbfile:testsuite-name) + mtexe + run-id))))) + ;; here we look at ttdat, if dbfname is NOT main.db we check that a conn exists for it + ;; and if there is no conn we first send a request to the main.db server to start a + ;; server for the dbfname. + #;(if (and (not is-main)(not conn)) ;; no existing connection to non-main server, call in a start up request + (begin + (server-start-proc) + (thread-sleep! 1))) + (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc))) ;; KEEP THIS HERE FOR A WHILE, WE MAY WANT TO RESURECT IT ;; (define (nfs-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe) ;; (let* ((keys (common:get-fields *configdat*)) ;; (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath tmpadj: "/dashboard"))) @@ -191,12 +207,12 @@ ;;====================================================================== (define (rmt:kill-server run-id) (rmt:send-receive 'kill-server run-id (list run-id))) -(define (rmt:start-server run-id) - (rmt:send-receive 'start-server 0 (list run-id))) +(define (rmt:start-server areapath testsuite mtexe run-id) ;; run on main.db server + (rmt:send-receive 'start-server #f (list areapath testsuite mtexe run-id))) ;;====================================================================== ;; M I S C ;;====================================================================== @@ -205,16 +221,16 @@ ;; This login does no retries under the hood - it acts a bit like a ping. ;; Deprecated for nmsg-transport. ;; ;; (define (rmt:login-no-auto-client-setup runremote) -;; (rmt:send-receive-no-auto-client-setup runremote 'login 0 (list *toppath* megatest-version (client:get-signature)))) +;; (rmt:send-receive-no-auto-client-setup runremote 'login #f (list *toppath* megatest-version (client:get-signature)))) ;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host (define (rmt:get-latest-host-load hostname) - (rmt:send-receive 'get-latest-host-load 0 (list hostname))) + (rmt:send-receive 'get-latest-host-load #f (list hostname))) (define (rmt:sdb-qry qry val run-id) ;; add caching if qry is 'getid or 'getstr (rmt:send-receive 'sdb-qry run-id (list qry val))) Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -111,10 +111,11 @@ ;; (define tt-server-timeout-param (make-parameter 600)) ;; make ttdat visible (define *server-info* #f) +(define *server-run* #t) (define (tt:make-remote areapath) (make-tt areapath: areapath)) ;; 1 ... or #f @@ -125,29 +126,31 @@ (and (or (number? run-id) (not run-id)) (equal? (dbfile:run-id->dbfname run-id) dbfname))) (tcp-buffer-size 2048) -;; (max-connections 4096) +;; (max-connections 4096) + +(define (tt:get-conn ttdat dbfname) + (hash-table-ref/default (tt-conns ttdat) dbfname #f)) ;; do all the busy work of finding and setting up conn for ;; connecting to a server ;; -(define (tt:client-connect-to-server ttdat dbfname run-id testsuite) +(define (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc) (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id) (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id) - (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f)) - (server-start-proc (lambda () - (print "dbfname: "dbfname) - (assert (equal? dbfname "main.db") ;; only main.db is started here - "FATAL: called server-start-proc for db other than main.db") - (tt:server-process-run - (tt-areapath ttdat) - testsuite ;; (dbfile:testsuite-name) - (common:find-local-megatest) - run-id)) - )) + (let* ((conn (tt:get-conn ttdat dbfname)) + (server-start-proc (or server-start-proc + (lambda () + (assert (equal? dbfname "main.db") ;; only main.db is started here + "FATAL: called server-start-proc for db other than main.db") + (tt:server-process-run + (tt-areapath ttdat) + testsuite ;; (dbfile:testsuite-name) + (common:find-local-megatest) + run-id))))) (if conn (begin (debug:print-info 2 *default-log-port* "already connected to a server") conn) ;; we are already connected to the server (let* ((sdat (tt:get-current-server-info ttdat dbfname))) @@ -176,21 +179,21 @@ (hash-table-set! (tt-conns ttdat) dbfname conn) ;;; is this ok to save before validating that the connection is good? conn) ((starting) (thread-sleep! 0.5) (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect") - (tt:client-connect-to-server ttdat dbfname run-id testsuite)) + (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)) (else (let* ((curr-secs (current-seconds))) ;; rm the (last server) would go here (if (> (- curr-secs (tt-last-serv-start ttdat)) 10) (begin (tt-last-serv-start-set! ttdat curr-secs) (server-start-proc))) ;; start server if 10 sec since last attempt (thread-sleep! 1) (debug:print-info 2 *default-log-port* "server ping result was neither running nor starting. Retrying connect") - (tt:client-connect-to-server ttdat dbfname run-id testsuite))))))) + (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) (else ;; no good server found, if haven't started server in > 5 secs, start another (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers (begin (debug:print-info 0 *default-log-port* "Starting server for "dbfname) (server-start-proc) @@ -197,11 +200,11 @@ (tt-last-serv-start-set! ttdat (current-seconds)) (thread-sleep! 3) )) (thread-sleep! 1) (debug:print-info 0 *default-log-port* "Connect to server for " dbfname) - (tt:client-connect-to-server ttdat dbfname run-id testsuite))))))) + (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) (define (tt:timed-ping host port server-id) (let* ((start-time (current-milliseconds)) (result (tt:ping host port server-id))) (cons result (- (current-milliseconds) start-time)))) @@ -233,14 +236,14 @@ ;; client side handler ;; ;;(tt:handler # get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest") ;; -(define (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe) +(define (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc) ;; connect-to-server will start a server if needed. (let* ((areapath (tt-areapath ttdat)) - (conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; looks up conn keyed by dbfname + (conn (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))) ;; looks up conn keyed by dbfname (if conn ;; have connection, call the server (let* ((res (tt:send-receive ttdat conn cmd run-id params))) ;; res is (status errmsg result meta) (match res @@ -560,10 +563,13 @@ ;; in over ten seconds we exit (thread-sleep! 0.05) ;; any real need for delay here? (let loop () (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) (ok (cond + ((not *server-run*) + (debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.") + #f) ((null? servers) #f) ;; not ok ((equal? (list-ref (car servers) 6) ;; compare the servinfofile (tt-servinf-file ttdat)) (let* ((res (if db-locked-in #t @@ -629,12 +635,13 @@ (let* ((last-update (dbr:dbstruct-last-update dbstruct)) (curr-secs (current-seconds))) (if (and (eq? (tt-state ttdat) 'running) (> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db? - (begin - (set! (file-modification-time (tt-servinf-file ttdat)) (current-seconds)) + (let* ((sinfo-file (tt-servinf-file ttdat))) + ;; (debug:print 0 *default-log-port* "INFO: touching "sinfo-file) + (set! (file-modification-time sinfo-file) (current-seconds)) ((dbr:dbstruct-sync-proc dbstruct) last-update) (dbr:dbstruct-last-update-set! dbstruct curr-secs)))) (if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param)) (begin @@ -717,17 +724,25 @@ ;; if more than one, wait one second and look again ;; future: ping oldest, if alive remove other : files ;; (define (tt:find-server areapath dbfname) (let* ((servdir (tt:get-servinfo-dir areapath)) - (sfiles (glob (conc servdir"/*:"dbfname)))) - + (sfiles (glob (conc servdir"/*:"dbfname))) + (goodfiles '())) + ;; filter the files here by looking in processes table (if we are not main.db) ;; and or look at the time stamp on the servinfo file, a running server will ;; touch the file every minute (again, this will only apply for main.db) - - sfiles)) + (for-each (lambda (fname) + (let* ((age (- (current-seconds)(file-modification-time fname)))) + (if (> age 10) ;; can't trust it if over ten seconds old + (begin + (debug:print 0 *default-log-port* "WARNING: removing stale servinfo file "fname) + (delete-file fname)) + (set! goodfiles (cons fname goodfiles))))) + sfiles) + goodfiles)) ;; given a path to a server info file return: host port startseconds server-id pid dbfname logf ;; example of what it's looking for in the log file: ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 ;; @@ -769,10 +784,17 @@ logf)) (else (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst) bad-dat))))))))) +(define *last-server-start* (make-hash-table)) + +(define (tt:too-recent-server-start dbfname) + (let* ((last-run-time (hash-table-ref/default *last-server-start* dbfname #f))) + (and last-run-time + (< (- (current-seconds) last-run-time) 5)))) + ;; Given an area path, start a server process ### NOTE ### > file 2>&1 ;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; @@ -779,55 +801,58 @@ (define (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area (assert areapath "FATAL: tt:server-process-run called without areapath defined.") (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.") (assert mtexe "FATAL: tt:server-process-run called without mtexe defined.") ;; mtest -server - -m testsuite:ext-tests -db 6.db - (let* ((dbfname (dbmod:run-id->dbfname run-id)) - (load (get-normalized-cpu-load)) - (srvrs (tt:find-server areapath dbfname)) - (trying (length srvrs)) - (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname)))) - (cond - ((> load 2.0) - (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes") - (thread-sleep! 1) - #f) - ((> nrun 100) - (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.") - (thread-sleep! 1) - #f) - ((> trying 2) - (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.") - (thread-sleep! 1) - #f) - (else - (if (not (file-exists? (conc areapath"/logs"))) - (create-directory (conc areapath"/logs") #t)) - (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log")) - (cmdln (conc - mtexe - " -startdir "areapath - " -server - ";; (or target-host "-") - " -m testsuite:"testsuite - " -db "dbfname ;; (dbmod:run-id->dbfname run-id) - " " profile-mode - (conc " >> " logfile " 2>&1 &")))) - ;; we want the remote server to start in *toppath* so push there - ;; (push-directory areapath) ;; use cd in the command line instead - (debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath) - ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) - - (system cmdln) - ;; ;; use below to go back to nbfake - nbfake does cause trouble ... - ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ... - ;; (setenv "NBFAKE_LOG" logfile) - ;; (system (conc "cd "areapath" ; nbfake " cmdln)) - ;; (unsetenv "NBFAKE_QUIET") - ;; (unsetenv "NBFAKE_LOG") - - ;;(pop-directory) - #t))))) + (let* ((dbfname (dbmod:run-id->dbfname run-id))) + (if (tt:too-recent-server-start dbfname) + #f + (let* ((load (get-normalized-cpu-load)) + (srvrs (tt:find-server areapath dbfname)) + (trying (length srvrs)) + (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname)))) + (cond + ((> load 2.0) + (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes") + (thread-sleep! 1) + #f) + ((> nrun 100) + (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.") + (thread-sleep! 1) + #f) + ((> trying 2) + (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.") + (thread-sleep! 1) + #f) + (else + (if (not (file-exists? (conc areapath"/logs"))) + (create-directory (conc areapath"/logs") #t)) + (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log")) + (cmdln (conc + mtexe + " -startdir "areapath + " -server - ";; (or target-host "-") + " -m testsuite:"testsuite + " -db "dbfname ;; (dbmod:run-id->dbfname run-id) + " " profile-mode + (conc " >> " logfile " 2>&1 &")))) + ;; we want the remote server to start in *toppath* so push there + ;; (push-directory areapath) ;; use cd in the command line instead + (debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath) + ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) + + (system cmdln) + (hash-table-set! *last-server-start* dbfname (current-seconds)) + ;; ;; use below to go back to nbfake - nbfake does cause trouble ... + ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ... + ;; (setenv "NBFAKE_LOG" logfile) + ;; (system (conc "cd "areapath" ; nbfake " cmdln)) + ;; (unsetenv "NBFAKE_QUIET") + ;; (unsetenv "NBFAKE_LOG") + + ;;(pop-directory) + #t))))))) ;;====================================================================== ;; tcp connection stuff ;;======================================================================