Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -129,24 +129,50 @@ (tcp-buffer-size 2048) ;; (max-connections 4096) ;; do all the busy work of finding and setting up conn for ;; connecting to a server +;; This function, `tt:client-connect-to-server`, is designed to manage connections between a client and a server within a testing framework. +;; The function takes four arguments: +;; 1. `ttdat`: The data structure holding the transport state this function uses: the connection cache (`tt-conns`), the area path (`tt-areapath`) and the time of the last server start (`tt-last-serv-start`). +;; 2. `dbfname`: The name of the database file that the client wants to connect to. +;; 3. `run-id`: An identifier for the current run of the test suite. +;; 4. `testsuite`: The test suite that is being run. +;; +;; Here's a step-by-step explanation of what the function does: +;; +;; 1. It first asserts that the `run-id` is valid for the given `dbfname` using the `tt:valid-run-id` function. If the `run-id` is not valid, it raises a fatal error. +;; 2. It prints debug information indicating that the function `tt:client-connect-to-server` has been called with the given `dbfname`. +;; 3. It attempts to retrieve an existing connection to the server from the hash table returned by `(tt-conns ttdat)`, using the `dbfname` as the key. If a connection already exists, it prints debug information and returns the existing connection. +;; 4. If no existing connection is found, it retrieves the current server information from the servinfo file, using the `tt:get-current-server-info` function. +;; 5. It uses pattern matching to destructure the server information into variables (`host`, `port`, `start-time`, `server-id`, `pid`, `dbfname2`, `servinffile`). It then asserts that the `dbfname` from the server info matches the one provided to the function. +;; 6. It constructs a connection object (`conn`) with the server information. +;; 7. 
It attempts to ping the server using `tt:timed-ping` to verify that the server is running and can be communicated with. +;; 8. Depending on the result of the ping: +;; - If the server is running (`running`), it prints debug information, saves the connection in the hash table, and returns the connection. +;; - If the server is starting (`starting`), it sleeps for 2 seconds and then recursively calls itself to retry the connection. +;; - If the server is neither running nor starting, it checks if it's been more than 10 seconds since the last server start attempt. If so, it attempts to start the server using `server-start-proc` and then sleeps for 1 second before retrying the connection. +;; 9. If no server information is found (`else` case), it checks if it's been more than 3 seconds since the last server start attempt. If so, it starts a new server using `server-start-proc`, updates the last server start time, and sleeps for 4 seconds. +;; 10. Still in the no-server-information case, whether or not a server was just started, it then sleeps for 1 second and prints debug information before recursively calling itself to retry the connection. +;; +;; The function uses recursion to keep trying to connect to the server, with various sleep intervals to prevent overwhelming the system with connection attempts or server starts. +;; It also uses a hash table to cache connections and avoid reconnecting to a server if a connection already exists. +;; The function is designed to handle different server states and ensure that a server is running and available before returning a valid connection to the caller. 
;; (define (tt:client-connect-to-server ttdat dbfname run-id testsuite) (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id) - (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id) + (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname) (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f)) (server-start-proc (lambda () (tt:server-process-run (tt-areapath ttdat) testsuite ;; (dbfile:testsuite-name) (common:find-local-megatest) run-id)))) (if conn (begin - (debug:print-info 2 *default-log-port* "already connected to a server") + (debug:print-info 2 *default-log-port* "already connected to a server for " dbfname) conn) ;; we are already connected to the server (let* ((sdat (tt:get-current-server-info ttdat dbfname))) (match sdat ((host port start-time server-id pid dbfname2 servinffile) (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.") @@ -189,11 +215,11 @@ (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers (begin (debug:print-info 0 *default-log-port* "Starting server for "dbfname) (server-start-proc) (tt-last-serv-start-set! ttdat (current-seconds)) - (thread-sleep! 3) + (thread-sleep! 4) )) (thread-sleep! 1) (debug:print-info 0 *default-log-port* "Connect to server for " dbfname) (tt:client-connect-to-server ttdat dbfname run-id testsuite))))))) @@ -537,11 +563,11 @@ (if (tt-cleanup-proc ttdat) ((tt-cleanup-proc ttdat))) (dbfile:with-no-sync-db nosyncdbpath (lambda (db) (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct))) - (debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname) + (debug:print-info 0 *default-log-port* "keep-running: removing lock for file "dbtmpname) (db:no-sync-del! db dbfname) )))))) (set! *server-info* ttdat) (let loop ((count 0)) (if (> count 240)