Index: commonmod.scm ================================================================== --- commonmod.scm +++ commonmod.scm @@ -3649,11 +3649,11 @@ (define (server:expiration-timeout) (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below (* 3600 (string->number tmo)) - 600))) ;; default is ten minutes + 60))) ;; default is one minute (define (runs:get-mt-env-alist run-id runname target testname itempath) ;;(bb-check-path msg: "runs:set-megatest-env-vars entry") `(("MT_TEST_NAME" . ,testname) Index: docs/manual/server.dot ================================================================== --- docs/manual/server.dot +++ docs/manual/server.dot @@ -17,61 +17,65 @@ digraph G { subgraph cluster_1 { node [style=filled,shape=box]; - check_available_queue -> remove_entries_over_10s_old; - remove_entries_over_10s_old -> set_available [label="num_avail < 3"]; - remove_entries_over_10s_old -> exit [label="num_avail > 2"]; - - set_available -> delay_2s; - delay_2s -> check_place_in_queue; - - check_place_in_queue -> "http:transport-launch" [label="at head"]; - check_place_in_queue -> exit [label="not at head"]; - - "client:login" -> "server:shutdown" [label="login failed"]; - "server:shutdown" -> exit; - - subgraph cluster_2 { - "http:transport-launch" -> "http:transport-run"; - "http:transport-launch" -> "http:transport-keep-running"; - - "http:transport-keep-running" -> "tests running?"; - "tests running?" -> "client:login" [label=yes]; - "tests running?" -> "server:shutdown" [label=no]; - "client:login" -> delay_5s [label="login ok"]; - delay_5s -> "http:transport-keep-running"; - } - - // start_server -> "server_running?"; - // "server_running?" -> set_available [label="no"]; - // "server_running?" -> delay_2s [label="yes"]; - // delay_2s -> "still_running?"; - // "still_running?" -> ping_server [label=yes]; - // "still_running?" -> set_available [label=no]; - // ping_server -> exit [label=alive]; - // ping_server -> remove_server_record [label=dead]; - // remove_server_record -> set_available; - // set_available -> avail_delay [label="delay 3s"]; - // avail_delay -> "first_in_queue?"; - // - // "first_in_queue?" -> set_running [label=yes]; - // set_running -> get_next_port -> handle_requests; - // "first_in_queue?" -> "dead_entry_in_queue?" [label=no]; - // "dead_entry_in_queue?" -> "server_running?" [label=no]; - // "dead_entry_in_queue?" -> "remove_dead_entries" [label=yes]; - // remove_dead_entries -> "server_running?"; - // - // handle_requests -> start_shutdown [label="no traffic\nno running tests"]; - // handle_requests -> shutdown_request; - // start_shutdown -> shutdown_delay; - // shutdown_request -> shutdown_delay; - // shutdown_delay -> exit; - - label = "server:launch"; - color=brown; + rmt:send-receive -> "init-*remote* if needed" -> rmt:general-open-connection -> + rmt:send-receive-real; + + +// check_available_queue -> remove_entries_over_10s_old; +// remove_entries_over_10s_old -> set_available [label="num_avail < 3"]; +// remove_entries_over_10s_old -> exit [label="num_avail > 2"]; +// +// set_available -> delay_2s; +// delay_2s -> check_place_in_queue; +// +// check_place_in_queue -> "http:transport-launch" [label="at head"]; +// check_place_in_queue -> exit [label="not at head"]; +// +// "client:login" -> "server:shutdown" [label="login failed"]; +// "server:shutdown" -> exit; +// +// subgraph cluster_2 { +// "http:transport-launch" -> "http:transport-run"; +// "http:transport-launch" -> "http:transport-keep-running"; +// +// "http:transport-keep-running" -> "tests running?"; +// "tests running?" -> "client:login" [label=yes]; +// "tests running?" -> "server:shutdown" [label=no]; +// "client:login" -> delay_5s [label="login ok"]; +// delay_5s -> "http:transport-keep-running"; +// } +// +// // start_server -> "server_running?"; +// // "server_running?" -> set_available [label="no"]; +// // "server_running?" -> delay_2s [label="yes"]; +// // delay_2s -> "still_running?"; +// // "still_running?" -> ping_server [label=yes]; +// // "still_running?" -> set_available [label=no]; +// // ping_server -> exit [label=alive]; +// // ping_server -> remove_server_record [label=dead]; +// // remove_server_record -> set_available; +// // set_available -> avail_delay [label="delay 3s"]; +// // avail_delay -> "first_in_queue?"; +// // +// // "first_in_queue?" -> set_running [label=yes]; +// // set_running -> get_next_port -> handle_requests; +// // "first_in_queue?" -> "dead_entry_in_queue?" [label=no]; +// // "dead_entry_in_queue?" -> "server_running?" [label=no]; +// // "dead_entry_in_queue?" -> "remove_dead_entries" [label=yes]; +// // remove_dead_entries -> "server_running?"; +// // +// // handle_requests -> start_shutdown [label="no traffic\nno running tests"]; +// // handle_requests -> shutdown_request; +// // start_shutdown -> shutdown_delay; +// // shutdown_request -> shutdown_delay; +// // shutdown_delay -> exit; +// +// label = "server:launch"; +// color=brown; } // client_start_server -> start_server; // handle_requests -> read_write; // read_write -> handle_requests; Index: rmtmod.scm ================================================================== --- rmtmod.scm +++ rmtmod.scm @@ -245,13 +245,15 @@ (start-main-srv)))) ;; NB// remote is a rmt:remote struct ;; (define (rmt:general-open-connection remote apath dbname #!key (num-tries 5)) - (let ((mdbname (db:run-id->dbname #f))) + (let* ((mdbname (db:run-id->dbname #f)) + (mconn (rmt:get-conn remote apath mdbname))) (cond - ((not (rmt:get-conn remote apath mdbname)) ;; no channel open to main? + ((or (not mconn) ;; no channel open to main? + (< (rmt:conn-expires mconn)(+ (current-seconds) 2))) ;; restablish connection if less than 2 seconds on the lease (rmt:open-main-connection remote apath) (thread-sleep! 2) (rmt:general-open-connection remote apath mdbname)) ((not (rmt:get-conn remote apath dbname)) ;; no channel open to dbname? (let* ((res (rmt:send-receive-real remote apath mdbname 'get-server `(,apath ,dbname)))) @@ -295,35 +297,19 @@ ;; sometime in the future ;; (define (rmt:send-receive-real remote apath dbname cmd params) (let* ((conn (rmt:get-conn remote apath dbname))) (assert conn "FATAL: rmt:send-receive-real called without the needed channels opened") - (pp (rmt:conn->alist conn)) - ;; (rmt:send-receive-setup conn) -;; (let-values (((i o) (tcp-connect (rmt:conn-ipaddr conn) -;; (rmt:conn-port conn)))) (let* ((key #f) (host (rmt:conn-ipaddr conn)) (port (rmt:conn-port conn)) (payload `((cmd . ,cmd) (key . ,(rmt:conn-srvkey conn)) (params . ,params))) (res (open-send-receive-nn (conc host":"port) (sexpr->string payload)))) - ;; begin - ;; (write payload o) ;; (rmt:conn-outport conn)) - ;; (with-input-from-port - ;; i ;; (rmt:conn-inport conn) - ;; read)))) - ;; (close-input-port i) - ;; (close-output-port o) (string->sexpr res)))) -;; (if (string? res) -;; (string->sexpr res) -;; res)))) - - ;; db is at apath/.db/dbname, rid is an intermediary solution and will be removed ;; sometime in the future. ;; ;; Purpose - call the main.db server and request a server be started @@ -338,11 +324,11 @@ ;; read-string))) ;; (string->sexpr res)))) (define (rmt:print-db-stats) (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f" - (debug:print 18 *default-log-port* "DB Stats\n========") + (debug:print 18 *default-log-port* "DB Stats, "(seconds->year-week/day-time (current-seconds))"\n=====================") (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg")) (for-each (lambda (cmd) (let ((cmd-dat (hash-table-ref *db-stats* cmd))) (debug:print 18 *default-log-port* (format #f fmtstr cmd (vector-ref cmd-dat 0) (vector-ref cmd-dat 1) (/ (vector-ref cmd-dat 1)(vector-ref cmd-dat 0)))))) (sort (hash-table-keys *db-stats*) @@ -1464,36 +1450,35 @@ (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.") (if (and no-hurry (debug:debug-mode 18)) (rmt:print-db-stats)) (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds (if *server-info* - (let ((pkt-file (conc (get-pkts-dir *toppath*) - "/" (servdat-uuid *server-info*) - ".pkt")) - (dbfile (servdat-dbfile *server-info*))) + (let ((dbfile (servdat-dbfile *server-info*))) (if dbfile (begin ;; do a final sync here - (if (string-match ".*/main.db$" dbfile) - (begin - (debug:print-info 0 *default-log-port* "removing pkt "pkt-file) - (delete-file* pkt-file) - (debug:print-info 0 *default-log-port* "Releasing lock for "dbfile) - (db:with-lock-db (servdat-dbfile *server-info*) - (lambda (dbh dbfile) - (db:release-lock dbh dbfile)))) - (let* ((sdat *server-info*)) ;; we have a run-id server - (rmt:send-receive-real *rmt:remote* *toppath* - (db:run-id->dbname #f) - 'deregister-server - `(,(servdat-uuid sdat) - ,(current-process-id) - ,(servdat-host sdat) ;; iface - ,(servdat-port sdat))))))))) - (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated + (if (string-match ".*/main.db$" dbfile) + (let ((pkt-file (conc (get-pkts-dir *toppath*) + "/" (servdat-uuid *server-info*) + ".pkt"))) + (debug:print-info 0 *default-log-port* "removing pkt "pkt-file) + (delete-file* pkt-file) + (debug:print-info 0 *default-log-port* "Releasing lock for "dbfile) + (db:with-lock-db (servdat-dbfile *server-info*) + (lambda (dbh dbfile) + (db:release-lock dbh dbfile)))) + (let* ((sdat *server-info*)) ;; we have a run-id server + (rmt:send-receive-real *rmt:remote* *toppath* + (db:run-id->dbname #f) + 'deregister-server + `(,(servdat-uuid sdat) + ,(current-process-id) + ,(servdat-host sdat) ;; iface + ,(servdat-port sdat))))))))) + ;; (if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated (if (bdat-task-db *bdat*) ;; TODO: Check that this is correct for task db (let ((db (cdr (bdat-task-db *bdat*)))) (if (sqlite3:database? db) (begin (sqlite3:interrupt! db) @@ -1507,11 +1492,11 @@ (debug:print 4 *default-log-port* "Attempting clean exit. Please be patient and wait a few seconds...") (if no-hurry (begin (thread-sleep! 5)) ;; give the clean up few seconds to do it's stuff (begin - (thread-sleep! 2))) + (thread-sleep! 2))) (debug:print 4 *default-log-port* " ... done") ) "clean exit"))) (thread-start! th1) (thread-start! th2) @@ -1962,18 +1947,19 @@ (equal? sdat last-sdat) sdat)))))))) (define (rmt:register-server remote apath iface port server-key dbname) (rmt:open-main-connection remote apath) ;; we need a channel to main.db - (rmt:send-receive-real remote apath ;; params: host port servkey pid ipaddr dbpath - (db:run-id->dbname #f) 'register-server `(,iface - ,port - ,server-key - ,(current-process-id) - ,iface - ,apath - ,dbname))) + (rmt:send-receive-real remote apath ;; params: host port servkey pid ipaddr dbpath + (db:run-id->dbname #f) + 'register-server `(,iface + ,port + ,server-key + ,(current-process-id) + ,iface + ,apath + ,dbname))) (define (http-transport:wait-for-stable-interface #!optional (num-tries-allowed 100)) ;; wait until *server-info* stops changing (let* ((stime (current-seconds))) (let loop ((last-host #f) @@ -2036,11 +2022,12 @@ (port (servdat-port *server-info*))) (let loop ((count 0) (bad-sync-count 0) (start-time (current-milliseconds))) - (if (not is-main) + (if (and (not is-main) + (common:low-noise-print 60 "servdat-status")) (debug:print-info 0 *default-log-port* "servdat-status is " (servdat-status *server-info*))) ;; set up the database handle (mutex-lock! *heartbeat-mutex*) (if (not *dbstruct-db*) ;; no db opened yet, open the db and register with main if appropriate