Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -41,22 +41,77 @@ (define (client:logout serverdat) (let ((ok (and (socket? serverdat) (cdb:logout serverdat *toppath* (client:get-signature))))) ok)) -(define (client:connect iface port) - (case (server:get-transport) - ((rpc) (rpc:client-connect iface port)) - ((http) (http:client-connect iface port)) - ((zmq) (zmq:client-connect iface port)) - (else (rpc:client-connect iface port)))) - -(define (client:setup run-id #!key (remaining-tries 10) (failed-connects 0)) - (case (server:get-transport) - ((rpc) (rpc-transport:client-setup run-id)) ;;(client:setup-rpc run-id)) - ((http)(client:setup-http run-id)) - (else (rpc-transport:client-setup run-id)))) ;; (client:setup-rpc run-id)))) +;; BB: commenting out orphan code +;;;;; +;; (define (client:connect iface port) +;; (case (server:get-transport) +;; ((rpc) (rpc:client-connect iface port)) +;; ((http) (http:client-connect iface port)) +;; ((zmq) (zmq:client-connect iface port)) +;; (else (rpc:client-connect iface port)))) + +(define (client:setup run-id #!key (remaining-tries 10)) + (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) + (let* ((server-dat (tasks:bb-get-server-info run-id)) + (transport (if server-dat (string->symbol (tasks:hostinfo-get-transport server-dat)) 'noserver))) + + (case transport + ((noserver) ;; no server registered + (if (<= remaining-tries 0) + (begin + (debug:print-error 0 *default-log-port* "failed to start or connect to server for run-id " run-id) + (exit 1)) + (begin + (let ((num-available (tasks:bb-num-in-available-state run-id))) + (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) + (if (< num-available 2) + (server:try-running run-id)) + (thread-sleep! (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. + (client:setup run-id remaining-tries: (- remaining-tries 1)))))) + ((http)(client:setup-http run-id server-dat remaining-tries)) + ((rpc) (rpc-transport:client-setup run-id server-dat remtries: remaining-tries)) + (else + (debug:print-error 0 *default-log-port* "(6) Transport [" + transport "] specified for run-id [" run-id "] is not implemented in client:setup. Cannot proceed.") + (exit 1))))) + + +;; client:setup-http +;; +;; For http transport, robustly ensure an advertised-running server is actually working and responding, and +;; establish tcp connection to server. For servers marked running but not responding, kill them and clear from mdb +;; +(define (client:setup-http run-id server-dat remaining-tries) + (let* ((iface (tasks:hostinfo-get-interface server-dat)) + (hostname (tasks:hostinfo-get-hostname server-dat)) + (port (tasks:hostinfo-get-port server-dat)) + + (start-res (http-transport:client-connect iface port)) + (ping-res (rmt:login-no-auto-client-setup start-res run-id))) + (if (and start-res ping-res) + (begin + (hash-table-set! *runremote* run-id start-res) ;; side-effect - *runremote* cache init fpr rmt:* + (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) + start-res) + (begin ;; login failed but have a server record, clean out the record and try again + (debug:print-info 0 *default-log-port* "client:setup-http, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) + (http-transport:close-connections run-id) + (hash-table-delete! *runremote* run-id) ;; BB: suspect there is nothing to delete ... + (tasks:kill-server-run-id run-id) ;; -9 so the hung processes dont eat 100% when not responding to sigterm. + (tasks:bb-server-force-clean-run-record run-id iface port + " client:setup-http (server-dat = #t)") + (if (> remaining-tries 8) + (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little + (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time + (server:try-running run-id) + (thread-sleep! 5) ;; give server a little time to start up + (client:setup run-id remaining-tries: (- remaining-tries 1)) + )))) + ;; (define (client:login-no-auto-setup server-info run-id) ;; (case (server:get-transport) ;; ((rpc) (rpc:login-no-auto-client-setup server-info run-id)) ;; ((http) (rmt:login-no-auto-client-setup server-info run-id)) @@ -95,11 +150,11 @@ ;; (begin ;; (debug:print 25 *default-log-port* "INFO: client:setup failed to connect, start-res=" start-res ", run-id=" run-id ", host-info=" host-info) ;; (thread-sleep! 5) ;; (client:setup run-id remaining-tries: (- remaining-tries 1)))))) ;; ;; YUK: rename server-dat here -;; (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id))) +;; (let* ((server-dat (open-run-close tasks:get-server-info tasks:open-db run-id))) ;; (debug:print-info 0 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) ;; (if server-dat ;; (let* ((iface (tasks:hostinfo-get-interface server-dat)) ;; (port (tasks:hostinfo-get-port server-dat)) ;; (start-res (http-transport:client-connect iface port)) @@ -152,71 +207,17 @@ ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; -(define (client:setup-http run-id #!key (remaining-tries 10) (failed-connects 0)) - (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) - (let* ((tdbdat (tasks:open-db))) - (if (<= remaining-tries 0) - (begin - (debug:print-error 0 *default-log-port* "failed to start or connect to server for run-id " run-id) - (exit 1)) - (let* ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) - (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) - (if server-dat - (let* ((iface (tasks:hostinfo-get-interface server-dat)) - (hostname (tasks:hostinfo-get-hostname server-dat)) - (port (tasks:hostinfo-get-port server-dat)) - (start-res (case *transport-type* - ((http)(http-transport:client-connect iface port)) - ;;((nmsg)(nmsg-transport:client-connect hostname port)) - )) - (ping-res (case *transport-type* - ((http)(rmt:login-no-auto-client-setup start-res run-id)) - ;; ((nmsg)(let ((logininfo (rmt:login-no-auto-client-setup start-res run-id))) - ;; (if logininfo - ;; (car (vector-ref logininfo 1)) - ;; #f))) - - ))) - (if (and start-res - ping-res) - (begin - (hash-table-set! *runremote* run-id start-res) - (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) - start-res) - (begin ;; login failed but have a server record, clean out the record and try again - (debug:print-info 0 *default-log-port* "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) - (case *transport-type* - ((http)(http-transport:close-connections run-id))) - (hash-table-delete! *runremote* run-id) - (tasks:kill-server-run-id run-id) - (tasks:server-force-clean-run-record (db:delay-if-busy tdbdat) - run-id - (tasks:hostinfo-get-interface server-dat) - (tasks:hostinfo-get-port server-dat) - " client:setup (server-dat = #t)") - (if (> remaining-tries 8) - (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little - (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time - (server:try-running run-id) - (thread-sleep! 5) ;; give server a little time to start up - (client:setup run-id remaining-tries: (- remaining-tries 1)) - ))) - (begin ;; no server registered - (let ((num-available (tasks:num-in-available-state (db:dbdat-get-db tdbdat) run-id))) - (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) - (if (< num-available 2) - (server:try-running run-id)) - (thread-sleep! (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. - (client:setup run-id remaining-tries: (- remaining-tries 1))))))))) - -;; keep this as a function to ease future -(define (client:start run-id server-info) - (http-transport:client-connect (tasks:hostinfo-get-interface server-info) - (tasks:hostinfo-get-port server-info))) + +;; BB: commenting out orphan code. +;; +;; ;; keep this as a function to ease future +;; (define (client:start run-id server-info) +;; (http-transport:client-connect (tasks:hostinfo-get-interface server-info) +;; (tasks:hostinfo-get-port server-info))) ;; ;; client:signal-handler ;; (define (client:signal-handler signum) ;; (signal-mask! signum) ;; (set! *time-to-exit* #t) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -77,12 +77,15 @@ (define *db-access-allowed* #t) ;; flag to allow access (define *db-access-mutex* (make-mutex)) ;; SERVER (define *my-client-signature* #f) -(define *transport-type* 'http) -(define *transport-type* 'http) ;; override with [server] transport http|rpc|nmsg + +;; default preference for transport-type is set here +;; +(define *transport-type* 'rpc) ;; override with [server] transport http|rpc + (define *runremote* (make-hash-table)) ;; if set up for server communication this will hold (define *max-cache-size* 0) (define *logged-in-clients* (make-hash-table)) (define *client-non-blocking-mode* #f) (define *server-id* #f) Index: fs-transport.scm ================================================================== --- fs-transport.scm +++ fs-transport.scm @@ -13,11 +13,11 @@ (use sqlite3 srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) (import (prefix sqlite3 sqlite3:)) (use spiffy uri-common intarweb http-client spiffy-request-vars) -(tcp-buffer-size 2048) +;;(tcp-buffer-size 2048) (declare (unit fs-transport)) (declare (uses common)) (declare (uses db)) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -14,11 +14,15 @@ ;; (import (prefix sqlite3 sqlite3:)) (use spiffy uri-common intarweb http-client spiffy-request-vars intarweb spiffy-directory-listing) ;; Configurations for server -(tcp-buffer-size 2048) + +(tcp-buffer-size 2048) ;; this interferes with rpc ; compensating in rpc-transport... so far so good + + + (max-connections 2048) (declare (unit http-transport)) (declare (uses common)) @@ -323,11 +327,11 @@ (define (http-transport:server-dat-get-port vec) (vector-ref vec 1)) (define (http-transport:server-dat-get-api-uri vec) (vector-ref vec 2)) (define (http-transport:server-dat-get-api-url vec) (vector-ref vec 3)) (define (http-transport:server-dat-get-api-req vec) (vector-ref vec 4)) (define (http-transport:server-dat-get-last-access vec) (vector-ref vec 5)) -(define (http-transport:server-dat-get-socket vec) (vector-ref vec 6)) +(define (http-transport:server-dat-get-transport vec) (vector-ref vec 6)) (define (http-transport:server-dat-make-url vec) (if (and (http-transport:server-dat-get-iface vec) (http-transport:server-dat-get-port vec)) (conc "http://" @@ -348,11 +352,11 @@ ;; (define (http-transport:client-connect iface port) (let* ((api-url (conc "http://" iface ":" port "/api")) (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) (api-req (make-request method: 'POST uri: api-uri)) - (server-dat (vector iface port api-uri api-url api-req (current-seconds)))) + (server-dat (vector iface port api-uri api-url api-req (current-seconds) 'http))) server-dat)) ;; run http-transport:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; @@ -385,11 +389,11 @@ (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature") (exit)) (loop start-time (equal? sdat last-sdat) sdat))))))) - (iface (car server-info)) + (iface (car server-info)) ;; BB> this represents ip address, not interface (like eth0 as I would expect from the term) (port (cadr server-info)) (last-access 0) (server-timeout (server:get-timeout))) (let loop ((count 0) (server-state 'available) @@ -535,17 +539,17 @@ (current-output-port *alt-log-file*))))) (if (server:check-if-running run-id) (begin (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") (exit 0))) - (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)) + (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id 'http)) (remtries 4)) (if (not server-id) (if (> remtries 0) (begin (thread-sleep! 2) - (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id) + (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id 'http) (- remtries 1))) (begin ;; since we didn't get the server lock we are going to clean up and bail out (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch") Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -68,14 +68,11 @@ (csvr (db:logpro-dat->csv dat stepname)) (csvt (let-values (( (fmt-cell fmt-record fmt-csv) (make-format ","))) (fmt-csv (map list->csv-record csvr)))) (status (configf:lookup dat "final" "exit-status")) (msg (configf:lookup dat "final" "message"))) - ;;(if csvt ;; this if blocked stack dump caused by .dat file from logpro being 0-byte. fixed by upgrading logpro - (rmt:csv->test-data run-id test-id csvt) - ;; (BB> "Error: run-id/test-id/stepname="run-id"/"test-id"/"stepname" => bad csvr="csvr) - ;; ) + (rmt:csv->test-data run-id test-id csvt) (cond ((equal? status "PASS") "PASS") ;; skip the message part if status is pass (status (conc (configf:lookup dat "final" "exit-status") ": " (if msg msg "no message"))) (else #f))) #f))) @@ -700,11 +697,11 @@ ;; *toppath* ;; side effects: ;; sets; *configdat* (megatest.config info) ;; *runconfigdat* (runconfigs.config info) ;; *configstatus* (status of the read data) -;; +;; *transport-type* (define (launch:setup #!key (force #f)) (let* ((toppath (or *toppath* (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath (runname (common:args-get-runname)) (target (common:args-get-target)) (linktree (common:get-linktree)) @@ -827,10 +824,11 @@ (if (and *toppath* (directory-exists? *toppath*)) (setenv "MT_RUN_AREA_HOME" *toppath*) (begin (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area."))) + (server:set-transport) *toppath*)) (define (get-best-disk confdat testconfig) (let* ((disks (or (and testconfig (hash-table-ref/default testconfig "disks" #f)) (hash-table-ref/default confdat "disks" #f))) @@ -1120,11 +1118,11 @@ (set! cmdparms (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) - (list 'transport (conc *transport-type*)) + (list 'transport (conc (rmt:run-id->transport-type run-id))) ;; (list 'serverinf *server-info*) (list 'toppath *toppath*) (list 'work-area work-area) (list 'test-name test-name) (list 'runscript runscript) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -733,23 +733,22 @@ ;; Start the server - can be done in conjunction with -runall or -runtests (one day...) ;; we start the server if not running else start the client thread ;;====================================================================== (if (args:get-arg "-server") - + ;; Server? Start up here. ;; - (let ((tl (launch:setup)) - (run-id (and (args:get-arg "-run-id") - (string->number (args:get-arg "-run-id")))) - (transport-type (string->symbol (or (args:get-arg "-transport") "http")))) + (let* ((tl (launch:setup)) + (run-id (and (args:get-arg "-run-id") + (string->number (args:get-arg "-run-id"))))) (if run-id - (begin - (server:launch run-id transport-type) + (begin + (server:launch run-id (->string *transport-type*)) (set! *didsomething* #t)) (debug:print-error 0 *default-log-port* "server requires run-id be specified with -run-id"))) - + ;; Not a server? This section will decide how to communicate ;; ;; Setup client for all expect listed here (if (null? (lset-intersection equal? Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -13,11 +13,12 @@ (declare (unit rmt)) (declare (uses api)) (declare (uses tdb)) (declare (uses http-transport)) -;;(declare (uses nmsg-transport)) +(declare (uses rpc-transport)) + ;; ;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!! ;; ;; ;; For debugging add the following to ~/.megatestrc @@ -68,85 +69,115 @@ ;; NB// can cache the answer for server running for 10 seconds ... ;; ;; (and (not (rmt:write-frequency-over-limit? cmd run-id)) (if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id) (client:setup run-id) #f)))) + +(define (rmt:run-id->transport-type rid) + (let* ((run-id (if rid rid 0)) + (connection-info (hash-table-ref/default *runremote* run-id #f))) + ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also) + + (if connection-info ;; if we already have a connection for this run-id, use that precendent + ;; use the server if have connection info + (let* ((transport-type (vector-ref connection-info 6))) ;; BB: assumes all transport-type'-servertdat vector's item 6 ids transport type + transport-type) + ;; otherwise pick the global default as preference. (set in common.scm) + *transport-type*))) (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected - ;; clean out old connections + ;; side-effect: clean out old connections ;; (mutex-lock! *db-multi-sync-mutex*) (let ((expire-time (- (current-seconds) (server:get-timeout) 10))) ;; don't forget the 10 second margin (for-each (lambda (run-id) (let ((connection (hash-table-ref/default *runremote* run-id #f))) (if (and (vector? connection) - (< (http-transport:server-dat-get-last-access connection) expire-time)) + (< (http-transport:server-dat-get-last-access connection) expire-time)) ;; BB> BBTODO: make this generic, not http transport specific. (begin (debug:print-info 0 *default-log-port* "Discarding connection to server for run-id " run-id ", too long between accesses") - ;; bb- disabling nanomsg - ;; SHOULD CLOSE THE CONNECTION HERE - ;; (case *transport-type* - ;; ((nmsg)(nn-close (http-transport:server-dat-get-socket - ;; (hash-table-ref *runremote* run-id))))) (hash-table-delete! *runremote* run-id))))) (hash-table-keys *runremote*))) + ;; (mutex-unlock! *db-multi-sync-mutex*) ;; (mutex-lock! *send-receive-mutex*) (let* ((run-id (if rid rid 0)) (connection-info (rmt:get-connection-info run-id))) ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also) (if connection-info ;; use the server if have connection info - (let* ((dat (case *transport-type* + (let* ((transport-type (rmt:run-id->transport-type run-id)) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; Here, we make request to remote server + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + (dat (case transport-type ;; BB: replaced *transport-type* global with run-id specific transport-type ((http)(condition-case (http-transport:client-api-send-receive run-id connection-info cmd params) ((commfail)(vector #f "communications fail")) ((exn)(vector #f "other fail")))) - ;; ((nmsg)(condition-case - ;; (nmsg-transport:client-api-send-receive run-id connection-info cmd params) - ;; ((timeout)(vector #f "timeout talking to server")))) - (else (exit)))) + ((rpc) (rpc-transport:client-api-send-receive run-id connection-info cmd params)) ;; BB: let us error out for now + (else + (debug:print-error 0 *default-log-port* "(1) Transport [" transport-type + "] specified for run-id [" run-id + "] is not implemented in rmt:send-receive. Cannot proceed." (symbol? transport-type)) + (vector #f (conc "transport ["transport-type"] unimplemented"))))) + + (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) - (if (vector? connection-info)(http-transport:server-dat-update-last-access connection-info)) + (if (vector? connection-info)(http-transport:server-dat-update-last-access connection-info)) ;; BB> BBTODO: make this generic, not http transport specific. (if success (begin ;; (mutex-unlock! *send-receive-mutex*) - (case *transport-type* - ((http) res) ;; (db:string->obj res)) - ;; ((nmsg) res) + (case transport-type + ((http rpc) res) ;; (db:string->obj res)) + (else + (debug:print-error 0 *default-log-port* "(2) Transport [" transport-type + "] specified for run-id [" run-id + "] is not implemented in rmt:send-receive. Cannot proceed. Also unexpected since this branch follows success which would follow a suported transport...") + #f) )) ;; (vector-ref res 1))) + + ;; no success... (begin ;; let ((new-connection-info (client:setup run-id))) (debug:print 0 *default-log-port* "WARNING: Communication failed, trying call to rmt:send-receive again.") - ;; (case *transport-type* - ;; ((nmsg)(nn-close (http-transport:server-dat-get-socket connection-info)))) - (hash-table-delete! *runremote* run-id) ;; don't keep using the same connection - ;; NOTE: killing server causes this process to block forever. No idea why. Dec 2. - ;; (if (eq? (modulo attemptnum 5) 0) - ;; (tasks:kill-server-run-id run-id tag: "api-send-receive-failed")) - ;; (mutex-unlock! *send-receive-mutex*) ;; close the mutex here to allow other threads access to communications - (tasks:start-and-wait-for-server (tasks:open-db) run-id 15) - ;; (nmsg-transport:client-api-send-receive run-id connection-info cmd param remtries: (- remtries 1)))))) - - ;; no longer killing the server in http-transport:client-api-send-receive - ;; may kill it here but what are the criteria? - ;; start with three calls then kill server - ;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id)) - ;; (thread-sleep! 2) - (rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1))))) - ;; no connection info? try to start a server, or access locally if no + (case transport-type + + ((http rpc) + (hash-table-delete! *runremote* run-id) ;; don't keep using the same connection + ;; NOTE: killing server causes this process to block forever. No idea why. Dec 2. + ;; (if (eq? (modulo attemptnum 5) 0) + ;; (tasks:kill-server-run-id run-id tag: "api-send-receive-failed")) + ;; (mutex-unlock! *send-receive-mutex*) ;; close the mutex here to allow other threads access to communications + (tasks:start-and-wait-for-server (tasks:open-db) run-id 15) + ;; (nmsg-transport:client-api-send-receive run-id connection-info cmd param remtries: (- remtries 1)))))) + + ;; no longer killing the server in http-transport:client-api-send-receive + ;; may kill it here but what are the criteria? + ;; start with three calls then kill server + ;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id)) + ;; (thread-sleep! 2) + (rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1))) + (else + (debug:print-error 0 *default-log-port* "(3) Transport [" transport-type + "] specified for run-id [" run-id + "] is not implemented in rmt:send-receive. Cannot proceed.") + #f))))) + + ;; no connection info; try to start a server, or access locally if no ;; server and the query is read-only ;; ;; Note: The tasks db was checked for a server in starting mode in the rmt:get-connection-info call ;; - (if (and (< attemptnum 15) + (if (and (< attemptnum 15) (member cmd api:write-queries)) - (let ((faststart (configf:lookup *configdat* "server" "faststart"))) + (let* ((faststart (configf:lookup *configdat* "server" "faststart"))) (hash-table-delete! *runremote* run-id) ;; (mutex-unlock! *send-receive-mutex*) (if (and faststart (equal? faststart "no")) (begin (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10) @@ -315,14 +346,17 @@ ;; This login does no retries under the hood - it acts a bit like a ping. ;; Deprecated for nmsg-transport. ;; (define (rmt:login-no-auto-client-setup connection-info run-id) - (case *transport-type* + (case (rmt:run-id->transport-type run-id) ((http)(rmt:send-receive-no-auto-client-setup connection-info 'login run-id (list *toppath* megatest-version run-id *my-client-signature*))) - ;;((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info 'login (list *toppath* megatest-version run-id *my-client-signature*))) - )) + (else + (debug:print-error 0 *default-log-port* "(4) Transport [" *transport-type* + "] specified for run-id [" run-id + "] is not implemented in rmt:send-receive. Cannot proceed.") + (exit 1)))) ;; hand off a call to one of the db:queries statements ;; added run-id to make looking up the correct db possible ;; (define (rmt:general-call stmtname run-id . params) Index: rpc-transport.scm ================================================================== --- rpc-transport.scm +++ rpc-transport.scm @@ -21,208 +21,565 @@ (declare (uses tests)) (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. (include "common_records.scm") (include "db_records.scm") + +(define *heartbeat-mutex* (make-mutex)) +(define *server-loop-heart-beat* (current-seconds)) + ;; procstr is the name of the procedure to be called as a string -(define (rpc-transport:autoremote procstr params) - (handle-exceptions - exn - (begin - (debug:print 1 *default-log-port* "Remote failed for " proc " " params) - (apply (eval (string->symbol procstr)) params)) - ;; (if *runremote* - ;; (apply (eval (string->symbol (conc "remote:" procstr))) params) - (apply (eval (string->symbol procstr)) params))) +(define (rpc-transport:autoremote procstr params) ;; may be unused, I think api-exec deprecates this one. + (let* ((procsym (if (symbol? procstr) + procstr + (string->symbol (->string procstr)))) + (res + (begin + (apply (eval procsym) params)))) + res)) + + +;; rpc receiver +(define (rpc-transport:api-exec cmd params) + (let* ( (resdat (api:execute-requests *inmemdb* (vector cmd params))) ;; #( flag result ) + (flag (vector-ref resdat 0)) + (res (vector-ref resdat 1))) + + (mutex-lock! *heartbeat-mutex*) + + (set! *last-db-access* (current-seconds)) ;; bump *last-db-access*; this will renew keep-running thread's lease on life for another (server:get-timeout) seconds + (BB> "in api-exec; last-db-access updated to "*last-db-access*) + (mutex-unlock! *heartbeat-mutex*) + + res)) + + + ;; (handle-exceptions + ;; exn + ;; (begin + ;; (debug:print 0 *default-log-port* "Remote failed for " proc " " params " exn="exn) + ;; (apply (eval (string->symbol procstr)) params)) + ;; ;; (if *runremote* + ;; ;; (apply (eval (string->symbol (conc "remote:" procstr))) params) + ;; (apply (eval (string->symbol procstr)) params))) + +;; retry an operation (depends on srfi-18) +;; ================== +;; idea here is to avoid spending time on coding retrying something. Trying to be generic here. +;; +;; Exception handling: +;; ------------------- +;; if evaluating the thunk results in exception, it will be retried. +;; on last try, if final-failure-returns-actual is true, the exception will be re-thrown to caller. +;; +;; look at options below #!key to see how to configure behavior +;; +;; +(define (retry-thunk + the-thunk + #!key ;;;; options below + (accept-result? (lambda (x) x)) ;; retry if predicate applied to thunk's result is false + (retries 4) ;; how many tries + (failure-value #f) ;; return this on final failure, unless following option is enabled: + (final-failure-returns-actual #f) ;; on failure, on the last try, just return the result, not failure-value + + (retry-delay 0.1) ;; delay between tries + (back-off-factor 1) ;; multiply retry-delay by this factor on retry + (random-delay 0.1) ;; add a random portion of this value to wait + + (chatty #f) ;; print status as we go, for debugging. + ) + + (when chatty (print) (print "Entered retry-thunk") (print "-=-=-=-=-=-")) + (let* ((guarded-thunk ;; we are guarding the thunk against exceptions. We will record whether result of evaluation is an exception or a regular result. + (lambda () + (let* ((EXCEPTION (gensym)) ;; using gensym to avoid potential collision + (res + (condition-case + (the-thunk) ;; this is what we are guarding the execution of + [x () (cons EXCEPTION x)] + ))) + (cond + ((and (pair? res) (eq? (car res) EXCEPTION)) + (if chatty + (print " - the-thunk threw exception >"(cdr res)"<")) + (cons 'exception (cdr res))) + (else + (if chatty + (print " - the-thunk returned result >"res"<")) + (cons 'regular-result res))))))) + + (let loop ((guarded-res (guarded-thunk)) + (retries-left retries) + (fail-wait retry-delay)) + (if chatty (print " ==========")) + (let* ((wait-time (+ fail-wait (+ (* fail-wait back-off-factor) + (* random-delay + (/ (random 1024) 1024) )))) + (res-type (car guarded-res)) + (res-value (cdr guarded-res))) + (cond + ((and (eq? res-type 'regular-result) (accept-result? res-value)) + (if chatty (print " + return result that satisfied accept-result? >"res-value"<")) + res-value) + + ((> retries-left 0) + (if chatty (print " - sleep "wait-time)) + (thread-sleep! wait-time) + (if chatty (print " + retry ["retries-left" tries left]")) + (loop (guarded-thunk) + (sub1 retries-left) + wait-time)) + + ((eq? res-type 'regular-result) + (if final-failure-returns-actual + (begin + (if chatty (print " + last try failed- return the result >"res-value"<")) + res-value) + (begin + (if chatty (print " + last try failed- return canned failure value >"failure-value"<")) + failure-value))) + + (else ;; no retries left; result was not accepted and res-type can only be 'exception + (if final-failure-returns-actual + (begin + (if chatty (print " + last try failed with exception- re-throw it >"res-value"<")) + (abort res-value)); re-raise the exception. TODO: find a way for call-history to show as though from entry to this function + (begin + (if chatty (print " + last try failed with exception- return canned failure value >"failure-value"<")) + failure-value)))))))) + + +(define (rpc-transport:server-shutdown server-id rpc:listener #!key (from-on-exit #f)) + (on-exit (lambda () #t)) ;; turn off on-exit stuff + ;;(tcp-close rpc:listener) ;; gotta exit nicely + ;;(tasks:bb-server-set-state! server-id "stopped") + + + ;; TODO: (low) the following is extraordinaritly slow. Maybe we don't even need portlogger for rpc anyway?? the exception-based failover when ports are taken is fast! + ;;(portlogger:open-run-close portlogger:set-port (rpc:default-server-port) "released") + + (set! *time-to-exit* #t) + (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t)) + (tasks:bb-server-delete-record server-id " rpc-transport:keep-running complete") + (BB> "Before (exit) (from-on-exit="from-on-exit")") + (unless from-on-exit (exit)) ;; sometimes we hang (around) here with 100% cpu. + (BB> "After") + ;; strace reveals endless: + ;; getrusage(RUSAGE_SELF, {ru_utime={413, 917868}, ru_stime={0, 60003}, ...}) = 0 + ;; getrusage(RUSAGE_SELF, {ru_utime={414, 9874}, ru_stime={0, 60003}, ...}) = 0 + ;; getrusage(RUSAGE_SELF, {ru_utime={414, 13874}, ru_stime={0, 60003}, ...}) = 0 + ;; getrusage(RUSAGE_SELF, {ru_utime={414, 105880}, ru_stime={0, 60003}, ...}) = 0 + ;; getrusage(RUSAGE_SELF, {ru_utime={414, 109880}, ru_stime={0, 60003}, ...}) = 0 + ;; getrusage(RUSAGE_SELF, {ru_utime={414, 201886}, ru_stime={0, 60003}, ...}) = 0 + ;; getrusage(RUSAGE_SELF, {ru_utime={414, 205886}, ru_stime={0, 60003}, ...}) = 0 + ;; getrusage(RUSAGE_SELF, {ru_utime={414, 297892}, ru_stime={0, 60003}, ...}) = 0 + ;; getrusage(RUSAGE_SELF, {ru_utime={414, 301892}, ru_stime={0, 60003}, ...}) = 0 + ;; getrusage(RUSAGE_SELF, {ru_utime={414, 393898}, ru_stime={0, 60003}, ...}) = 0 + ;; getrusage(RUSAGE_SELF, {ru_utime={414, 397898}, ru_stime={0, 60003}, ...}) = 0 + ;; make a post to chicken-users w/ http://paste.call-cc.org/paste?id=60a4b66a29ccf7d11359ea866db642c970735978 + (if from-on-exit + ;; avoid above condition! End current process externally since 1 in 20 (exit)'s result in hung, 100% cpu zombies. (see above) + (system (conc "kill -9 "(current-process-id)))) + ) + ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (rpc-transport:launch run-id) - (let* ((tdbdat (tasks:open-db))) - (BB> "rpc-transport:launch fired for run-id="run-id) - (set! *run-id* run-id) - (if (args:get-arg "-daemonize") - (daemon:ize)) - (if (server:check-if-running run-id) - (begin - (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") - (exit 0))) - (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)) - (remtries 4)) - (if (not server-id) - (if (> remtries 0) - (begin - (thread-sleep! 2) - (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id) - (- remtries 1))) - (begin - ;; since we didn't get the server lock we are going to clean up and bail out - (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") - (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " rpc-transport:launch"))) - (begin - (rpc-transport:run (if (args:get-arg "-server")(args:get-arg "-server") "-") run-id server-id) - (exit)))))) + (set! *run-id* run-id) + + ;; send to background if requested + (when (args:get-arg "-daemonize") + (daemon:ize) + (when *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it + (current-error-port *alt-log-file*) + (current-output-port *alt-log-file*))) + + ;; double check we dont alrady have a running server for this run-id + (when (server:check-if-running run-id) + (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") + (exit 0)) + + ;; let's get a server-id for this server + ;; if at first we do not suceed, try 3 more times. + (let ((server-id (retry-thunk + (lambda () (tasks:bb-server-lock-slot run-id 'rpc)) + chatty: #f + retries: 4))) + (when (not server-id) ;; dang we couldn't get a server-id. + ;; since we didn't get the server lock we are going to clean up and bail out + (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") + (tasks:bb-server-delete-records-for-this-pid " rpc-transport:launch") + (exit 1)) + + ;; we got a server-id (and a corresponding entry in servers table in globally shared mdb) + ;; all systems go. Proceed to setup rpc server. + (rpc-transport:run + (if (args:get-arg "-server") + (args:get-arg "-server") + "-") + run-id + server-id) + (exit))) + +(define *rpc-listener-port* #f) +(define *rpc-listener-port-bind-timestamp* #f) + +(define *on-exit-flag #f) + +(define (rpc-transport:server-dat-get-iface vec) (vector-ref vec 0)) +(define (rpc-transport:server-dat-get-port vec) (vector-ref vec 1)) +(define (rpc-transport:server-dat-get-last-access vec) (vector-ref vec 5)) +(define (rpc-transport:server-dat-get-transport vec) (vector-ref vec 6)) +(define (rpc-transport:server-dat-update-last-access vec) + (if (vector? vec) + (vector-set! vec 5 (current-seconds)) + (begin + (print-call-chain (current-error-port)) + (debug:print-error 0 *default-log-port* "call to rpc-transport:server-dat-update-last-access with non-vector!!")))) + + +(define *api-exec-ht* (make-hash-table)) + +;; let's see if caching the rpc stub curbs thread-profusion on server side +(define (rpc-transport:get-api-exec iface port) + (let* ((lu (hash-table-ref/default *api-exec-ht* '(iface . port) #f))) + (if lu + lu + (let ((res (rpc:procedure 'api-exec iface port))) + (hash-table-set! *api-exec-ht* '(iface . port) res) + res)))) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; this client-side procedure makes rpc call to server and returns result +;; +(define (rpc-transport:client-api-send-receive run-id serverdat cmd params #!key (numretries 3)) + (if (not (vector? serverdat)) + (BB> "WHAT?? for run-id="run-id", serverdat="serverdat)) + (let* ((iface (rpc-transport:server-dat-get-iface serverdat)) + (port (rpc-transport:server-dat-get-port serverdat)) + (res #f) + (api-exec (rpc-transport:get-api-exec iface port)) + (send-receive (lambda () + (tcp-buffer-size 0) + (set! res (retry-thunk + (lambda () + (condition-case + ;;(vector #t (run-remote cmd params)) + (vector 'success (api-exec cmd params)) + [x (exn i/o net) (vector 'comms-fail (conc "communications fail ["(->string x)"]") x)] + [x () (vector 'other-fail "other fail ["(->string x)"]" x)])) + chatty: #f + accept-result?: (lambda(x) + (and (vector? x) (vector-ref x 0))) + retries: 4 + back-off-factor: 1.5 + random-wait: 0.2 + retry-delay: 0.1 + final-failure-returns-actual: #t)) + res + )) + (th1 (make-thread send-receive "send-receive")) + (time-out-reached #f) + (time-out (lambda () + (thread-sleep! 45) + (set! time-out-reached #t) + (thread-terminate! th1) + #f)) + + (th2 (make-thread time-out "time out"))) + (thread-start! th1) + (thread-start! th2) + (thread-join! th1) + (thread-terminate! th2) + (debug:print-info 11 *default-log-port* "got res=" res) + (if (vector? res) + (case (vector-ref res 0) + ((success) (vector #t (vector-ref res 1))) + ((comms-fail) + (debug:print 0 *default-log-port* "WARNING: comms failure for rpc request") + ;;(debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (vector #f (vector-ref res 1))) + (else + (debug:print-error 0 *default-log-port* "error occured at server, info=" (vector-ref res 1)) + (debug:print 0 *default-log-port* " client call chain:") + (print-call-chain (current-error-port)) + (debug:print 0 *default-log-port* " server call chain:") + (pp (vector-ref res 1) (current-error-port)) + (signal (vector-ref res 2)))) + (signal (make-composite-condition + (make-property-condition + 'timeout + 'message "nmsg-transport:client-api-send-receive-raw timed out talking to server")))))) + + (define (rpc-transport:run hostn run-id server-id) (debug:print 2 *default-log-port* "Attempting to start the rpc server ...") ;; (trace rpc:publish-procedure!) - (rpc:publish-procedure! 'server:login server:login) - (rpc:publish-procedure! 'testing (lambda () "Just testing")) + ;;====================================================================== + ;; start of publish-procedure section + ;;====================================================================== + (rpc:publish-procedure! 'server:login server:login) ;; this allows client to validate it is the same megatest instance as the server. No security here, just making sure we're in the right room. + (rpc:publish-procedure! + 'testing + (lambda () + "Just testing")) + + ;; procedure to receive arbitrary API request from client's rpc:send-receive/rpc-transport:client-api-send-receive + (rpc:publish-procedure! 'rpc-transport:autoremote rpc-transport:autoremote) + ;; can use this to run most anything at the remote + (rpc:publish-procedure! 'api-exec rpc-transport:api-exec) + + + ;;====================================================================== + ;; end of publish-procedure section + ;;====================================================================== + (let* ((db #f) - (hostname (get-host-name)) - (ipaddrstr (let ((ipstr (if (string=? "-" hostn) + (hostname (let ((res (get-host-name))) res)) + (server-start-time (current-seconds)) + (server-timeout (server:get-timeout)) + (ipaddrstr (let* ((ipstr (if (string=? "-" hostn) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") (server:get-best-guess-address hostname) - #f))) - (if ipstr ipstr hostn))) ;; hostname))) - (start-port (open-run-close tasks:server-get-next-port tasks:open-db)) + #f)) + (res (if ipstr ipstr hostn))) + res)) ;; hostname))) + (start-port (let ((res (portlogger:open-run-close portlogger:find-port))) ;; BB> TODO: remove portlogger! + res)) (link-tree-path (configf:lookup *configdat* "setup" "linktree")) - (rpc:listener (rpc-transport:find-free-port-and-open (rpc:default-server-port))) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; rpc:listener is the tcp-listen result from inside the find-free-port-and-open complex. + ;; It is our handle on the listening tcp port + ;; We will attach this to our rpc server with rpc:make-server in thread th1 . + (rpc:listener (rpc-transport:find-free-port-and-open start-port)) (th1 (make-thread (lambda () - ((rpc:make-server rpc:listener) #t)) + ((rpc:make-server rpc:listener) #t) ) "rpc:server")) - ;; (cute (rpc:make-server rpc:listener) "rpc:server") - ;; 'rpc:server)) - (hostname (if (string=? "-" hostn) + + + (hostname (if (string=? "-" hostn) (get-host-name) hostn)) (ipaddrstr (if (string=? "-" hostn) (server:get-best-guess-address hostname) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") #f)) - (portnum (rpc:default-server-port)) - (host:port (conc (if ipaddrstr ipaddrstr hostname) ":" portnum)) - (tdb (tasks:open-db))) + (portnum (let ((res (rpc:default-server-port))) res)) + (host:port (conc (if ipaddrstr ipaddrstr hostname) ":" portnum))) + + + ;; BB> TODO: remove portlogger! + ;; if rpc found it needed a different port than portlogger provided, keep portlogger in the loop. + ;; (when (not (equal? start-port portnum)) + ;; (BB> "portlogger proffered "start-port" but rpc grabbed "portnum) + ;; (portlogger:open-run-close portlogger:set-port start-port "released") + ;; (portlogger:open-run-close portlogger:take-port portnum)) + + (tasks:bb-server-set-interface-port server-id ipaddrstr portnum) + + ;;============================================================ + ;; activate thread th1 to attach opened tcp port to rpc server + ;;============================================================= (thread-start! th1) (set! db *inmemdb*) - (open-run-close tasks:server-set-interface-port - tasks:open-db - server-id - ipaddrstr portnum) + (debug:print 0 *default-log-port* "Server started on " host:port) - ;; (trace rpc:publish-procedure!) - ;; (rpc:publish-procedure! 'server:login server:login) - ;; (rpc:publish-procedure! 'testing (lambda () "Just testing")) - - ;;====================================================================== - ;; ;; end of publish-procedure section - ;;====================================================================== - ;; + + (thread-sleep! 4) + (if (rpc-transport:self-test run-id ipaddrstr portnum) + (debug:print 0 *default-log-port* "INFO: rpc self test passed!") + (begin + (debug:print 0 *default-log-port* "Error: rpc listener did not pass self test. Shutting down. On: " host:port) + (exit))) + (on-exit (lambda () - (open-run-close tasks:server-set-state! tasks:open-db server-id "stopped"))) - - (set! *rpc:listener* rpc:listener) - (tasks:server-set-state! tdb server-id "running") - (set! *inmemdb* (db:setup run-id)) - ;; if none running or if > 20 seconds since - ;; server last used then start shutdown - (let loop ((count 0)) - (thread-sleep! 5) ;; no need to do this very often - (let ((numrunning -1)) ;; (db:get-count-tests-running db))) - (if (or (> numrunning 0) - (> (+ *last-db-access* 60)(current-seconds))) - (begin - (debug:print-info 0 *default-log-port* "Server continuing, tests running: " numrunning ", seconds since last db access: " (- (current-seconds) *last-db-access*)) - (loop (+ 1 count))) - (begin - (debug:print-info 0 *default-log-port* "Starting to shutdown the server side") - (open-run-close tasks:server-delete-record tasks:open-db server-id " rpc-transport:try-start-server stop") - (thread-sleep! 10) - (debug:print-info 0 *default-log-port* "Max cached queries was " *max-cache-size*) - (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting") - )))))) - -(define (rpc-transport:find-free-port-and-open port) + (rpc-transport:server-shutdown server-id rpc:listener from-on-exit: #t))) + + ;; check again for running servers for this run-id in case one has snuck in since we checked last in rpc-transport:launch + (if (not (equal? server-id (tasks:bb-server-am-i-the-server? run-id)));; try to ensure no double registering of servers + (begin ;; i am not the server, another server snuck in and beat this one to the punch + (tcp-close rpc:listener) ;; gotta exit nicely and free up that tcp port + (tasks:bb-server-set-state! server-id "collision")) + + (begin ;; i am the server + ;; setup the in-memory db + (set! *inmemdb* (db:setup run-id)) + (db:get-db *inmemdb* run-id) + + ;; let's make it official + (set! *rpc:listener* rpc:listener) + (tasks:bb-server-set-state! server-id "running") ;; update our mdb servers entry + + + + ;; this let loop will hold open this thread until we want the server to shut down. + ;; if no requests received within the last 20 seconds : + ;; database hasnt changed in ?? + ;; + + ;; begin new loop + ;; keep-running loop: polls last-db-access to see if we have timed out. + (let loop ((count 0) + (bad-sync-count 0)) + + ;; Use this opportunity to sync the inmemdb to db + (let ((start-time (current-milliseconds)) + (sync-time #f) + (rem-time #f)) + ;; inmemdb is a dbstruct + (condition-case + (db:sync-touched *inmemdb* *run-id* force-sync: #t) + ((sync-failed)(cond + ((> bad-sync-count 10) ;; time to give up + (rpc-transport:server-shutdown server-id rpc:listener)) + (else ;; (> bad-sync-count 0) ;; we've had a fail or two, delay and loop + (thread-sleep! 5) + (loop count (+ bad-sync-count 1))))) + ((exn) + (debug:print-error 0 *default-log-port* "error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server") + (rpc-transport:server-shutdown server-id rpc:listener))) + (set! sync-time (- (current-milliseconds) start-time)) + (set! rem-time (quotient (- 4000 sync-time) 1000)) + (debug:print 4 *default-log-port* "SYNC: time= " sync-time ", rem-time=" rem-time) + + (if (and (<= rem-time 4) + (> rem-time 0)) + (thread-sleep! rem-time) + (thread-sleep! 4))) ;; fallback for if the math is changed ... + + (if (< count 1) ;; 3x3 = 9 secs aprox + (loop (+ count 1) bad-sync-count)) + + ;; BB: don't see how this is possible with RPC + ;; ;; Check that iface and port have not changed (can happen if server port collides) + ;; (mutex-lock! *heartbeat-mutex*) + ;; (set! sdat *server-info*) + ;; (mutex-unlock! *heartbeat-mutex*) + + ;; (if (or (not (equal? sdat (list iface port))) + ;; (not server-id)) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "interface changed, refreshing iface and port info") + ;; (set! iface (car sdat)) + ;; (set! port (cadr sdat)))) + + ;; Transfer *last-db-access* to last-access to use in checking that we are still alive + (mutex-lock! *heartbeat-mutex*) + (set! last-access *last-db-access*) + (BB> "in rpc-transport:run ; last-access="last-access) + (mutex-unlock! *heartbeat-mutex*) + + ;; (debug:print 11 *default-log-port* "last-access=" last-access ", server-timeout=" server-timeout) + ;; + ;; no_traffic, no running tests, if server 0, no running servers + ;; + ;; (let ((wait-on-running (configf:lookup *configdat* "server" b"wait-on-running"))) ;; wait on running tasks (if not true then exit on time out) + ;; + (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600)) + (adjusted-timeout (if (> hrs-since-start 1) + (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour + server-timeout))) + (if (common:low-noise-print 120 "server timeout") + (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) + (if (and *server-run* + (> (+ last-access server-timeout) + (current-seconds))) + (begin + (if (common:low-noise-print 120 "server continuing") + (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))) + ;; + ;; Consider implementing some smarts here to re-insert the record or kill self is + ;; the db indicates so + ;; + (if (tasks:bb-server-am-i-the-server? run-id) + (tasks:bb-server-set-state! server-id "running")) + ;; + (loop 0 bad-sync-count)) + (begin + (BB> "SERVER SHUTDOWN CALLED! last-access="last-access" current-seconds="(current-seconds)" server-timeout="server-timeout) + (rpc-transport:server-shutdown server-id rpc:listener))))) + ;; end new loop + )))) + + +(define (rpc-transport:find-free-port-and-open port #!key ) (handle-exceptions exn - (begin + (begin (print "Failed to bind to port " (rpc:default-server-port) ", trying next port") - (rpc-transport:find-free-port-and-open (+ port 1))) + (rpc-transport:find-free-port-and-open (add1 port))) (rpc:default-server-port port) + (set! *rpc-listener-port* port) ;; a bit paranoid about rpc:default-server-port parameter not changing across threads (as params are wont to do). keeping this global in my back pocket in case this causes problems + (set! *rpc-listener-port-bind-timestamp* (current-milliseconds)) ;; may want to test how long it has been since the last bind attempt happened... (tcp-read-timeout 240000) - (tcp-listen (rpc:default-server-port) 10000))) - + (tcp-buffer-size 0) ;; gotta do this because http-transport undoes it. + (tcp-listen (rpc:default-server-port) 10000) + )) + (define (rpc-transport:ping run-id host port) (handle-exceptions exn (begin - (print "SERVER_NOT_FOUND") + (print "SERVER_NOT_FOUND exn="exn) (exit 1)) (let ((login-res ((rpc:procedure 'server:login host port) *toppath*))) - (if (and (list? login-res) - (car login-res)) + (if login-res (begin (print "LOGIN_OK") (exit 0)) (begin (print "LOGIN_FAILED") (exit 1)))))) -(define (rpc-transport:client-setup run-id #!key (remtries 10)) - (if *runremote* - (begin - (debug:print-error 0 *default-log-port* "Attempt to connect to server but already connected") - #f) - (let* ((host-info (hash-table-ref/default *runremote* run-id #f))) ;; (open-run-close db:get-var #f "SERVER")) - (if host-info - (let ((iface (car host-info)) - (port (cadr host-info)) - (ping-res ((rpc:procedure 'server:login host port) *toppath*))) - (if ping-res - (let ((server-dat (list iface port #f #f #f))) - (hash-table-set! *runremote* run-id server-dat) - server-dat) - (begin - (server:try-running run-id) - (thread-sleep! 2) - (rpc-transport:client-setup run-id (- remtries 1))))) - (let* ((server-db-info (open-run-close tasks:get-server tasks:open-db run-id))) - (debug:print-info 0 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) - (if server-db-info - (let* ((iface (tasks:hostinfo-get-interface server-db-info)) - (port (tasks:hostinfo-get-port server-db-info)) - (server-dat (list iface port #f #f #f)) - (ping-res ((rpc:procedure 'server:login host port) *toppath*))) - (if start-res - (begin - (hash-table-set! *runremote* run-id server-dat) - server-dat) - (begin - (server:try-running run-id) - (thread-sleep! 2) - (rpc-transport:client-setup run-id (- remtries 1))))) - (begin - (server:try-running run-id) - (thread-sleep! 2) - (rpc-transport:client-setup run-id (- remtries 1))))))))) -;; -;; (port (if (and hostinfo (> (length hostdat) 1))(cadr hostdat) #f))) -;; (if (and port -;; (string->number port)) -;; (let ((portn (string->number port))) -;; (debug:print-info 2 *default-log-port* "Setting up to connect to host " host ":" port) -;; (handle-exceptions -;; exn -;; (begin -;; (debug:print-error 0 *default-log-port* "Failed to open a connection to the server at host: " host " port: " port) -;; (debug:print 0 *default-log-port* " EXCEPTION: " ((condition-property-accessor 'exn 'message) exn)) -;; ;; (open-run-close -;; ;; (lambda (db . param) -;; ;; (sqlite3:execute db "DELETE FROM metadat WHERE var='SERVER'")) -;; ;; #f) -;; (set! *runremote* #f)) -;; (if (and (not (args:get-arg "-server")) ;; no point in the server using the server using the server -;; ((rpc:procedure 'server:login host portn) *toppath*)) -;; (begin -;; (debug:print-info 2 *default-log-port* "Logged in and connected to " host ":" port) -;; (set! *runremote* (vector host portn))) -;; (begin -;; (debug:print-info 2 *default-log-port* "Failed to login or connect to " host ":" port) -;; (set! *runremote* #f))))) -;; (debug:print-info 2 *default-log-port* "no server available"))))) - +(define (rpc-transport:self-test run-id host port) + (tcp-buffer-size 0) ;; gotta do this because http-transport undoes it. + (let* ((testing-res ((rpc:procedure 'testing host port))) + (login-res ((rpc:procedure 'server:login host port) *toppath*)) + (res (and login-res (equal? testing-res "Just testing")))) + + (if login-res + (begin + (BB> "Self test PASS. login-res="login-res" testing-res="testing-res" *toppath*="*toppath*) + #t) + (begin + (BB> "Self test fail. login-res="login-res" testing-res="testing-res" *toppath*="*toppath*) + + #f)) + res)) + +(define (rpc-transport:client-setup run-id server-dat #!key (remtries 10)) + (tcp-buffer-size 0) + (debug:print-info 0 *default-log-port* "rpc-transport:client-setup run-id="run-id" server-dat=" server-dat ", remaining-tries=" remtries) + (let* ((iface (tasks:hostinfo-get-interface server-dat)) + (hostname (tasks:hostinfo-get-hostname server-dat)) + (port (tasks:hostinfo-get-port server-dat)) + (runremote-server-dat (vector iface port #f #f #f (current-seconds) 'rpc)) ;; http version := (vector iface port api-uri api-url api-req (current-seconds) 'http ) + (ping-res (retry-thunk (lambda () ;; make 3 attempts to ping. + ((rpc:procedure 'server:login iface port) *toppath*)) + retries: 3))) + ;; we got here from rmt:get-connection-info on the condition that *runremote* has no entry for run-id... + (if ping-res + (begin + (debug:print-info 0 *default-log-port* "rpc-transport:client-setup CONNECTION ESTABLISHED run-id="run-id" server-dat=" server-dat) + (hash-table-set! *runremote* run-id runremote-server-dat) ;; side-effect - *runremote* cache init fpr rmt:* + runremote-server-dat) + (begin ;; login failed but have a server record, clean out the record and try again + (tasks:kill-server-run-id run-id) + (tasks:bb-server-force-clean-run-record run-id iface port + " rpc-transport:client-setup (server-dat = #t)") + (if (> remtries 2) + (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little + (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time + (server:try-running run-id) + (thread-sleep! 5) ;; give server a little time to start up + (client:setup run-id remaining-tries: (sub1 remtries)) + " rpc-transport:client-setup (server-dat = #t)")))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -48,34 +48,40 @@ ;; all routes though here end in exit ... ;; ;; start_server ;; (define (server:launch run-id transport-type) - (BB> "server:launch fired for run-id="run-id" transport-type="transport-type) - (case transport-type - ((http)(http-transport:launch run-id)) - ;;((nmsg)(nmsg-transport:launch run-id)) - ((rpc) (rpc-transport:launch run-id)) - (else (debug:print-error 0 *default-log-port* "unknown server type " transport-type)))) -;; (else (debug:print-error 0 *default-log-port* "No known transport set, transport=" transport ", using rpc") -;; (rpc-transport:launch run-id))))) + (let ((ttype (if (symbol? transport-type) transport-type (string->symbol (->string transport-type))))) + (case ttype + ((http)(http-transport:launch run-id)) + ;;((nmsg)(nmsg-transport:launch run-id)) + ((rpc) (rpc-transport:launch run-id)) + (else (debug:print-error 0 *default-log-port* "unknown server type " ttype))))) + ;; (else (debug:print-error 0 *default-log-port* "No known transport set, transport=" transport ", using rpc") + ;; (rpc-transport:launch run-id))))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== -;; Get the transport -(define (server:get-transport) - (if *transport-type* - *transport-type* - (let ((ttype (string->symbol - (or (args:get-arg "-transport") - (configf:lookup *configdat* "server" "transport") - "rpc")))) - (set! *transport-type* ttype) - ttype))) - +;; set global *transport-type* based on -transport switch and serer/transport configuration. default http otherwise. +;; called by launch:setup +(define (server:set-transport) + (let ((ttype (string->symbol + (or (args:get-arg "-transport") + (configf:lookup *configdat* "server" "transport") + "rpc")))) + (set! *transport-type* ttype) + ttype)) + +;; Get the transport -- DO NOT call this from client code. In client code, this is run-id sensitive and not a global +;; For code communicating with existing run-id with a server, use: (rmt:run-id->transport-type run-id) + (define (server:get-transport) + (if *transport-type* + *transport-type* + (server:set-transport))) + ;; Generate a unique signature for this server (define (server:mk-signature) (message-digest-string (md5-primitive) (with-output-to-string (lambda () @@ -180,26 +186,32 @@ (server:run run-id) (rmt:start-server run-id))) (define (server:check-if-running run-id) (let ((tdbdat (tasks:open-db))) - (let loop ((server (tasks:get-server (db:delay-if-busy tdbdat) run-id)) + (let loop ((server (tasks:get-server-info (db:delay-if-busy tdbdat) run-id)) (trycount 0)) (if server ;; note: client:start will set *runremote*. this needs to be changed ;; also, client:start will login to the server, also need to change that. ;; ;; client:start returns #t if login was successful. ;; - (let ((res (case *transport-type* + (let* ((transport-type (rmt:run-id->transport-type run-id)) + (res (case transport-type ((http)(server:ping-server run-id (tasks:hostinfo-get-interface server) (tasks:hostinfo-get-port server))) - ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) - ;; (tasks:hostinfo-get-port server) - ;; timeout: 2)) - ))) + ((rpc) (rpc-transport:ping run-id + (tasks:hostinfo-get-interface server) + (tasks:hostinfo-get-port server))) + + (else + (debug:print-error 0 *default-log-port* "(5) Transport [" transport-type + "] specified for run-id [" run-id + "] is not implemented in rmt:send-receive. Cannot proceed.") + (exit 1))))) ;; if the server didn't respond we must remove the record (if res #t (begin (debug:print-info 0 *default-log-port* "server at " server " not responding, removing record") @@ -215,11 +227,11 @@ (let* ((host-port (let ((slst (string-split host:port ":"))) (if (eq? (length slst) 2) (list (car slst)(string->number (cadr slst))) #f))) (toppath (launch:setup)) - (server-db-dat (if (not host-port)(tasks:get-server (db:delay-if-busy tdbdat) run-id) #f))) + (server-db-dat (if (not host-port)(tasks:get-server-info (db:delay-if-busy tdbdat) run-id) #f))) (if (not run-id) (begin (debug:print-error 0 *default-log-port* "must specify run-id when doing ping, -run-id n") (print "ERROR: No run-id") (exit 1)) @@ -254,20 +266,22 @@ ((NOREPLY) #f) ((LOGIN_OK) #t) (else #f)) (loop (read-line) inl)))))) + +;; Client will call this procedure on the server via the low-level transport (http/rpc/etc) to verify its toppath matches the server's toppath. +;; A true result means client and server are associated with same megatest instance, share the same megatest.config, etc...) A false result means the client should not talk to this server. (define (server:login toppath) - (lambda (toppath) - (set! *last-db-access* (current-seconds)) - (if (equal? *toppath* toppath) - (begin - ;; (debug:print-info 2 *default-log-port* "login successful") - #t) - (begin - ;; (debug:print-info 2 *default-log-port* "login failed") - #f)))) + (set! *last-db-access* (current-seconds)) + (if (equal? *toppath* toppath) + (begin + ;; (debug:print-info 2 *default-log-port* "login successful") + #t) + (begin + ;; (debug:print-info 2 *default-log-port* "login failed") + #f))) (define (server:get-timeout) (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) (string->number tmo)) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -170,21 +170,21 @@ (define (tasks:hostinfo-get-pubport vec) (vector-ref vec 3)) (define (tasks:hostinfo-get-transport vec) (vector-ref vec 4)) (define (tasks:hostinfo-get-pid vec) (vector-ref vec 5)) (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) -(define (tasks:server-lock-slot mdb run-id) +(define (tasks:server-lock-slot mdb run-id transport-type) (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot") (if (< (tasks:num-in-available-state mdb run-id) 4) (begin - (tasks:server-set-available mdb run-id) + (tasks:server-set-available mdb run-id transport-type) (thread-sleep! (/ (random 1500) 1000)) ;; (thread-sleep! 2) ;; Try removing this. It may not be needed. (tasks:server-am-i-the-server? mdb run-id)) #f)) ;; register that this server may come online (first to register goes though with the process) -(define (tasks:server-set-available mdb run-id) +(define (tasks:server-set-available mdb run-id transport-type) (sqlite3:execute mdb "INSERT INTO servers (pid,hostname,port,pubport,start_time, priority,state,mt_version,heartbeat, interface,transport,run_id) VALUES(?, ?, ?, ?, strftime('%s','now'), ?, ?, ?,-1,?, ?, ?);" (current-process-id) ;; pid @@ -194,11 +194,11 @@ (random 1000) ;; priority (used a tiebreaker on get-available) "available" ;; state (common:version-signature) ;; mt_version -1 ;; interface ;; (conc (server:get-transport)) ;; transport - (conc *transport-type*) ;; transport + (conc transport-type) ;; transport run-id )) (define (tasks:num-in-available-state mdb run-id) (let ((res 0)) @@ -291,10 +291,14 @@ (if (> remtries 0) (loop (get-rand-port)(- remtries 1)) (get-rand-port)) port)))))) +;; there can be multiple servers spawned for the same runid. we want exactly zero or one servers per runid. The caller is a nascent server. It wants to know if it should proceed or if it is redundant. this function chooses a winner and tells me if I am the winner. Alternative is lots of runaway servers. Nobody wants that, trust me. +;; +;; algo: get all server info entries for this runid. Each nascent server will insert an entry for its runid before getting here. Entries are visible globally. If current hostname and current processid match first entry, then yes I am the server; return server-id as my prize for winning. Otherwise, I am not the server; return #f. +;; (define (tasks:server-am-i-the-server? mdb run-id) (let* ((all (tasks:server-get-servers-vying-for-run-id mdb run-id)) (first (if (null? all) #f;; (begin (debug:print-error 0 *default-log-port* "no servers listed, should be at least one by now.") ;; (sqlite3:finalize! mdb) @@ -327,27 +331,62 @@ mdb (conc "SELECT " selstr " FROM servers WHERE run_id=? AND state in ('available','running','dbprep') ORDER BY start_time DESC;") run-id) (vector header res))) -(define (tasks:get-server mdb run-id #!key (retries 10)) + +;; BB> bb opinion - want to push responsibility into api (encapsulation), like waiting if db is busy and finding the db handle in the first place. why should the caller need to be concerned?? If my opinion carries, we'll remove the bb- and make other needful adjustments. +(define (bb-mdb-inserter mdb-expecting-proc mdbless-args) + (let ((mdb (db:delay-if-busy (tasks:open-db)))) + (apply mdb-expecting-proc (cons mdb mdbless-args)))) + +(define (tasks:bb-server-force-clean-run-record . args) + (bb-mdb-inserter tasks:server-force-clean-run-record args)) + +(define (tasks:bb-server-lock-slot . args) + (bb-mdb-inserter tasks:server-lock-slot args)) + +(define (tasks:bb-server-set-interface-port . args) + (bb-mdb-inserter tasks:server-set-interface-port args)) + +(define (tasks:bb-server-am-i-the-server? . args) + (bb-mdb-inserter tasks:server-am-i-the-server? args)) + +(define (tasks:bb-server-set-state! . args) + (bb-mdb-inserter tasks:server-set-state! args)) + +(define (tasks:bb-get-server-info . args) + (bb-mdb-inserter tasks:get-server-info args)) + +(define (tasks:bb-num-in-available-state . args) + (bb-mdb-inserter tasks:num-in-available-state args)) + +(define (tasks:bb-server-delete-records-for-this-pid . args) + (bb-mdb-inserter tasks:server-delete-records-for-this-pid args)) + +(define (tasks:bb-server-delete-record . args) + (bb-mdb-inserter tasks:server-delete-record args)) + + +;; BB: renaming tasks:get-server to get-server-info to make clear we aren't creating servers here +(define (tasks:get-server-info mdb run-id #!key (retries 10)) (let ((res #f) (best #f)) (handle-exceptions exn (begin (print-call-chain (current-error-port)) - (debug:print 0 *default-log-port* "WARNING: tasks:get-server db access error.") + (debug:print 0 *default-log-port* "WARNING: tasks:get-server-info db access error.") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 *default-log-port* " for run " run-id) (print-call-chain (current-error-port)) (if (> retries 0) (begin - (debug:print 0 *default-log-port* " trying call to tasks:get-server again in 10 seconds") + (debug:print 0 *default-log-port* " trying call to tasks:get-server-info again in 10 seconds") (thread-sleep! 10) - (tasks:get-server mdb run-id retries: (- retries 0))) - (debug:print 0 *default-log-port* "10 tries of tasks:get-server all crashed and burned. Giving up and returning \"no server found\""))) + (tasks:get-server-info mdb run-id retries: (- retries 0))) + (debug:print 0 *default-log-port* "10 tries of tasks:get-server-info all crashed and burned. Giving up and returning \"no server found\""))) (sqlite3:for-each-row (lambda (id interface port pubport transport pid hostname) (set! res (vector id interface port pubport transport pid hostname))) mdb ;; removed: @@ -394,21 +433,21 @@ ;; try to start a server and wait for it to be available ;; (define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries) ;; ensure a server is running for this run - (let loop ((server-dat (tasks:get-server (db:delay-if-busy tdbdat) run-id)) + (let loop ((server-dat (tasks:get-server-info (db:delay-if-busy tdbdat) run-id)) (delay-time 0)) (if (and (not server-dat) (< delay-time delay-max-tries)) (begin (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id) (debug:print 0 *default-log-port* "Try starting server for run-id " run-id)) (thread-sleep! (/ (random 2000) 1000)) (server:kind-run run-id) (thread-sleep! (min delay-time 1)) - (loop (tasks:get-server (db:delay-if-busy tdbdat) run-id)(+ delay-time 1)))))) + (loop (tasks:get-server-info (db:delay-if-busy tdbdat) run-id)(+ delay-time 1)))))) (define (tasks:get-all-servers mdb) (let ((res '())) (sqlite3:for-each-row (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) @@ -443,18 +482,18 @@ ;; look up a server by run-id and send it a kill, also delete the record for that server ;; (define (tasks:kill-server-run-id run-id #!key (tag "default")) (let* ((tdbdat (tasks:open-db)) - (sdat (tasks:get-server (db:delay-if-busy tdbdat) run-id))) + (sdat (tasks:get-server-info (db:delay-if-busy tdbdat) run-id))) (if sdat (let ((hostname (vector-ref sdat 6)) (pid (vector-ref sdat 5)) (server-id (vector-ref sdat 0))) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed") (debug:print-info 0 *default-log-port* "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) - (tasks:kill-server hostname pid) + (tasks:kill-server hostname pid kill-switch: "-9") ;; BB: added -9, let's not be kind here. we need it to die so it isn't a 100% cpu zombie (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 *default-log-port* "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) ))