added script manyservers.sh which will be basis for a test of server start & collision resilency
check-in: 3d418034bd user: bjbarcla tags: v1.63-server-fix
updated server launch to handle collisions gracefully (but introduced other issues - moving to sidebranch to debug)
check-in: c9880665a8 user: bjbarcla tags: v1.63-server-fix
)
0)
(define (std-signal-handler signum)
;; (signal-mask! signum)
(set! *time-to-exit* #t)
;;(BB> "got signal "signum)
(debug:print-error 0 *default-log-port* "Received signal " signum " exiting promptly")
;; (std-exit-procedure) ;; shouldn't need this since we are exiting and it will be called anyway
(exit))
(set-signal-handler! signal/int std-signal-handler) ;; ^C
(set-signal-handler! signal/term std-signal-handler)
;; (set-signal-handler! signal/stop std-signal-handler) ;; ^Z NO, do NOT handle ^Z!
(last-access 0)
(server-timeout (server:get-timeout))
(server-going #f))
(let loop ((count 0)
(server-state 'available)
(bad-sync-count 0)
(start-time (current-milliseconds)))
;;(BB> "http-transport: top of loop; count="count" server-state="server-state" bad-sync-count="bad-sync-count" server-going="server-going)
;; Use this opportunity to sync the tmp db to megatest.db
(if (not server-going) ;; *dbstruct-db*
;; Removed code is pasted below (keeping it around until we are clear it is not needed).
;; no *dbstruct-db* yet, set running after our first pass through and start the db
(if (eq? server-state 'available)
(let ((new-server-id (tasks:server-am-i-the-server? (db:delay-if-busy tdbdat) run-id))) ;; try to ensure no double registering of servers
(if (equal? new-server-id server-id)
(begin
(tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep")
;;(BB> "http-transport: ->dbprep")
(thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access
(set! *dbstruct-db* (db:setup)) ;; run-id))
(set! server-going #t)
(tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")
;;(BB> "http-transport: ->running")
(server:write-dotserver *toppath* (conc iface ":" port))
(thread-start! *watchdog*)
(delete-file* (conc *toppath* "/.starting-server")))
(server:complete-attempt *toppath*))
(begin ;; gotta exit nicely
;;(BB> "http-transport: ->collision")
(tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision")
(http-transport:server-shutdown server-id port))))))
;; when things go wrong we don't want to be doing the various queries too often
;; so we strive to run this stuff only every four seconds or so.
(let* ((sync-time (- (current-milliseconds) start-time))
(rem-time (quotient (- 4000 sync-time) 1000)))
(if (and (<= rem-time 4)
(> rem-time 0))
(thread-sleep! rem-time)))
(let ((targ (or (args:get-arg "-reqtarg")(args:get-arg "-target"))))
(if targ (setenv "MT_TARGET" targ)))
;; The watchdog is to keep an eye on things like db sync etc.
;;
(define *watchdog* (make-thread common:watchdog "Watchdog thread"))
(if (not (args:get-arg "-server"))
(thread-start! *watchdog*)
(BB> "thread-start! watchdog")
(thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog
;;(BB> "thread-start! watchdog")
(if (args:get-arg "-log")
(let ((oup (open-output-file (args:get-arg "-log"))))
(debug:print-info 0 *default-log-port* "Sending log output to " (args:get-arg "-log"))
(set! *default-log-port* oup)))
(if (or (args:get-arg "-h")
(args:get-arg "-help")
;;======================================================================
;; Exit and clean up
;;======================================================================
(if (not *didsomething*)
(debug:print 0 *default-log-port* help))
(BB> "thread-join! watchdog")
(thread-join! *watchdog*)
;;(BB> "thread-join! watchdog")
;; join the watchdog thread if it has been thread-start!ed (it may not have been started in the case of a server that never enters running state)
;; (symbols returned by thread-state: created ready running blocked suspended sleeping terminated dead)
(if (thread? *watchdog*)
(case (thread-state *watchdog*)
((ready running blocked sleeping terminated dead)
(thread-join! *watchdog*))))
(set! *time-to-exit* #t)
(if (not (eq? *globalexitstatus* 0))
(if (or (args:get-arg "-run")(args:get-arg "-runtests")(args:get-arg "-runall"))
(begin
(debug:print 0 *default-log-port* "NOTE: Subprocesses with non-zero exit code detected: " *globalexitstatus*)
(exit 0))
(case *globalexitstatus*
((0)(exit 0))
((1)(exit 1))
((2)(exit 2))
(else (exit 3)))))
;;
;; all routes though here end in exit ...
;;
;; start_server
;;
(define (server:launch run-id transport-type)
(BB> "server:launch fired for run-id="run-id" transport-type="transport-type)
;;(BB> "server:launch fired for run-id="run-id" transport-type="transport-type)
(let ((attempt-in-progress (server:start-attempted? *toppath*)))
(when attempt-in-progress
(debug:print-info 0 *default-log-port* "Server start attempt in progress in other process (=> "attempt-in-progress"<=). Aborting server launch attempt in this process ("(current-process-id)")")
(exit)))
(let ((dotserver-url (server:check-if-running *toppath*)))
(when dotserver-url
(debug:print-info 0 *default-log-port* "Server already running (=> "dotserver-url"<=). Aborting server launch attempt in this process ("(current-process-id)")")
(exit)
))
(case transport-type
((http)(http-transport:launch run-id))
;;((nmsg)(nmsg-transport:launch run-id))
((rpc) (rpc-transport:launch run-id))
(else (debug:print-error 0 *default-log-port* "unknown server type " transport-type))))
;; (else (debug:print-error 0 *default-log-port* "No known transport set, transport=" transport ", using rpc")
;; (rpc-transport:launch run-id)))))
;; Given a run id start a server process ### NOTE ### > file 2>&1
;; if the run-id is zero and the target-host is set
;; try running on that host
;; incidental: rotate logs in logs/ dir.
;;
(define (server:run areapath) ;; areapath is ignored for now.
(let* ((curr-host (get-host-name))
(attempt-in-progress (server:start-attempted? areapath))
(dot-server-url (server:check-if-running areapath))
(curr-ip (server:get-best-guess-address curr-host))
(curr-pid (current-process-id))
(homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
(target-host (car homehost))
(testsuite (common:get-testsuite-name))
(logfile (conc *toppath* "/logs/server.log"))
(logfile (conc areapath "/logs/server.log"))
(cmdln (conc (common:get-megatest-exe)
" -server " (or target-host "-") " -run-id " 0 (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
(conc " -daemonize -log " logfile)
"")
" -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &")))))
(log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")))
;; we want the remote server to start in *toppath* so push there
(push-directory *toppath*)
(debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
(thread-start! log-rotate)
(push-directory areapath)
(cond
(attempt-in-progress
(debug:print 0 *default-log-port* "INFO: Not trying to start server because attempt is in progress: "attempt-in-progress))
(dot-server-url
(debug:print 0 *default-log-port* "INFO: Not trying to start server because one is already running : "dot-server-url))
(else
(debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
(thread-start! log-rotate)
;; host.domain.tld match host?
(if (and target-host
;; look at target host, is it host.domain.tld or ip address and does it
;; match current ip or hostname
(not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
(not (equal? curr-ip target-host)))
(begin
(debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
(setenv "TARGETHOST" target-host)))
(setenv "TARGETHOST_LOGF" logfile)
(common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever
(system (conc "nbfake " cmdln))
(unsetenv "TARGETHOST_LOGF")
(if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
(thread-join! log-rotate)
(pop-directory)))
;; host.domain.tld match host?
(if (and target-host
;; look at target host, is it host.domain.tld or ip address and does it
;; match current ip or hostname
(not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
(not (equal? curr-ip target-host)))
(begin
(debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
(setenv "TARGETHOST" target-host)))
(setenv "TARGETHOST_LOGF" logfile)
(common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever
(system (conc "nbfake " cmdln))
(unsetenv "TARGETHOST_LOGF")
(if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
(thread-join! log-rotate)
(pop-directory)))))
(define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value.
(if *my-client-signature* *my-client-signature*
(let ((sig (server:mk-signature)))
(set! *my-client-signature* sig)
*my-client-signature*)))
(set! *my-client-signature* sig)
*my-client-signature*)))
;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
(define (server:kind-run areapath)
(let ((last-run-time (hash-table-ref/default *server-kind-run* areapath #f)))
(if (or (not last-run-time)
(> (- (current-seconds) last-run-time) 30))
(begin
(server:run areapath)
(hash-table-set! *server-kind-run* areapath (current-seconds))))))
;; The generic run a server command. Dispatches the call to server 0 if run-id != 0
;;
;; (define (server:try-running run-id)
;; (if (eq? run-id 0)
;; (server:run run-id)
;; (rmt:start-server run-id)))
(define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG.
(define (server:attempting-start areapath)
(with-output-to-file
(conc areapath "/.starting-server")
(lambda ()
(print (current-process-id) " on " (get-host-name)))))
(define (server:complete-attempt areapath)
(delete-file* (conc areapath "/.starting-server")))
(define (server:start-attempted? areapath)
(let ((flagfile (conc areapath "/.starting-server")))
(handle-exceptions
exn
#f ;; if things go wrong pretend we can't see the file
(cond
(and (file-exists? flagfile)
(< (- (current-seconds)
(file-modification-time flagfile))
15))))) ;; exists and less than 15 seconds old
((and (file-exists? flagfile)
(< (- (current-seconds)
(file-modification-time flagfile))
15)) ;; exists and less than 15 seconds old
(with-input-from-file flagfile (lambda () (read-line))))
((file-exists? flagfile) ;; it is stale.
(server:complete-attempt areapath)
#f)
(else #f)))))
(define (server:read-dotserver areapath)
(let ((dotfile (conc areapath "/.server")))
(handle-exceptions
exn
#f ;; if things go wrong pretend we can't see the file
(if (and (file-exists? dotfile)
(if dotserver
(let* ((res (case *transport-type*
((http)(server:ping-server dotserver))
;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
)))
(if res
dotserver
(begin
(server:remove-dotserver-file areapath ".*") ;; remove stale dotserver
#f))
#f)))
#f)))
;; called in megatest.scm, host-port is string hostname:port
;;
;; NOTE: This is NOT called directly from clients as not all transports support a client running
;; in the same process as the server.
;;