Fixed couple issues with the throttle by api load but still not sure it is working right. Cleaned up few other transport things.
Closed-Leaf
check-in: e7dcebe686 user: matt tags: multi-server-hack
Default to treating a log as recent instead of ancient. This might be part of the run-away servers issue
check-in: 31427e83dd user: matt tags: multi-server-hack
Reduced threshold for server pushback on clients to 20 api calls/parallel. Added couple more calls to check if server is running before starting a new one.
check-in: 37e149d637 user: matt tags: multi-server-hack
(debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer")
(print-call-chain (current-error-port))
(debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
(vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens
(cond
((not (vector? dat)) ;; it is an error to not receive a vector
(vector #f #f "remote must be called with a vector") )
((> *api-process-request-count* 20)
(vector #f 'overloaded))
(else
(let* ((cmd-in (vector-ref dat 0))
(cmd (if (symbol? cmd-in)
cmd-in
(string->symbol cmd-in)))
(params (vector-ref dat 1))
(start-t (current-milliseconds))
(begin
(debug:print-error 0 *default-log-port* "failed to start or connect to server")
(exit 1))
;;
;; Alternatively here, we can get the list of candidate servers and work our way
;; through them searching for a good one.
;;
(let* ((server-dat (server:get-first-best areapath))
(let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath))
(runremote (or area-dat *runremote*)))
(if (not server-dat) ;; no server found
(client:setup-http areapath remaining-tries: (- remaining-tries 1))
(let ((host (cadr server-dat))
(port (caddr server-dat)))
(debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
(if (and (not area-dat)
(defstruct remote
(hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag )
(server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f))
(last-server-check 0) ;; last time we checked to see if the server was alive
(conndat #f)
(transport *transport-type*)
(server-timeout (or (server:get-timeout) 100))
(force-server #f)) ;; default to 100 seconds (server-timeout (or (server:get-timeout) 100)) ;; default to 100 seconds
(force-server #f)
(ro-mode #f)
(ro-mode-checked #f)) ;; flag that indicates we have checked for ro-mode
;; launching and hosts
(defstruct host
(reachable #f)
(last-update 0)
(last-used 0)
(last-cpuload 1))
(read (open-input-string (z3:decode-buffer (base64:base64-decode instr))))))
;; dot-locking egg seems not to work, using this for now
;; if lock is older than expire-time then remove it and try again
;; to get the lock
;;
(define (common:simple-file-lock fname #!key (expire-time 300))
(handle-exceptions
exn
#f ;; don't really care what went wrong right now. NOTE: I have not seen this one actually fail.
(if (file-exists? fname)
(if (> (- (current-seconds)(file-modification-time fname)) expire-time)
(begin
(delete-file* fname)
(common:simple-file-lock fname expire-time: expire-time))
#f)
(let ((key-string (conc (get-host-name) "-" (current-process-id))))
(with-output-to-file fname
(lambda ()
(print key-string)))
(thread-sleep! 0.25)
(if (file-exists? fname)
(with-input-from-file fname
(lambda ()
(equal? key-string (read-line))))
#f))))
(if (file-exists? fname)
(if (> (- (current-seconds)(file-modification-time fname)) expire-time)
(begin
(delete-file* fname)
(common:simple-file-lock fname expire-time: expire-time))
#f)
(let ((key-string (conc (get-host-name) "-" (current-process-id))))
(with-output-to-file fname
(lambda ()
(print key-string)))
(thread-sleep! 0.25)
(if (file-exists? fname)
(with-input-from-file fname
(lambda ()
(equal? key-string (read-line))))
#f)))))
(define (common:simple-file-lock-and-wait fname #!key (expire-time 300))
(let ((end-time (+ expire-time (current-seconds))))
(let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time)))
(if got-lock
#t
(if (> end-time (current-seconds))
(begin
(thread-sleep! 3)
(loop (common:simple-file-lock fname expire-time: expire-time))
(loop (common:simple-file-lock fname expire-time: expire-time)))
#f)))))
(define (common:simple-file-release-lock fname)
(handle-exceptions
exn
#f ;; I don't really care why this failed (at least for now)
(delete-file* fname))
(delete-file* fname)))
;;======================================================================
;; S T A T E S A N D S T A T U S E S
;;======================================================================
(define *common:std-states*
'((0 "ARCHIVED")
(start-res (http-transport:client-connect host port))
(ping-res (rmt:login-no-auto-client-setup start-res)))
(if (and start-res
ping-res)
(let ((url (http-transport:server-dat-make-url start-res)))
(remote-conndat-set! *runremote* start-res)
(remote-server-url-set! *runremote* url)
(if (server:ping url)
(debug:print-info 0 *default-log-port* "connected to " url " using CMDINFO data."))
(debug:print-info 0 *default-log-port* "connected to " url " using CMDINFO data.")
(begin
(debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " url)
(remote-conndat-set! *runremote* #f)
(remote-server-url-set! *runremote* #f))))
(debug:print-info 0 *default-log-port* "received " host ":" port " for url but could not connect.")
)))))))
;; NFS might not have propagated the directory meta data to the run host - give it time if needed
(let loop ((count 0))
(if (or (file-exists? top-path)
(> count 10))
(change-directory top-path)
;; do all the prep locked under the rmt-mutex
(mutex-lock! *rmt-mutex*)
;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
;; 3. do the query, if on homehost use local access
;;
(let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value
(areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
(dbfile (conc *toppath* "/megatest.db"))
(readonly-mode (not (file-write-access? dbfile))) ;; TODO: use dbstruct or runremote to figure this out in future
(runremote (or area-dat *runremote*))) (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value
(areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
(runremote (or area-dat
*runremote*))
(readonly-mode (if (and runremote
(remote-ro-mode-checked runremote))
(remote-ro-mode runremote)
(let* ((dbfile (conc *toppath* "/megatest.db"))
(ro-mode (not (file-write-access? dbfile)))) ;; TODO: use dbstruct or runremote to figure this out in future
(if runremote
(begin
(remote-ro-mode-set! runremote ro-mode)
(remote-ro-mode-checked-set! runremote #t)
ro-mode)
ro-mode)))))
;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
(cond
;; give up if more than 15 attempts
((> attemptnum 15)
(debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.")
(exit 1))
(let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 30)))) ;; add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts
(< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time)))
(debug:print-info 12 *default-log-port* "rmt:send-receive, case 8")
(remote-conndat-set! runremote #f)
(mutex-unlock! *rmt-mutex*)
(rmt:send-receive cmd rid params attemptnum: attemptnum))
;; ensure we have a record for our connection for given area
((not runremote)
((not runremote) ;; can remove this one. should never get here.
(set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration
(mutex-unlock! *rmt-mutex*)
(debug:print-info 12 *default-log-port* "rmt:send-receive, case 1")
(rmt:send-receive cmd rid params attemptnum: attemptnum))
;; ensure we have a homehost record
((not (pair? (remote-hh-dat runremote))) ;; not on homehost
(thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one
(not (remote-conndat runremote)))
(and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost
(not (remote-conndat runremote)))) ;; and no connection
(debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote))
(mutex-unlock! *rmt-mutex*)
(if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up?
(server:start-and-wait *toppath*)
(server:start-and-wait *toppath*))
(remote-force-server-set! runremote (common:force-server?))
(remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http
(rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as
;; all set up if get this far, dispatch the query
((and (not (remote-force-server runremote))
(cdr (remote-hh-dat runremote))) ;; we are on homehost
(mutex-unlock! *rmt-mutex*)
(thread-join! log-rotate)
(pop-directory)))
;; given a path to a server log return: host port startseconds
;;
(define (server:logf-get-start-info logf)
(let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs
(handle-exceptions
exn
(list #f #f #f) ;; no idea what went wrong, call it a bad server
(with-input-from-file
logf
(lambda ()
(let loop ((inl (read-line))
(lnum 0))
(if (not (eof-object? inl))
(let ((mlst (string-match rx inl)))
(if (not mlst)
(if (< lnum 500) ;; give up if more than 500 lines of server log read
(loop (read-line)(+ lnum 1))
(list #f #f #f))
(let ((dat (cdr mlst)))
(list (car dat) ;; host
(string->number (cadr dat)) ;; port
(string->number (caddr dat))))))
(list #f #f #f)))))))
(with-input-from-file
logf
(lambda ()
(let loop ((inl (read-line))
(lnum 0))
(if (not (eof-object? inl))
(let ((mlst (string-match rx inl)))
(if (not mlst)
(if (< lnum 500) ;; give up if more than 500 lines of server log read
(loop (read-line)(+ lnum 1))
(list #f #f #f))
(let ((dat (cdr mlst)))
(list (car dat) ;; host
(string->number (cadr dat)) ;; port
(string->number (caddr dat))))))
(list #f #f #f))))))))
;; get a list of servers with all relevant data
;; ( mod-time host port start-time pid )
;;
(define (server:get-list areapath #!key (limit #f))
(let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
(day-seconds (* 24 60 60)))
;; if the directory exists continue to get the list
;; otherwise attempt to create the logs dir and then
;; continue
(if (if (directory-exists? (conc areapath "/logs"))
#t '()
(if (file-write-access? areapath)
(begin
(condition-case
(create-directory (conc areapath "/logs") #t)
(exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
(exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list.")))
(directory-exists? (conc areapath "/logs")))
#f))
'()))
(let* ((server-logs (glob (conc areapath "/logs/server-*.log")))
(num-serv-logs (length server-logs)))
(if (null? server-logs)
'()
(let loop ((hed (car server-logs))
(tal (cdr server-logs))
(res '()))
(let* ((mod-time (handle-exceptions
exn
exn
0 (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
(current-seconds) ;; 0
(file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
(down-time (- (current-seconds) mod-time))
(serv-dat (if (or (< num-serv-logs 10)
(< down-time day-seconds))
(server:logf-get-start-info hed)
'())) ;; don't waste time processing server files not touched in the past day if there are more than ten servers to look at
(serv-rec (cons mod-time serv-dat))
(fmatch (string-match fname-rx hed))