;; Copyright 2006-2017, Matthew Welland.
;;
;; This file is part of Megatest.
;;
;; Megatest is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;;
;; Megatest is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with Megatest. If not, see <http://www.gnu.org/licenses/>.
;;
(require-extension (srfi 18) extras tcp s11n)
(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest
directory-utils posix-extras matchable utils)
(use spiffy uri-common intarweb http-client spiffy-request-vars)
(declare (unit server))
(declare (uses commonmod))
(declare (uses common))
(declare (uses db))
(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running.
;; (declare (uses synchash))
(declare (uses http-transport))
;;(declare (uses rpc-transport))
(declare (uses launch))
;; (declare (uses daemon))
(import commonmod)
(include "common_records.scm")
(include "db_records.scm")
(define (server:make-server-url hostport)
(if (not hostport)
#f
(conc "http://" (car hostport) ":" (cadr hostport))))
(define *server-loop-heart-beat* (current-seconds))
;;======================================================================
;; P K T S S T U F F
;;======================================================================
;; ???
;;======================================================================
;; P K T S S T U F F
;;======================================================================
;; ???
;;======================================================================
;; S E R V E R
;;======================================================================
;; Call this to start the actual server
;;
;;======================================================================
;; S E R V E R U T I L I T I E S
;;======================================================================
;; Get the transport
(define (server:get-transport)
(if *transport-type*
*transport-type*
(let ((ttype (string->symbol
(or (args:get-arg "-transport")
(configf:lookup *configdat* "server" "transport")
"rpc"))))
(set! *transport-type* ttype)
ttype)))
;; Generate a unique signature for this server
(define (server:mk-signature)
(message-digest-string (md5-primitive)
(with-output-to-string
(lambda ()
(write (list (current-directory)
(current-process-id)
(argv)))))))
(define (server:get-client-signature)
(if *my-client-signature* *my-client-signature*
(let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic
(set! *my-client-signature* sig)
*my-client-signature*)))
(define (server:get-server-id)
(if *server-id* *server-id*
(let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic
(set! *server-id* sig)
*server-id*)))
;; ;; When using zmq this would send the message back (two step process)
;; ;; with spiffy or rpc this simply returns the return data to be returned
;; ;;
;; (define (server:reply return-addr query-sig success/fail result)
;; (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result)
;; ;; (send-message pubsock target send-more: #t)
;; ;; (send-message pubsock
;; (case (server:get-transport)
;; ((rpc) (db:obj->string (vector success/fail query-sig result)))
;; ((http) (db:obj->string (vector success/fail query-sig result)))
;; ((fs) result)
;; (else
;; (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
;; result)))
;; Given an area path, start a server process ### NOTE ### > file 2>&1
;; if the target-host is set
;; try running on that host
;; incidental: rotate logs in logs/ dir.
;;
(define (server:run areapath) ;; areapath is *toppath* for a given testsuite area
(let* ((testsuite (common:get-testsuite-name))
(logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
(profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
""))
(cmdln (conc (common:get-megatest-exe)
" -server - ";; (or target-host "-")
(if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
" -daemonize "
"")
;; " -log " logfile
" -m testsuite:" testsuite
" " profile-mode
)) ;; (conc " >> " logfile " 2>&1 &")))))
(log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!?
(load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0)))
;; we want the remote server to start in *toppath* so push there
(push-directory areapath)
(debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
(thread-start! log-rotate)
;; host.domain.tld match host?
;; (if (and target-host
;; ;; look at target host, is it host.domain.tld or ip address and does it
;; ;; match current ip or hostname
;; (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
;; (not (equal? curr-ip target-host)))
;; (begin
;; (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
;; (setenv "TARGETHOST" target-host)))
;;
(setenv "TARGETHOST_LOGF" logfile)
(thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time
(debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
(system (conc "nbfake " cmdln))
(unsetenv "TARGETHOST_LOGF")
;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
(thread-join! log-rotate)
(pop-directory)))
;; given a path to a server log return: host port startseconds server-id
;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let
;; example of what it's looking for in the log file:
;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4
(define (server:logf-get-start-info logf)
(let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id
(dbprep-rx (regexp "^SERVER: dbprep"))
(dbprep-found 0)
(bad-dat (list #f #f #f #f #f)))
(handle-exceptions
exn
(begin
;; WARNING: this is potentially dangerous to blanket ignore the errors
(if (file-exists? logf)
(debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn))
bad-dat) ;; no idea what went wrong, call it a bad server
(with-input-from-file
logf
(lambda ()
(let loop ((inl (read-line))
(lnum 0))
(if (not (eof-object? inl))
(let ((mlst (string-match server-rx inl))
(dbprep (string-match dbprep-rx inl)))
(if dbprep (set! dbprep-found 1))
(if (not mlst)
(if (< lnum 500) ;; give up if more than 500 lines of server log read
(loop (read-line)(+ lnum 1))
(begin
(debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf )
bad-dat))
(match mlst
((_ host port start server-id pid)
(list host
(string->number port)
(string->number start)
server-id
(string->number pid)))
(else
(debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst)
bad-dat))))
(begin
(if dbprep-found
(begin
(debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time))
(thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting?
(debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds))))
bad-dat))))))))
;; ;; get a list of servers from the log files, with all relevant data
;; ;; ( mod-time host port start-time pid )
;; ;;
;; (define (server:get-list areapath #!key (limit #f))
;; (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
;; (day-seconds (* 24 60 60)))
;; ;; if the directory exists continue to get the list
;; ;; otherwise attempt to create the logs dir and then
;; ;; continue
;; (if (if (directory-exists? (conc areapath "/logs"))
;; '()
;; (if (file-write-access? areapath)
;; (begin
;; (condition-case
;; (create-directory (conc areapath "/logs") #t)
;; (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
;; (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn)))
;; (directory-exists? (conc areapath "/logs")))
;; '()))
;;
;; ;; Get the list of server logs.
;; (let* (
;; ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers.
;; ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'")))
;; (server-logs (glob (conc areapath "/logs/server-*-*.log")))
;; (num-serv-logs (length server-logs)))
;; (if (or (null? server-logs) (= num-serv-logs 0))
;; (let ()
;; (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time))
;; '()
;; )
;; (let loop ((hed (string-chomp (car server-logs)))
;; (tal (cdr server-logs))
;; (res '()))
;; (let* ((mod-time (handle-exceptions
;; exn
;; (begin
;; (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn)
;; (current-seconds)) ;; 0
;; (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
;; (down-time (- (current-seconds) mod-time))
;; (serv-dat (if (or (< num-serv-logs 10)
;; (< down-time 900)) ;; day-seconds))
;; (server:logf-get-start-info hed)
;; '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at
;; (serv-rec (cons mod-time serv-dat))
;; (fmatch (string-match fname-rx hed))
;; (pid (if fmatch (string->number (list-ref fmatch 2)) #f))
;; (new-res (if (null? serv-dat)
;; res
;; (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let
;; (if (null? tal)
;; (if (and limit
;; (> (length new-res) limit))
;; new-res ;; (take new-res limit) <= need intelligent sorting before this will work
;; new-res)
;; (loop (string-chomp (car tal)) (cdr tal) new-res)))))))))
#;(define (server:get-num-alive srvlst)
(let ((num-alive 0))
(for-each
(lambda (server)
(handle-exceptions
exn
(begin
(debug:print-info 0 *default-log-port* "Unable to get server start-time and/or mod-time from " server ", exn=" exn))
(match-let (((mod-time host port start-time server-id pid)
server))
(let* ((uptime (- (current-seconds) mod-time))
(runtime (if start-time
(- mod-time start-time)
0)))
(if (< uptime 5)(set! num-alive (+ num-alive 1)))))))
srvlst)
num-alive))
;; ;; given a list of servers get a list of valid servers, i.e. at least
;; ;; 10 seconds old, has started and is less than 1 hour old and is
;; ;; active (i.e. mod-time < 10 seconds
;; ;;
;; ;; mod-time host port start-time pid
;; ;;
;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off
;; ;; and servers should stick around for about two hours or so.
;; ;;
;; (define (server:get-best srvlst)
;; (let* ((nums (server:get-num-servers))
;; (now (current-seconds))
;; (slst (sort
;; (filter (lambda (rec)
;; (if (and (list? rec)
;; (> (length rec) 2))
;; (let ((start-time (list-ref rec 3))
;; (mod-time (list-ref rec 0)))
;; ;; (print "start-time: " start-time " mod-time: " mod-time)
;; (and start-time mod-time
;; (> (- now start-time) 0) ;; been running at least 0 seconds
;; (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds
;; (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set
;; (< (- now start-time)
;; (+ (- (string->number (configf:lookup *configdat* "server" "runtime"))
;; 180)
;; (random 360)))) ;; under one hour running time +/- 180
;; ))
;; #f))
;; srvlst)
;; (lambda (a b)
;; (< (list-ref a 3)
;; (list-ref b 3))))))
;; (if (> (length slst) nums)
;; (take slst nums)
;; slst)))
;; ;; switch from server:get-list to server:get-servers-info
;; ;;
;; (define (server:get-first-best areapath)
;; (let ((srvrs (server:get-best (server:get-list areapath))))
;; (if (and srvrs
;; (not (null? srvrs)))
;; (car srvrs)
;; #f)))
;;
;; (define (server:get-rand-best areapath)
;; (let ((srvrs (server:get-best (server:get-list areapath))))
;; (if (and (list? srvrs)
;; (not (null? srvrs)))
;; (let* ((len (length srvrs))
;; (idx (random len)))
;; (list-ref srvrs idx))
;; #f)))
(define (server:record->id servr)
(handle-exceptions
exn
(begin
(debug:print-info 0 *default-log-port* "Unable to get server id from " servr ", exn=" exn)
#f)
(match-let (((host port start-time server-id pid)
servr))
(if server-id
server-id
#f))))
(define (server:record->url servr)
(handle-exceptions
exn
(begin
(debug:print-info 0 *default-log-port* "Unable to get server url from " servr ", exn=" exn)
#f)
(match-let (((host port start-time server-id pid)
servr))
(if (and host port)
(conc host ":" port)
#f))))
;; if server-start-last exists, and wasn't old enough, wait <idle time> + 1, then call this function recursively until it is old enough.
;; if it is old enough, overwrite it and wait 0.25 seconds.
;; if it then has the wrong server key, wait <idle time> + 1 and call this function recursively.
;;
#;(define (server:wait-for-server-start-last-flag areapath)
(let* ((start-flag (conc areapath "/logs/server-start-last"))
;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
(idletime (configf:lookup-number *configdat* "server" "idletime" default: 4))
(server-key (conc (get-host-name) "-" (current-process-id))))
(if (file-exists? start-flag)
(let* ((fmodtime (file-modification-time start-flag))
(delta (- (current-seconds) fmodtime))
(old-enough (> delta idletime))
(new-server-key ""))
;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than <idletime> seconds, and the new file still has the same server key as you just wrote, return #t.
;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process.
(if (and old-enough
(begin
(debug:print-info 2 *default-log-port* "Writing " start-flag)
(with-output-to-file start-flag (lambda () (print server-key)))
(thread-sleep! 0.25)
(set! new-server-key (with-input-from-file start-flag (lambda () (read-line))))
(equal? server-key new-server-key)))
#t
;; If either of the above conditions is not true, print a "Gating server start" message, wait <idle-time> + 1, then call this function recursively.
(begin
(debug:print-info 0 *default-log-port* "Gating server start, last start: "
(seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server"))
(thread-sleep! ( + 1 idletime))
(server:wait-for-server-start-last-flag areapath)))))))
;; oldest server alive determines host then choose random of youngest
;; five servers on that host
;;
(define (server:get-servers-info areapath)
;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.")
(let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo")))
(if (not (file-exists? servinfodir))
(create-directory servinfodir))
(let* ((allfiles (glob (conc servinfodir"/*")))
(res (make-hash-table)))
(for-each
(lambda (f)
(let* ((hostport (pathname-strip-directory f))
(serverdat (server:logf-get-start-info f)))
(match serverdat
((host port start server-id pid)
(if (and host port start server-id pid)
(hash-table-set! res hostport serverdat)
(debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat)))
(else
(debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat)))))
allfiles)
res)))
;; check the .servinfo directory, are there other servers running on this
;; or another host?
;;
;; returns #t => ok to start another server
;; #f => not ok to start another server
;;
(define (server:minimal-check areapath)
(server:clean-up-old areapath)
(let* ((srvdir (server:get-servinfo-dir areapath)) ;; (conc areapath"/.servinfo"))
(servrs (glob (conc srvdir"/*")))
(thishostip (server:get-best-guess-address (get-host-name)))
(thisservrs (glob (conc srvdir"/"thishostip":*")))
(homehostinf (server:choose-server areapath 'homehost))
(havehome (car homehostinf))
(wearehome (cdr homehostinf)))
(debug:print-info 0 *default-log-port* thishostip", have homehost: "havehome", we are homehost: "wearehome
", numservers: "(length thisservrs))
(cond
((not havehome) #t) ;; no homehost yet, go for it
((and havehome wearehome (< (length thisservrs) 20)) #t) ;; we are home and less than 20 servers, ok to start another
((and havehome (not wearehome)) #f) ;; we are not the home host
((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running
(else
(debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs)
#t))))
(define server-last-start 0)
;; oldest server alive determines host then choose random of youngest
;; five servers on that host
;;
;; mode:
;; best - get best server (random of newest five)
;; home - get home host based on oldest server
;; info - print info
(define (server:choose-server areapath #!optional (mode 'best))
;; age is current-starttime
;; find oldest alive
;; 1. sort by age ascending and ping until good
;; find alive rand from youngest
;; 1. sort by age descending
;; 2. take five
;; 3. check alive, discard if not and repeat
;; first we clean up old server files
(assert (eq? (rmt:transport-mode) 'http) "FATAL: server:run called with rmt:transport-mode="(rmt:transport-mode))
(server:clean-up-old areapath)
(let* ((since-last (- (current-seconds) server-last-start))
(server-start-delay 10))
(if ( < (- (current-seconds) server-last-start) 10 )
(begin
(debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start))
(debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds")
(thread-sleep! server-start-delay)
)
(debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start))
)
)
(let* ((serversdat (server:get-servers-info areapath))
(servkeys (hash-table-keys serversdat))
(by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last
(sort servkeys ;; list of "host:port"
(lambda (a b)
(>= (list-ref (hash-table-ref serversdat a) 2)
(list-ref (hash-table-ref serversdat b) 2))))
'())))
(debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat)
(debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys)
(if (not (null? by-time-asc))
(let* ((oldest (last by-time-asc))
(oldest-dat (hash-table-ref serversdat oldest))
(host (list-ref oldest-dat 0))
(all-valid (filter (lambda (x)
(equal? host (list-ref (hash-table-ref serversdat x) 0)))
by-time-asc))
(best-ten (lambda ()
(if (> (length all-valid) 11)
(take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out
(if (> (length all-valid) 8)
(drop-right all-valid 1)
all-valid))))
(names->dats (lambda (names)
(map (lambda (x)
(hash-table-ref serversdat x))
names)))
(am-home? (lambda ()
(let* ((currhost (get-host-name))
(bestadrs (server:get-best-guess-address currhost)))
(or (equal? host currhost)
(equal? host bestadrs))))))
(case mode
((info)
(debug:print 0 *default-log-port* "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid)
(debug:print 0 *default-log-port* "youngest: "(hash-table-ref serversdat (car all-valid))))
((home) host)
((homehost) (cons host (am-home?))) ;; shut up old code
((home?) (am-home?))
((best-ten)(names->dats (best-ten)))
((all-valid)(names->dats all-valid))
((best) (let* ((best-ten (best-ten))
(len (length best-ten)))
(hash-table-ref serversdat (list-ref best-ten (random len)))))
((count)(length all-valid))
(else
(debug:print 0 *default-log-port* "ERROR: invalid command "mode)
#f)))
(begin
(server:run areapath)
(set! server-last-start (current-seconds))
;; (thread-sleep! 3)
(case mode
((homehost) (cons #f #f))
(else #f))))))
(define (server:get-servinfo-dir areapath)
(let* ((spath (conc areapath"/.servinfo")))
(if (not (file-exists? spath))
(create-directory spath #t))
spath))
(define (server:clean-up-old areapath)
;; any server file that has not been touched in ten minutes is effectively dead
(let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*"))))
(for-each
(lambda (sfile)
(let* ((modtime (handle-exceptions
exn
(begin
(debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile)
(current-seconds))
(file-modification-time sfile))))
(if (and (number? modtime)
(> (- (current-seconds) modtime)
600))
(begin
(debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.")
(handle-exceptions
exn
(debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile)
(delete-file sfile))))))
sfiles)))
;; would like to eventually get rid of this
;;
(define (common:on-homehost?)
(if (eq? (rmt:transport-mode) 'http)
(server:choose-server *toppath* 'home?)
#t)) ;; there is no homehost for tcp and nfs is always on home so #t should work
;; kind start up of server, wait before allowing another server for a given
;; area to be launched
;;
(define (server:kind-run areapath)
;; look for $MT_RUN_AREA_HOME/logs/server-start-last
;; and wait for it to be at least <server idletime> seconds old
;; (server:wait-for-server-start-last-flag areapath)
(let loop ()
(if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2)
(begin
(if (common:low-noise-print 30 "our-host-load")
(debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server."))
(loop))))
(if (< (server:choose-server areapath 'count) 20)
(server:run areapath))
#;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
(let* ((lock-file (conc areapath "/logs/server-start.lock")))
(let* ((start-flag (conc areapath "/logs/server-start-last")))
(common:simple-file-lock-and-wait lock-file expire-time: 25)
(debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag)
(system (conc "touch " start-flag)) ;; lazy but safe
(server:run areapath)
(thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
(common:simple-file-release-lock lock-file)))
(debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")))
;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))
(let ((give-up-time (+ (current-seconds) timeout)))
(let loop ((server-info (server:check-if-running areapath))
(try-num 0))
(if (or server-info
(> (current-seconds) give-up-time)) ;; server-url will be #f if no server available.
(server:record->url server-info)
(let* ( (servers (server:choose-server areapath 'all-valid))
(num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0)))
(if (and (> try-num 0) ;; first time through simply wait a little while then try again
(< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one
(server:run areapath))
(thread-sleep! 5)
(loop (server:check-if-running areapath)
(+ try-num 1)))))))
(define (server:get-num-servers #!key (numservers 2))
(let ((ns (string->number
(or (configf:lookup *configdat* "server" "numservers") "notanumber"))))
(or ns numservers)))
;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
(define (server:check-if-running areapath) ;; #!key (numservers "2"))
(let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed
(servers (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath))))
(if (or (and servers
(null? servers))
(not servers))
;; (and (list? servers)
;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers
#f
(let loop ((hed (car servers))
(tal (cdr servers)))
(let ((res (server:check-server hed)))
(if res
hed
(if (null? tal)
#f
(loop (car tal)(cdr tal)))))))))
;; ping the given server
;;
(define (server:check-server server-record)
(let* ((server-url (server:record->url server-record))
(server-id (server:record->id server-record))
(res (server:ping server-url server-id)))
(if res
server-url
#f)))
(define (server:kill servr)
(handle-exceptions
exn
(begin
(debug:print-info 0 *default-log-port* "Unable to get host and/or port from " servr ", exn=" exn)
#f)
(match-let (((hostname port start-time server-id pid)
servr))
(tasks:kill-server hostname pid))))
;; called in megatest.scm, host-port is string hostname:port
;;
;; NOTE: This is NOT called directly from clients as not all transports support a client running
;; in the same process as the server.
;;
(define (server:ping host:port server-id #!key (do-exit #f))
(let* ((host-port (cond
((string? host:port)
(let ((slst (string-split host:port ":")))
(if (eq? (length slst) 2)
(list (car slst)(string->number (cadr slst)))
#f)))
(else
#f))))
(cond
((and (list? host-port)
(eq? (length host-port) 2))
(let* ((myrunremote (make-and-init-remote *toppath*))
(iface (car host-port))
(port (cadr host-port))
(server-dat (client:connect iface port server-id myrunremote))
(login-res (rmt:login-no-auto-client-setup myrunremote)))
(http-transport:close-connections myrunremote)
(if (and (list? login-res)
(car login-res))
(begin
;; (print "LOGIN_OK")
(if do-exit (exit 0))
#t)
(begin
;; (print "LOGIN_FAILED")
(if do-exit (exit 1))
#f))))
(else
(if host:port
(debug:print 0 *default-log-port* "ERROR: bad host:port "host:port))
(if do-exit
(exit 1)
#f)))))
;; run ping in separate process, safest way in some cases
;;
(define (server:ping-server ifaceport)
(with-input-from-pipe
(conc (common:get-megatest-exe) " -ping " ifaceport)
(lambda ()
(let loop ((inl (read-line))
(res "NOREPLY"))
(if (eof-object? inl)
(case (string->symbol res)
((NOREPLY) #f)
((LOGIN_OK) #t)
(else #f))
(loop (read-line) inl))))))
;; NOT USED (well, ok, reference in rpc-transport but otherwise not used).
;;
(define (server:login toppath)
(lambda (toppath)
(set! *db-last-access* (current-seconds)) ;; might not be needed.
(if (equal? *toppath* toppath)
#t
#f)))
;; timeout is hms string: 1h 5m 3s, default is 1 minute
;; This is currently broken. Just use the number of hours with no unit.
;; Default is 60 seconds.
;;
(define (server:expiration-timeout)
(let ((tmo (configf:lookup *configdat* "server" "timeout")))
(if (and (string? tmo)
(common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below
(* 3600 (string->number tmo))
600)))
(define (server:get-best-guess-address hostname)
(let ((res #f))
(for-each
(lambda (adr)
(if (not (eq? (u8vector-ref adr 0) 127))
(set! res adr)))
;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME
(vector->list (hostinfo-addresses (hostname->hostinfo hostname))))
(string-intersperse
(map number->string
(u8vector->list
(if res res (hostname->ip hostname)))) ".")))
;; (define server:sync-lock-token "SERVER_SYNC_LOCK")
;; (define (server:release-sync-lock)
;; (db:no-sync-del! *no-sync-db* server:sync-lock-token))
;; (define (server:have-sync-lock?)
;; (let* ((have-lock-pair (db:no-sync-get-lock *no-sync-db* server:sync-lock-token))
;; (have-lock? (car have-lock-pair))
;; (lock-time (cdr have-lock-pair))
;; (lock-age (- (current-seconds) lock-time)))
;; (cond
;; (have-lock? #t)
;; ((>lock-age
;; (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180)))
;; (server:release-sync-lock)
;; (server:have-sync-lock?))
;; (else #f))))
;; moving this here as it needs access to db and cannot be in common.
;;
(define (server:get-bruteforce-syncer dbstruct #!key (fork-to-background #f) (persist-until-sync #f))
(debug:print "WARNING: bruteforce-syncer is called but has been disabled!")
(lambda ()
(debug:print "WARNING: bruteforce-syncer is called but has been disabled!"))
#;(let* ((sqlite-exe (or (get-environment-variable "MT_SQLITE3_EXE"))) ;; defined in cfg.sh
(sync-log (or (args:get-arg "-sync-log") (conc *toppath* "/logs/sync-" (current-process-id) "-" (get-host-name) ".log")))
(tmp-area (common:get-db-tmp-area))
(tmp-db (conc tmp-area "/megatest.db"))
(staging-file (conc *toppath* "/.megatest.db"))
(mtdbfile (conc *toppath* "/megatest.db"))
(lockfile (common:get-sync-lock-filepath))
(sync-cmd-core (conc sqlite-exe" " tmp-db " .dump | "sqlite-exe" " staging-file "&>"sync-log))
(sync-cmd (if fork-to-background
(conc "/usr/bin/env NBFAKE_LOG="*toppath*"/logs/last-server-sync-"(current-process-id)".log nbfake \""sync-cmd-core" && /bin/mv -f " staging-file " " mtdbfile" \"")
sync-cmd-core))
(default-min-intersync-delay 2)
(min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: default-min-intersync-delay))
(default-duty-cycle 0.1)
(duty-cycle (configf:lookup-number *configdat* "server" "sync-duty-cycle" default: default-duty-cycle))
(last-sync-seconds 10) ;; we will adjust this to a measurement and delay last-sync-seconds * (1 - duty-cycle)
(calculate-off-time (lambda (work-duration duty-cycle)
(* (/ (- 1 duty-cycle) duty-cycle) last-sync-seconds)))
(off-time min-intersync-delay) ;; adjusted in closure below.
(do-a-sync
(lambda ()
(BB> "Start do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync)
(let* ((finalres
(let retry-loop ((num-tries 0))
(if (common:simple-file-lock lockfile)
(begin
(cond
((not (or fork-to-background persist-until-sync))
(debug:print 0 *default-log-port* "INFO: syncer thread sleeping for max of (server.minimum-intersync-delay="min-intersync-delay
" , off-time="off-time" seconds ]")
(thread-sleep! (max off-time min-intersync-delay)))
(else
(debug:print 0 *default-log-port* "INFO: syncer thread NOT sleeping ; maybe time-to-exit...")))
(if (not (configf:lookup *configdat* "server" "disable-db-snapshot"))
(common:snapshot-file mtdbfile subdir: ".db-snapshot"))
(delete-file* staging-file)
(let* ((start-time (current-milliseconds))
(res (system sync-cmd))
(dbbackupfile (conc mtdbfile ".backup"))
(res2
(cond
((eq? 0 res )
(handle-exceptions
exn
#f
(if (file-exists? dbbackupfile)
(delete-file* dbbackupfile)
)
(if (eq? 0 (file-size sync-log))
(delete-file* sync-log))
(system (conc "/bin/mv " staging-file " " mtdbfile))
(set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000))
(set! off-time (calculate-off-time
last-sync-seconds
(cond
((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1))
duty-cycle)
(else
(debug:print 0 *default-log-port* "WARNING: ["(common:human-time)"] server.sync-duty-cycle is invalid. Should be a number between 0 and 1, but "duty-cycle" was specified. Using default value: "default-duty-cycle)
default-duty-cycle))))
(debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec")
(debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec ; with duty-cycle of "duty-cycle" off time is now "off-time)
'sync-completed))
(else
(system (conc "/bin/cp "sync-log" "sync-log".fail"))
(debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail")
(if (file-exists? (conc mtdbfile ".backup"))
(system (conc "/bin/cp "mtdbfile ".backup " mtdbfile)))
#f))))
(common:simple-file-release-lock lockfile)
(BB> "released lockfile: " lockfile)
(when (common:file-exists? lockfile)
(BB> "DID NOT ACTUALLY RELEASE LOCKFILE"))
res2) ;; end let
);; end begin
;; else
(cond
(persist-until-sync
(thread-sleep! 1)
(debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; we're in a fork-to-background so we need to succeed. Let's wait a jiffy and and try again. num-tries="num-tries" (waiting for lockfile="lockfile" to disappear)")
(retry-loop (add1 num-tries)))
(else
(thread-sleep! (max off-time (+ last-sync-seconds min-intersync-delay)))
(debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; not syncing.")
'parallel-sync-in-progress))
) ;; end if got lockfile
)
))
(BB> "End do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync" and result="finalres)
finalres)
) ;; end lambda
))
do-a-sync))