Artifact 1a43f0a48b6449d96ee176335cb35e5ed9126973:
- File
server.scm
— part of check-in
[67b18cc991]
at
2023-01-16 10:00:35
on branch v1.80-refactor
— Major refactor/clean up of client connection stuff. Removed old crufty approach of having both client and server side calls in http-transport. Plan would be to eventually get rid of http-transport.
Open files slowly grows. bad argument type in http-transport. Line 285 (close-connection! fullurl) is wrong. (user: matt, size: 39030) [annotate] [blame] [check-ins using] [more...]
;; Copyright 2006-2017, Matthew Welland. ;; ;; This file is part of Megatest. ;; ;; Megatest is free software: you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation, either version 3 of the License, or ;; (at your option) any later version. ;; ;; Megatest is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see <http://www.gnu.org/licenses/>. ;; (require-extension (srfi 18) extras tcp s11n) (use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest directory-utils posix-extras matchable utils) (use spiffy uri-common intarweb http-client spiffy-request-vars) (declare (unit server)) (declare (uses commonmod)) (declare (uses common)) (declare (uses db)) (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. ;; (declare (uses synchash)) (declare (uses http-transport)) ;;(declare (uses rpc-transport)) (declare (uses launch)) ;; (declare (uses daemon)) (import commonmod) (include "common_records.scm") (include "db_records.scm") (define (server:make-server-url hostport) (if (not hostport) #f (conc "http://" (car hostport) ":" (cadr hostport)))) (define *server-loop-heart-beat* (current-seconds)) ;;====================================================================== ;; P K T S S T U F F ;;====================================================================== ;; ??? ;;====================================================================== ;; P K T S S T U F F ;;====================================================================== ;; ??? ;;====================================================================== ;; S E R V E R ;;====================================================================== ;; Call this to start the actual server ;; ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== ;; Get the transport (define (server:get-transport) (if *transport-type* *transport-type* (let ((ttype (string->symbol (or (args:get-arg "-transport") (configf:lookup *configdat* "server" "transport") "rpc")))) (set! *transport-type* ttype) ttype))) ;; Generate a unique signature for this server (define (server:mk-signature) (message-digest-string (md5-primitive) (with-output-to-string (lambda () (write (list (current-directory) (current-process-id) (argv))))))) (define (server:get-client-signature) (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic (set! *my-client-signature* sig) *my-client-signature*))) (define (server:get-server-id) (if *server-id* *server-id* (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic (set! *server-id* sig) *server-id*))) ;; When using zmq this would send the message back (two step process) ;; with spiffy or rpc this simply returns the return data to be returned ;; (define (server:reply return-addr query-sig success/fail result) (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result) ;; (send-message pubsock target send-more: #t) ;; (send-message pubsock (case (server:get-transport) ((rpc) (db:obj->string (vector success/fail query-sig result))) ((http) (db:obj->string (vector success/fail query-sig result))) ((fs) result) (else (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) result))) ;; Given an area path, start a server process ### NOTE ### > file 2>&1 ;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area (let* ((testsuite (common:get-testsuite-name)) (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) (profile-mode (or (configf:lookup *configdat* "misc" "profilesw") "")) (cmdln (conc (common:get-megatest-exe) " -server - ";; (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") " -daemonize " "") ;; " -log " logfile " -m testsuite:" testsuite " " profile-mode )) ;; (conc " >> " logfile " 2>&1 &"))))) (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!? (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0))) ;; we want the remote server to start in *toppath* so push there (push-directory areapath) (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") (thread-start! log-rotate) ;; host.domain.tld match host? ;; (if (and target-host ;; ;; look at target host, is it host.domain.tld or ip address and does it ;; ;; match current ip or hostname ;; (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) ;; (not (equal? curr-ip target-host))) ;; (begin ;; (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) ;; (setenv "TARGETHOST" target-host))) ;; (setenv "TARGETHOST_LOGF" logfile) (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory))) ;; given a path to a server log return: host port startseconds server-id ;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let ;; example of what it's looking for in the log file: ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 (define (server:logf-get-start-info logf) (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id (dbprep-rx (regexp "^SERVER: dbprep")) (dbprep-found 0) (bad-dat (list #f #f #f #f #f))) (handle-exceptions exn (begin ;; WARNING: this is potentially dangerous to blanket ignore the errors (if (file-exists? logf) (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn)) bad-dat) ;; no idea what went wrong, call it a bad server (with-input-from-file logf (lambda () (let loop ((inl (read-line)) (lnum 0)) (if (not (eof-object? inl)) (let ((mlst (string-match server-rx inl)) (dbprep (string-match dbprep-rx inl))) (if dbprep (set! dbprep-found 1)) (if (not mlst) (if (< lnum 500) ;; give up if more than 500 lines of server log read (loop (read-line)(+ lnum 1)) (begin (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) bad-dat)) (match mlst ((_ host port start server-id pid) (list host (string->number port) (string->number start) server-id (string->number pid))) (else (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) bad-dat)))) (begin (if dbprep-found (begin (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) bad-dat)))))))) ;; ;; get a list of servers from the log files, with all relevant data ;; ;; ( mod-time host port start-time pid ) ;; ;; ;; (define (server:get-list areapath #!key (limit #f)) ;; (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) ;; (day-seconds (* 24 60 60))) ;; ;; if the directory exists continue to get the list ;; ;; otherwise attempt to create the logs dir and then ;; ;; continue ;; (if (if (directory-exists? (conc areapath "/logs")) ;; '() ;; (if (file-write-access? areapath) ;; (begin ;; (condition-case ;; (create-directory (conc areapath "/logs") #t) ;; (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) ;; (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) ;; (directory-exists? (conc areapath "/logs"))) ;; '())) ;; ;; ;; Get the list of server logs. ;; (let* ( ;; ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. ;; ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) ;; (server-logs (glob (conc areapath "/logs/server-*-*.log"))) ;; (num-serv-logs (length server-logs))) ;; (if (or (null? server-logs) (= num-serv-logs 0)) ;; (let () ;; (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time)) ;; '() ;; ) ;; (let loop ((hed (string-chomp (car server-logs))) ;; (tal (cdr server-logs)) ;; (res '())) ;; (let* ((mod-time (handle-exceptions ;; exn ;; (begin ;; (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn) ;; (current-seconds)) ;; 0 ;; (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted ;; (down-time (- (current-seconds) mod-time)) ;; (serv-dat (if (or (< num-serv-logs 10) ;; (< down-time 900)) ;; day-seconds)) ;; (server:logf-get-start-info hed) ;; '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at ;; (serv-rec (cons mod-time serv-dat)) ;; (fmatch (string-match fname-rx hed)) ;; (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) ;; (new-res (if (null? serv-dat) ;; res ;; (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let ;; (if (null? tal) ;; (if (and limit ;; (> (length new-res) limit)) ;; new-res ;; (take new-res limit) <= need intelligent sorting before this will work ;; new-res) ;; (loop (string-chomp (car tal)) (cdr tal) new-res))))))))) #;(define (server:get-num-alive srvlst) (let ((num-alive 0)) (for-each (lambda (server) (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "Unable to get server start-time and/or mod-time from " server ", exn=" exn)) (match-let (((mod-time host port start-time server-id pid) server)) (let* ((uptime (- (current-seconds) mod-time)) (runtime (if start-time (- mod-time start-time) 0))) (if (< uptime 5)(set! num-alive (+ num-alive 1))))))) srvlst) num-alive)) ;; ;; given a list of servers get a list of valid servers, i.e. at least ;; ;; 10 seconds old, has started and is less than 1 hour old and is ;; ;; active (i.e. mod-time < 10 seconds ;; ;; ;; ;; mod-time host port start-time pid ;; ;; ;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off ;; ;; and servers should stick around for about two hours or so. ;; ;; ;; (define (server:get-best srvlst) ;; (let* ((nums (server:get-num-servers)) ;; (now (current-seconds)) ;; (slst (sort ;; (filter (lambda (rec) ;; (if (and (list? rec) ;; (> (length rec) 2)) ;; (let ((start-time (list-ref rec 3)) ;; (mod-time (list-ref rec 0))) ;; ;; (print "start-time: " start-time " mod-time: " mod-time) ;; (and start-time mod-time ;; (> (- now start-time) 0) ;; been running at least 0 seconds ;; (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds ;; (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set ;; (< (- now start-time) ;; (+ (- (string->number (configf:lookup *configdat* "server" "runtime")) ;; 180) ;; (random 360)))) ;; under one hour running time +/- 180 ;; )) ;; #f)) ;; srvlst) ;; (lambda (a b) ;; (< (list-ref a 3) ;; (list-ref b 3)))))) ;; (if (> (length slst) nums) ;; (take slst nums) ;; slst))) ;; ;; switch from server:get-list to server:get-servers-info ;; ;; ;; (define (server:get-first-best areapath) ;; (let ((srvrs (server:get-best (server:get-list areapath)))) ;; (if (and srvrs ;; (not (null? srvrs))) ;; (car srvrs) ;; #f))) ;; ;; (define (server:get-rand-best areapath) ;; (let ((srvrs (server:get-best (server:get-list areapath)))) ;; (if (and (list? srvrs) ;; (not (null? srvrs))) ;; (let* ((len (length srvrs)) ;; (idx (random len))) ;; (list-ref srvrs idx)) ;; #f))) (define (server:record->id servr) (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "Unable to get server id from " servr ", exn=" exn) #f) (match-let (((host port start-time server-id pid) servr)) (if server-id server-id #f)))) (define (server:record->url servr) (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "Unable to get server url from " servr ", exn=" exn) #f) (match-let (((host port start-time server-id pid) servr)) (if (and host port) (conc host ":" port) #f)))) ;; if server-start-last exists, and wasn't old enough, wait <idle time> + 1, then call this function recursively until it is old enough. ;; if it is old enough, overwrite it and wait 0.25 seconds. ;; if it then has the wrong server key, wait <idle time> + 1 and call this function recursively. ;; #;(define (server:wait-for-server-start-last-flag areapath) (let* ((start-flag (conc areapath "/logs/server-start-last")) ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4)) (server-key (conc (get-host-name) "-" (current-process-id)))) (if (file-exists? start-flag) (let* ((fmodtime (file-modification-time start-flag)) (delta (- (current-seconds) fmodtime)) (old-enough (> delta idletime)) (new-server-key "")) ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than <idletime> seconds, and the new file still has the same server key as you just wrote, return #t. ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process. (if (and old-enough (begin (debug:print-info 2 *default-log-port* "Writing " start-flag) (with-output-to-file start-flag (lambda () (print server-key))) (thread-sleep! 0.25) (set! new-server-key (with-input-from-file start-flag (lambda () (read-line)))) (equal? server-key new-server-key))) #t ;; If either of the above conditions is not true, print a "Gating server start" message, wait <idle-time> + 1, then call this function recursively. (begin (debug:print-info 0 *default-log-port* "Gating server start, last start: " (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server")) (thread-sleep! ( + 1 idletime)) (server:wait-for-server-start-last-flag areapath))))))) ;; oldest server alive determines host then choose random of youngest ;; five servers on that host ;; (define (server:get-servers-info areapath) ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.") (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo"))) (if (not (file-exists? servinfodir)) (create-directory servinfodir)) (let* ((allfiles (glob (conc servinfodir"/*"))) (res (make-hash-table))) (for-each (lambda (f) (let* ((hostport (pathname-strip-directory f)) (serverdat (server:logf-get-start-info f))) (match serverdat ((host port start server-id pid) (if (and host port start server-id pid) (hash-table-set! res hostport serverdat) (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))) (else (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))))) allfiles) res))) ;; check the .servinfo directory, are there other servers running on this ;; or another host? ;; ;; returns #t => ok to start another server ;; #f => not ok to start another server ;; (define (server:minimal-check areapath) (server:clean-up-old areapath) (let* ((srvdir (server:get-servinfo-dir areapath)) ;; (conc areapath"/.servinfo")) (servrs (glob (conc srvdir"/*"))) (thishostip (server:get-best-guess-address (get-host-name))) (thisservrs (glob (conc srvdir"/"thishostip":*"))) (homehostinf (server:choose-server areapath 'homehost)) (havehome (car homehostinf)) (wearehome (cdr homehostinf))) (debug:print-info 0 *default-log-port* thishostip", have homehost: "havehome", we are homehost: "wearehome ", numservers: "(length thisservrs)) (cond ((not havehome) #t) ;; no homehost yet, go for it ((and havehome wearehome (< (length thisservrs) 20)) #t) ;; we are home and less than 20 servers, ok to start another ((and havehome (not wearehome)) #f) ;; we are not the home host ((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running (else (debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs) #t)))) (define server-last-start 0) ;; oldest server alive determines host then choose random of youngest ;; five servers on that host ;; ;; mode: ;; best - get best server (random of newest five) ;; home - get home host based on oldest server ;; info - print info (define (server:choose-server areapath #!optional (mode 'best)) ;; age is current-starttime ;; find oldest alive ;; 1. sort by age ascending and ping until good ;; find alive rand from youngest ;; 1. sort by age descending ;; 2. take five ;; 3. check alive, discard if not and repeat ;; first we clean up old server files (server:clean-up-old areapath) (let* ((since-last (- (current-seconds) server-last-start)) (server-start-delay 10)) (if ( < (- (current-seconds) server-last-start) 10 ) (begin (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds") (thread-sleep! server-start-delay) ) (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) ) ) (let* ((serversdat (server:get-servers-info areapath)) (servkeys (hash-table-keys serversdat)) (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last (sort servkeys ;; list of "host:port" (lambda (a b) (>= (list-ref (hash-table-ref serversdat a) 2) (list-ref (hash-table-ref serversdat b) 2)))) '()))) (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat) (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys) (if (not (null? by-time-asc)) (let* ((oldest (last by-time-asc)) (oldest-dat (hash-table-ref serversdat oldest)) (host (list-ref oldest-dat 0)) (all-valid (filter (lambda (x) (equal? host (list-ref (hash-table-ref serversdat x) 0))) by-time-asc)) (best-ten (lambda () (if (> (length all-valid) 11) (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out (if (> (length all-valid) 8) (drop-right all-valid 1) all-valid)))) (names->dats (lambda (names) (map (lambda (x) (hash-table-ref serversdat x)) names))) (am-home? (lambda () (let* ((currhost (get-host-name)) (bestadrs (server:get-best-guess-address currhost))) (or (equal? host currhost) (equal? host bestadrs)))))) (case mode ((info) (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid) (print "youngest: "(hash-table-ref serversdat (car all-valid)))) ((home) host) ((homehost) (cons host (am-home?))) ;; shut up old code ((home?) (am-home?)) ((best-ten)(names->dats (best-ten))) ((all-valid)(names->dats all-valid)) ((best) (let* ((best-ten (best-ten)) (len (length best-ten))) (hash-table-ref serversdat (list-ref best-ten (random len))))) ((count)(length all-valid)) (else (debug:print 0 *default-log-port* "ERROR: invalid command "mode) #f))) (begin (server:run areapath) (set! server-last-start (current-seconds)) ;; (thread-sleep! 3) (case mode ((homehost) (cons #f #f)) (else #f)))))) (define (server:get-servinfo-dir areapath) (let* ((spath (conc areapath"/.servinfo"))) (if (not (file-exists? spath)) (create-directory spath #t)) spath)) (define (server:clean-up-old areapath) ;; any server file that has not been touched in ten minutes is effectively dead (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*")))) (for-each (lambda (sfile) (let* ((modtime (handle-exceptions exn (begin (debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile) (current-seconds)) (file-modification-time sfile)))) (if (and (number? modtime) (> (- (current-seconds) modtime) 600)) (begin (debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.") (handle-exceptions exn (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile) (delete-file sfile)))))) sfiles))) ;; would like to eventually get rid of this ;; (define (common:on-homehost?) (server:choose-server *toppath* 'home?)) ;; kind start up of server, wait before allowing another server for a given ;; area to be launched ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least <server idletime> seconds old ;; (server:wait-for-server-start-last-flag areapath) (let loop () (if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2) (begin (if (common:low-noise-print 30 "our-host-load") (debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server.")) (loop)))) (if (< (server:choose-server areapath 'count) 20) (server:run areapath)) #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ((lock-file (conc areapath "/logs/server-start.lock"))) (let* ((start-flag (conc areapath "/logs/server-start-last"))) (common:simple-file-lock-and-wait lock-file expire-time: 25) (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag) (system (conc "touch " start-flag)) ;; lazy but safe (server:run areapath) (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED". (common:simple-file-release-lock lock-file))) (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another."))) ;; this one seems to be the general entry point ;; (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) (let loop ((server-info (server:check-if-running areapath)) (try-num 0)) (if (or server-info (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. (server:record->url server-info) (let* ( (servers (server:choose-server areapath 'all-valid)) (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0))) (if (and (> try-num 0) ;; first time through simply wait a little while then try again (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one (server:run areapath)) (thread-sleep! 5) (loop (server:check-if-running areapath) (+ try-num 1))))))) (define (server:get-num-servers #!key (numservers 2)) (let ((ns (string->number (or (configf:lookup *configdat* "server" "numservers") "notanumber")))) (or ns numservers))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed (servers (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath)))) (if (or (and servers (null? servers)) (not servers)) ;; (and (list? servers) ;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers #f (let loop ((hed (car servers)) (tal (cdr servers))) (let ((res (server:check-server hed))) (if res hed (if (null? tal) #f (loop (car tal)(cdr tal))))))))) ;; ping the given server ;; (define (server:check-server server-record) (let* ((server-url (server:record->url server-record)) (server-id (server:record->id server-record)) (res (server:ping server-url server-id))) (if res server-url #f))) (define (server:kill servr) (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "Unable to get host and/or port from " servr ", exn=" exn) #f) (match-let (((mod-time hostname port start-time server-id pid) servr)) (tasks:kill-server hostname pid)))) ;; called in megatest.scm, host-port is string hostname:port ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; (define (server:ping host:port server-id #!key (do-exit #f)) (let* ((host-port (cond ((string? host:port) (let ((slst (string-split host:port ":"))) (if (eq? (length slst) 2) (list (car slst)(string->number (cadr slst))) #f))) (else #f)))) (cond ((and (list? host-port) (eq? (length host-port) 2)) (let* ((myrunremote (make-remote)) (iface (car host-port)) (port (cadr host-port)) (server-dat (client:connect iface port server-id myrunremote)) (login-res (rmt:login-no-auto-client-setup myrunremote))) (if (and (list? login-res) (car login-res)) (begin ;; (print "LOGIN_OK") (if do-exit (exit 0)) #t) (begin ;; (print "LOGIN_FAILED") (if do-exit (exit 1)) #f)))) (else (if host:port (debug:print 0 *default-log-port* "ERROR: bad host:port "host:port)) (if do-exit (exit 1) #f))))) ;; run ping in separate process, safest way in some cases ;; (define (server:ping-server ifaceport) (with-input-from-pipe (conc (common:get-megatest-exe) " -ping " ifaceport) (lambda () (let loop ((inl (read-line)) (res "NOREPLY")) (if (eof-object? inl) (case (string->symbol res) ((NOREPLY) #f) ((LOGIN_OK) #t) (else #f)) (loop (read-line) inl)))))) ;; NOT USED (well, ok, reference in rpc-transport but otherwise not used). ;; (define (server:login toppath) (lambda (toppath) (set! *db-last-access* (current-seconds)) ;; might not be needed. (if (equal? *toppath* toppath) #t #f))) ;; timeout is hms string: 1h 5m 3s, default is 1 minute ;; This is currently broken. Just use the number of hours with no unit. ;; Default is 60 seconds. ;; (define (server:expiration-timeout) (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below (* 3600 (string->number tmo)) 600))) (define (server:get-best-guess-address hostname) (let ((res #f)) (for-each (lambda (adr) (if (not (eq? (u8vector-ref adr 0) 127)) (set! res adr))) ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME (vector->list (hostinfo-addresses (hostname->hostinfo hostname)))) (string-intersperse (map number->string (u8vector->list (if res res (hostname->ip hostname)))) "."))) ;; (define server:sync-lock-token "SERVER_SYNC_LOCK") ;; (define (server:release-sync-lock) ;; (db:no-sync-del! *no-sync-db* server:sync-lock-token)) ;; (define (server:have-sync-lock?) ;; (let* ((have-lock-pair (db:no-sync-get-lock *no-sync-db* server:sync-lock-token)) ;; (have-lock? (car have-lock-pair)) ;; (lock-time (cdr have-lock-pair)) ;; (lock-age (- (current-seconds) lock-time))) ;; (cond ;; (have-lock? #t) ;; ((>lock-age ;; (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180))) ;; (server:release-sync-lock) ;; (server:have-sync-lock?)) ;; (else #f)))) ;; moving this here as it needs access to db and cannot be in common. ;; (define (server:get-bruteforce-syncer dbstruct #!key (fork-to-background #f) (persist-until-sync #f)) (debug:print "WARNING: bruteforce-syncer is called but has been disabled!") (lambda () (debug:print "WARNING: bruteforce-syncer is called but has been disabled!")) #;(let* ((sqlite-exe (or (get-environment-variable "MT_SQLITE3_EXE"))) ;; defined in cfg.sh (sync-log (or (args:get-arg "-sync-log") (conc *toppath* "/logs/sync-" (current-process-id) "-" (get-host-name) ".log"))) (tmp-area (common:get-db-tmp-area)) (tmp-db (conc tmp-area "/megatest.db")) (staging-file (conc *toppath* "/.megatest.db")) (mtdbfile (conc *toppath* "/megatest.db")) (lockfile (common:get-sync-lock-filepath)) (sync-cmd-core (conc sqlite-exe" " tmp-db " .dump | "sqlite-exe" " staging-file "&>"sync-log)) (sync-cmd (if fork-to-background (conc "/usr/bin/env NBFAKE_LOG="*toppath*"/logs/last-server-sync-"(current-process-id)".log nbfake \""sync-cmd-core" && /bin/mv -f " staging-file " " mtdbfile" \"") sync-cmd-core)) (default-min-intersync-delay 2) (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: default-min-intersync-delay)) (default-duty-cycle 0.1) (duty-cycle (configf:lookup-number *configdat* "server" "sync-duty-cycle" default: default-duty-cycle)) (last-sync-seconds 10) ;; we will adjust this to a measurement and delay last-sync-seconds * (1 - duty-cycle) (calculate-off-time (lambda (work-duration duty-cycle) (* (/ (- 1 duty-cycle) duty-cycle) last-sync-seconds))) (off-time min-intersync-delay) ;; adjusted in closure below. (do-a-sync (lambda () (BB> "Start do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync) (let* ((finalres (let retry-loop ((num-tries 0)) (if (common:simple-file-lock lockfile) (begin (cond ((not (or fork-to-background persist-until-sync)) (debug:print 0 *default-log-port* "INFO: syncer thread sleeping for max of (server.minimum-intersync-delay="min-intersync-delay " , off-time="off-time" seconds ]") (thread-sleep! (max off-time min-intersync-delay))) (else (debug:print 0 *default-log-port* "INFO: syncer thread NOT sleeping ; maybe time-to-exit..."))) (if (not (configf:lookup *configdat* "server" "disable-db-snapshot")) (common:snapshot-file mtdbfile subdir: ".db-snapshot")) (delete-file* staging-file) (let* ((start-time (current-milliseconds)) (res (system sync-cmd)) (dbbackupfile (conc mtdbfile ".backup")) (res2 (cond ((eq? 0 res ) (handle-exceptions exn #f (if (file-exists? dbbackupfile) (delete-file* dbbackupfile) ) (if (eq? 0 (file-size sync-log)) (delete-file* sync-log)) (system (conc "/bin/mv " staging-file " " mtdbfile)) (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000)) (set! off-time (calculate-off-time last-sync-seconds (cond ((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1)) duty-cycle) (else (debug:print 0 *default-log-port* "WARNING: ["(common:human-time)"] server.sync-duty-cycle is invalid. Should be a number between 0 and 1, but "duty-cycle" was specified. Using default value: "default-duty-cycle) default-duty-cycle)))) (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec") (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec ; with duty-cycle of "duty-cycle" off time is now "off-time) 'sync-completed)) (else (system (conc "/bin/cp "sync-log" "sync-log".fail")) (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail") (if (file-exists? (conc mtdbfile ".backup")) (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile))) #f)))) (common:simple-file-release-lock lockfile) (BB> "released lockfile: " lockfile) (when (common:file-exists? lockfile) (BB> "DID NOT ACTUALLY RELEASE LOCKFILE")) res2) ;; end let );; end begin ;; else (cond (persist-until-sync (thread-sleep! 1) (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; we're in a fork-to-background so we need to succeed. Let's wait a jiffy and and try again. num-tries="num-tries" (waiting for lockfile="lockfile" to disappear)") (retry-loop (add1 num-tries))) (else (thread-sleep! (max off-time (+ last-sync-seconds min-intersync-delay))) (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; not syncing.") 'parallel-sync-in-progress)) ) ;; end if got lockfile ) )) (BB> "End do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync" and result="finalres) finalres) ) ;; end lambda )) do-a-sync))