;; Copyright 2006-2017, Matthew Welland.
;;
;; This file is part of Megatest.
;;
;; Megatest is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;;
;; Megatest is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with Megatest. If not, see <http://www.gnu.org/licenses/>.
;;
;;======================================================================
;;
;; This is the Megatest specific stuff for starting and maintaining a
;; server. Stuff that talks to the server should go in client.scm.
;; General nanomsg stuff (not Megatest specific) should go in the
;; nmsg-transport.scm file.
;;
;;======================================================================
(require-extension (srfi 18) extras tcp s11n)
(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest
directory-utils posix-extras matchable typed-records
pkts)
(use spiffy uri-common intarweb http-client spiffy-request-vars)
(declare (unit server))
(declare (uses common))
(declare (uses db))
(import db)
;; Basic stuff for safely kicking off a server
(declare (uses portlogger))
(import portlogger)
(declare (uses nmsg-transport))
(import nmsg-transport)
;; Might want to bring the daemonizing back
;; (declare (uses daemon))
(include "common_records.scm")
(include "db_records.scm")
;;======================================================================
;; P K T S S T U F F
;;======================================================================
;;======================================================================
;; N A N O M S G B A S E D S E R V E R
;;======================================================================
(defstruct nmsg
(conn #f)
(port #f)
(myaddr #f)
(hosts (make-hash-table))
pkt
pktspec
pktfile
(mutex (make-mutex))
)
;; make it a global? Well, it is local to nmsg module
(define *nmsg-conndat* (make-nmsg))
(nmsg-pktspec-set! *nmsg-conndat*
`((server (hostname . h)
(port . p)
(pid . i)
)))
;; get a port
;; start the nmsg server
;; look for other servers
;; contact other servers and compile list of servers
;; there are two types of server
;; main servers - dashboards, runners and dedicated servers - need pkt
;; passive servers - test executers, step calls, list-runs - no pkt
;;
(define (server:start-nmsg #!optional (force-server-type #f))
(mutex-lock! (nmsg-mutex *nmsg-conndat*))
(let* ((server-type (or force-server-type
(if (args:any? "-run" "-server")
'main
'passive)))
(port-num (portlogger:open-run-close portlogger:find-port))
(nmsg-conn (nmsg:start-server port-num))
(pktspec (nmsg-pktspec *nmsg-conndat*))
(mtdir (or (get-environment-variable "MT_RUN_AREA_HOME")
(if (file-exists? "megatest.config")
(current-directory)
(begin
(print "ERROR: We don't appear to be in a megatest area and MT_RUN_AREA_HOME is not set.")
#f))))
(pktdir (conc mtdir
"/.server-pkts")))
(if (not mtdir)
#f
(begin
(if (not (directory? pktdir))(create-directory pktdir))
;; server is started, now create pkt if needed
(if (eq? server-type 'main)
(begin
(nmsg-pkt-set! *nmsg-conndat*
(write-alist->pkt
pktdir
`((hostname . ,(get-host-name))
(port . ,port-num)
(pid . ,(current-process-id)))
pktspec: pktspec
ptype: 'server))
(nmsg-pktfile-set! *nmsg-conndat* (conc pktdir "/" (nmsg-pkt *nmsg-conndat*) ".pkt"))))
(nmsg-conn-set! *nmsg-conndat* nmsg-conn)
(nmsg-port-set! *nmsg-conndat* port-num)
(mutex-unlock! (nmsg-mutex *nmsg-conndat*))
#t))))
;; Call this to start the actual server
;;
;; start_server
;;
;; mode: '
;;
(define (server:launch mode)
(let ((start-time (current-seconds)))
(server:start-nmsg mode)
(let loop ((dead-time (- (current-seconds) start-time)))
(thread-sleep! 1)
(if (< dead-time 10)
(loop (- (current-seconds) start-time))
(print "Timed out. Exiting")))))
(define (server:shutdown)
(let ((conn (nmsg-conn *nmsg-conndat*))
(pktf (nmsg-pktfile *nmsg-conndat*))
(port (nmsg-port *nmsg-conndat*)))
(if conn
(begin
(if pktf (delete-file* pktf))
(server:send-all "imshuttingdown")
(nmsg:close conn)
(portlogger:open-run-close portlogger:release-port port)))))
(define (server:send-all msg)
#f)
;; look up all pkts and get the server id (the hash), port, host/ip
;;
(define (server:get-all)
'())
(define (server:get-my-best-address)
(ip->string (car (filter (lambda (x)
(not (eq? (u8vector-ref x 0) 127)))
(vector->list (hostinfo-addresses (hostname->hostinfo "zeus")))))))
;; whoami? I am my pkt
;;
(define (server:whoami?)
(nmsg-pkt *nmsg-conndat*))
;;======================================================================
;; S E R V E R U T I L I T I E S
;;======================================================================
;; get a signature for identifing this process
(define (server:get-process-signature)
(conc (get-host-name) " " (current-process-id)))
;; ;; Get the transport
;; (define (server:get-transport)
;; (if *transport-type*
;; *transport-type*
;; (let ((ttype (string->symbol
;; (or (args:get-arg "-transport")
;; (configf:lookup *configdat* "server" "transport")
;; "rpc"))))
;; (set! *transport-type* ttype)
;; ttype)))
;;
;; ;; Generate a unique signature for this server
;; (define (server:mk-signature)
;; (message-digest-string (md5-primitive)
;; (with-output-to-string
;; (lambda ()
;; (write (list (current-directory)
;; (argv)))))))
;;
;; ;; When using zmq this would send the message back (two step process)
;; ;; with spiffy or rpc this simply returns the return data to be returned
;; ;;
;; (define (server:reply return-addr query-sig success/fail result)
;; (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result)
;; ;; (send-message pubsock target send-more: #t)
;; ;; (send-message pubsock
;; (case (server:get-transport)
;; ((rpc) (db:obj->string (vector success/fail query-sig result)))
;; ((http) (db:obj->string (vector success/fail query-sig result)))
;; ((fs) result)
;; (else
;; (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
;; result)))
;;
;; ;; Given a run id start a server process ### NOTE ### > file 2>&1
;; ;; if the run-id is zero and the target-host is set
;; ;; try running on that host
;; ;; incidental: rotate logs in logs/ dir.
;; ;;
;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area
;; (let* ((curr-host (get-host-name))
;; ;; (attempt-in-progress (server:start-attempted? areapath))
;; ;; (dot-server-url (server:check-if-running areapath))
;; (curr-ip (server:get-best-guess-address curr-host))
;; (curr-pid (current-process-id))
;; (homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
;; (target-host (car homehost))
;; (testsuite (common:get-testsuite-name))
;; (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
;; (cmdln (conc (common:get-megatest-exe)
;; " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
;; " -daemonize "
;; "")
;; ;; " -log " logfile
;; " -m testsuite:" testsuite)) ;; (conc " >> " logfile " 2>&1 &")))))
;; (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread"))
;; (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0)))
;; ;; we want the remote server to start in *toppath* so push there
;; (push-directory areapath)
;; (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
;; (thread-start! log-rotate)
;;
;; ;; host.domain.tld match host?
;; (if (and target-host
;; ;; look at target host, is it host.domain.tld or ip address and does it
;; ;; match current ip or hostname
;; (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
;; (not (equal? curr-ip target-host)))
;; (begin
;; (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
;; (setenv "TARGETHOST" target-host)))
;;
;; (setenv "TARGETHOST_LOGF" logfile)
;; (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
;; (system (conc "nbfake " cmdln))
;; (unsetenv "TARGETHOST_LOGF")
;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
;; (thread-join! log-rotate)
;; (pop-directory)))
;;
;; ;; given a path to a server log return: host port startseconds
;; ;;
;; (define (server:logf-get-start-info logf)
;; (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs
;; (handle-exceptions
;; exn
;; (list #f #f #f) ;; no idea what went wrong, call it a bad server
;; (with-input-from-file
;; logf
;; (lambda ()
;; (let loop ((inl (read-line))
;; (lnum 0))
;; (if (not (eof-object? inl))
;; (let ((mlst (string-match rx inl)))
;; (if (not mlst)
;; (if (< lnum 500) ;; give up if more than 500 lines of server log read
;; (loop (read-line)(+ lnum 1))
;; (list #f #f #f))
;; (let ((dat (cdr mlst)))
;; (list (car dat) ;; host
;; (string->number (cadr dat)) ;; port
;; (string->number (caddr dat))))))
;; (list #f #f #f))))))))
;;
;; ;; get a list of servers with all relevant data
;; ;; ( mod-time host port start-time pid )
;; ;;
;; (define (server:get-list areapath #!key (limit #f))
;; (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
;; (day-seconds (* 24 60 60)))
;; ;; if the directory exists continue to get the list
;; ;; otherwise attempt to create the logs dir and then
;; ;; continue
;; (if (if (directory-exists? (conc areapath "/logs"))
;; '()
;; (if (file-write-access? areapath)
;; (begin
;; (condition-case
;; (create-directory (conc areapath "/logs") #t)
;; (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
;; (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list.")))
;; (directory-exists? (conc areapath "/logs")))
;; '()))
;; (let* ((server-logs (glob (conc areapath "/logs/server-*.log")))
;; (num-serv-logs (length server-logs)))
;; (if (null? server-logs)
;; '()
;; (let loop ((hed (car server-logs))
;; (tal (cdr server-logs))
;; (res '()))
;; (let* ((mod-time (handle-exceptions
;; exn
;; (current-seconds) ;; 0
;; (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
;; (down-time (- (current-seconds) mod-time))
;; (serv-dat (if (or (< num-serv-logs 10)
;; (< down-time 900)) ;; day-seconds))
;; (server:logf-get-start-info hed)
;; '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at
;; (serv-rec (cons mod-time serv-dat))
;; (fmatch (string-match fname-rx hed))
;; (pid (if fmatch (string->number (list-ref fmatch 2)) #f))
;; (new-res (if (null? serv-dat)
;; res
;; (cons (append serv-rec (list pid)) res))))
;; (if (null? tal)
;; (if (and limit
;; (> (length new-res) limit))
;; new-res ;; (take new-res limit) <= need intelligent sorting before this will work
;; new-res)
;; (loop (car tal)(cdr tal) new-res)))))))))
;;
;; (define (server:get-num-alive srvlst)
;; (let ((num-alive 0))
;; (for-each
;; (lambda (server)
;; (match-let (((mod-time host port start-time pid)
;; server))
;; (let* ((uptime (- (current-seconds) mod-time))
;; (runtime (if start-time
;; (- mod-time start-time)
;; 0)))
;; (if (< uptime 5)(set! num-alive (+ num-alive 1))))))
;; srvlst)
;; num-alive))
;;
;; ;; given a list of servers get a list of valid servers, i.e. at least
;; ;; 10 seconds old, has started and is less than 1 hour old and is
;; ;; active (i.e. mod-time < 10 seconds
;; ;;
;; ;; mod-time host port start-time pid
;; ;;
;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off
;; ;; and servers should stick around for about two hours or so.
;; ;;
;; (define (server:get-best srvlst)
;; (let* ((nums (server:get-num-servers))
;; (now (current-seconds))
;; (slst (sort
;; (filter (lambda (rec)
;; (if (and (list? rec)
;; (> (length rec) 2))
;; (let ((start-time (list-ref rec 3))
;; (mod-time (list-ref rec 0)))
;; ;; (print "start-time: " start-time " mod-time: " mod-time)
;; (and start-time mod-time
;; (> (- now start-time) 0) ;; been running at least 0 seconds
;; (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds
;; (< (- now start-time)
;; (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600"))
;; 180)
;; (random 360))) ;; under one hour running time +/- 180
;; ))
;; #f))
;; srvlst)
;; (lambda (a b)
;; (< (list-ref a 3)
;; (list-ref b 3))))))
;; (if (> (length slst) nums)
;; (take slst nums)
;; slst)))
;;
;; (define (server:get-first-best areapath)
;; (let ((srvrs (server:get-best (server:get-list areapath))))
;; (if (and srvrs
;; (not (null? srvrs)))
;; (car srvrs)
;; #f)))
;;
;; (define (server:get-rand-best areapath)
;; (let ((srvrs (server:get-best (server:get-list areapath))))
;; (if (and (list? srvrs)
;; (not (null? srvrs)))
;; (let* ((len (length srvrs))
;; (idx (random len)))
;; (list-ref srvrs idx))
;; #f)))
;;
;;
;; (define (server:record->url servr)
;; (match-let (((mod-time host port start-time pid)
;; servr))
;; (if (and host port)
;; (conc host ":" port)
;; #f)))
;;
;; (define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value.
;; (if *my-client-signature* *my-client-signature*
;; (let ((sig (server:mk-signature)))
;; (set! *my-client-signature* sig)
;; *my-client-signature*)))
;;
;; ;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; ;; run-id to be launched
;; (define (server:kind-run areapath)
;; (if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
;; (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
;; (call-num (car last-run-dat))
;; (when-run (cadr last-run-dat))
;; (run-delay (+ (case call-num
;; ((0) 0)
;; ((1) 20)
;; ((2) 300)
;; (else 600))
;; (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
;; (lock-file (conc areapath "/logs/server-start.lock")))
;; (if (> (- (current-seconds) when-run) run-delay)
;; (begin
;; (common:simple-file-lock-and-wait lock-file expire-time: 15)
;; (server:run areapath)
;; (thread-sleep! 5) ;; don't release the lock for at least a few seconds
;; (common:simple-file-release-lock lock-file)))
;; (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))
;;
;; (define (server:start-and-wait areapath #!key (timeout 60))
;; (let ((give-up-time (+ (current-seconds) timeout)))
;; (let loop ((server-url (server:check-if-running areapath))
;; (try-num 0))
;; (if (or server-url
;; (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available.
;; server-url
;; (let ((num-ok (length (server:get-best (server:get-list areapath)))))
;; (if (and (> try-num 0) ;; first time through simply wait a little while then try again
;; (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one
;; (server:kind-run areapath))
;; (thread-sleep! 5)
;; (loop (server:check-if-running areapath)
;; (+ try-num 1)))))))
;;
;; (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG.
;;
;; (define (server:get-num-servers #!key (numservers 2))
;; (let ((ns (string->number
;; (or (configf:lookup *configdat* "server" "numservers") "notanumber"))))
;; (or ns numservers)))
;;
;; ;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;; ;;
;; (define (server:check-if-running areapath) ;; #!key (numservers "2"))
;; (let* ((ns (server:get-num-servers))
;; (servers (server:get-best (server:get-list areapath))))
;; ;; (print "servers: " servers " ns: " ns)
;; (if (or (and servers
;; (null? servers))
;; (not servers)
;; (and (list? servers)
;; (< (length servers) (random ns)))) ;; somewhere between 0 and numservers
;; #f
;; (let loop ((hed (car servers))
;; (tal (cdr servers)))
;; (let ((res (server:check-server hed)))
;; (if res
;; res
;; (if (null? tal)
;; #f
;; (loop (car tal)(cdr tal)))))))))
;;
;; ;; ping the given server
;; ;;
;; (define (server:check-server server-record)
;; (let* ((server-url (server:record->url server-record))
;; (res (case *transport-type*
;; ((http)(server:ping server-url))
;; ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
;; )))
;; (if res
;; server-url
;; #f)))
;;
;; ;; DO STUFF HERE - BUG
;; (define (server:kill servr)
;; (match-let (((mod-time hostname port start-time pid)
;; servr))
;; #;(tasks:kill-server hostname pid)
;; #f
;; ))
;;
;; ;; called in megatest.scm, host-port is string hostname:port
;; ;;
;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running
;; ;; in the same process as the server.
;; ;;
;; (define (server:ping host-port-in #!key (do-exit #f))
;; (let ((host:port (if (not host-port-in) ;; use read-dotserver to find
;; #f ;; (server:check-if-running *toppath*)
;; ;; (if (number? host-port-in) ;; we were handed a server-id
;; ;; (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in)))
;; ;; ;; (print "srec: " srec " host-port-in: " host-port-in)
;; ;; (if srec
;; ;; (conc (vector-ref srec 3) ":" (vector-ref srec 4))
;; ;; (conc "no such server-id " host-port-in)))
;; host-port-in))) ;; )
;; (let* ((host-port (if host:port
;; (let ((slst (string-split host:port ":")))
;; (if (eq? (length slst) 2)
;; (list (car slst)(string->number (cadr slst)))
;; #f))
;; #f)))
;; ;; (toppath (launch:setup)))
;; ;; (print "host-port=" host-port)
;; (if (not host-port)
;; (begin
;; (if host-port-in
;; (debug:print 0 *default-log-port* "ERROR: bad host:port"))
;; (if do-exit (exit 1))
;; #f)
;; (let* ((iface (car host-port))
;; (port (cadr host-port))
;; (server-dat (http-transport:client-connect iface port))
;; (login-res (rmt:login-no-auto-client-setup server-dat)))
;; (if (and (list? login-res)
;; (car login-res))
;; (begin
;; ;; (print "LOGIN_OK")
;; (if do-exit (exit 0))
;; #t)
;; (begin
;; ;; (print "LOGIN_FAILED")
;; (if do-exit (exit 1))
;; #f)))))))
;;
;; ;; run ping in separate process, safest way in some cases
;; ;;
;; (define (server:ping-server ifaceport)
;; (with-input-from-pipe
;; (conc (common:get-megatest-exe) " -ping " ifaceport)
;; (lambda ()
;; (let loop ((inl (read-line))
;; (res "NOREPLY"))
;; (if (eof-object? inl)
;; (case (string->symbol res)
;; ((NOREPLY) #f)
;; ((LOGIN_OK) #t)
;; (else #f))
;; (loop (read-line) inl))))))
;;
;; ;; NOT USED (well, ok, reference in rpc-transport but otherwise not used).
;; ;;
;; (define (server:login toppath)
;; (lambda (toppath)
;; (set! *db-last-access* (current-seconds)) ;; might not be needed.
;; (if (equal? *toppath* toppath)
;; #t
;; #f)))
;;
;; ;; timeout is hms string: 1h 5m 3s, default is 1 minute
;; ;;
;; (define (server:expiration-timeout)
;; (let ((tmo (configf:lookup *configdat* "server" "timeout")))
;; (if (and (string? tmo)
;; (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below
;; (* 3600 (string->number tmo))
;; 60)))
;;
;; (define (server:get-best-guess-address hostname)
;; (let ((res #f))
;; (for-each
;; (lambda (adr)
;; (if (not (eq? (u8vector-ref adr 0) 127))
;; (set! res adr)))
;; ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME
;; (vector->list (hostinfo-addresses (hostname->hostinfo hostname))))
;; (string-intersperse
;; (map number->string
;; (u8vector->list
;; (if res res (hostname->ip hostname)))) ".")))
;;
;; ;; moving this here as it needs access to db and cannot be in common.
;; ;;
;; (define (server:writable-watchdog dbstruct)
;; (thread-sleep! 0.05) ;; delay for startup
;; (let ((legacy-sync (common:run-sync?))
;; (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300))
;; (debug-mode (debug:debug-mode 1))
;; (last-time (current-seconds))
;; (no-sync-db (db:open-no-sync-db))
;; (sync-duration 0) ;; run time of the sync in milliseconds
;; ;;(this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x)))
;; )
;; (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls
;; (debug:print-info 2 *default-log-port* "Periodic sync thread started.")
;; (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id) );; " this-wd-num="this-wd-num)
;; (if (and legacy-sync (not *time-to-exit*))
;; (let* (;;(dbstruct (db:setup))
;; (mtdb (dbr:dbstruct-mtdb dbstruct))
;; (mtpath (db:dbdat-get-path mtdb))
;; (tmp-area (common:get-db-tmp-area))
;; (start-file (conc tmp-area "/.start-sync"))
;; (end-file (conc tmp-area "/.end-sync")))
;; (debug:print-info 0 *default-log-port* "Server running, periodic sync started.")
;; (let loop ()
;; ;; sync for filesystem local db writes
;; ;;
;; (mutex-lock! *db-multi-sync-mutex*)
;; (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write
;; (sync-in-progress *db-sync-in-progress*)
;; (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 5))
;; (should-sync (and (not *time-to-exit*)
;; (> (- (current-seconds) *db-last-sync*) min-intersync-delay))) ;; sync every five seconds minimum, deprecated logic, can probably be removed
;; (start-time (current-seconds))
;; (cpu-load-adj (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)))
;; (mt-mod-time (file-modification-time mtpath))
;; (last-sync-start (if (common:file-exists? start-file)
;; (file-modification-time start-file)
;; 0))
;; (last-sync-end (if (common:file-exists? end-file)
;; (file-modification-time end-file)
;; 10))
;; (sync-period (+ 3 (* cpu-load-adj 30))) ;; as adjusted load increases increase the sync period
;; (recently-synced (and (< (- start-time mt-mod-time) sync-period) ;; not useful if sync didn't modify megatest.db!
;; (< mt-mod-time last-sync-start)))
;; (sync-done (<= last-sync-start last-sync-end))
;; (sync-stale (> start-time (+ last-sync-start sync-stale-seconds)))
;; (will-sync (and (not *time-to-exit*) ;; do not start a sync if we are in the process of exiting
;; (or need-sync should-sync)
;; (or sync-done sync-stale)
;; (not sync-in-progress)
;; (not recently-synced))))
;; (debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop. need-sync="need-sync" sync-in-progress=" sync-in-progress
;; " should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync
;; " sync-done=" sync-done " sync-period=" sync-period)
;; (if (and (> sync-period 5)
;; (common:low-noise-print 30 "sync-period"))
;; (debug:print-info 0 *default-log-port* "Increased sync period due to long sync times, sync took: " sync-period " seconds."))
;; ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced))
;; ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync)
;; (if will-sync (set! *db-sync-in-progress* #t))
;; (mutex-unlock! *db-multi-sync-mutex*)
;; (if will-sync
;; (let (;; (max-sync-duration (configf:lookup-number *configdat* "server" "max-sync-duration")) ;; KEEPING THIS AVAILABLE BUT SHOULD NOT USE, I'M PRETTY SURE IT DOES NOT WORK!
;; (sync-start (current-milliseconds)))
;; (with-output-to-file start-file (lambda ()(print (current-process-id))))
;;
;; ;; put lock here
;;
;; ;; (if (or (not max-sync-duration)
;; ;; (< sync-duration max-sync-duration)) ;; NOTE: db:sync-to-megatest.db keeps track of time of last sync and syncs incrementally
;; (let ((res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive
;; (set! sync-duration (- (current-milliseconds) sync-start))
;; (if (> res 0) ;; some records were transferred, keep the db alive
;; (begin
;; (mutex-lock! *heartbeat-mutex*)
;; (set! *db-last-access* (current-seconds))
;; (mutex-unlock! *heartbeat-mutex*)
;; (debug:print-info 0 *default-log-port* "sync called, " res " records transferred."))
;; (debug:print-info 2 *default-log-port* "sync called but zero records transferred")))))
;; ;; ;; TODO: factor this next routine out into a function
;; ;; (with-input-from-pipe ;; this should not block other threads but need to verify this
;; ;; (conc "megatest -sync-to-megatest.db -m testsuite:" (common:get-area-name) ":" *toppath*)
;; ;; (lambda ()
;; ;; (let loop ((inl (read-line))
;; ;; (res #f))
;; ;; (if (eof-object? inl)
;; ;; (begin
;; ;; (set! sync-duration (- (current-milliseconds) sync-start))
;; ;; (cond
;; ;; ((not res)
;; ;; (debug:print 0 *default-log-port* "ERROR: sync from /tmp db to megatest.db appears to have failed. Recommended that you stop your runs and run \"megatest -cleanup-db\""))
;; ;; ((> res 0)
;; ;; (mutex-lock! *heartbeat-mutex*)
;; ;; (set! *db-last-access* (current-seconds))
;; ;; (mutex-unlock! *heartbeat-mutex*))))
;; ;; (let ((num-synced (let ((matches (string-match "^Synced (\\d+).*$" inl)))
;; ;; (if matches
;; ;; (string->number (cadr matches))
;; ;; #f))))
;; ;; (loop (read-line)
;; ;; (or num-synced res))))))))))
;; (if will-sync
;; (begin
;; (mutex-lock! *db-multi-sync-mutex*)
;; (set! *db-sync-in-progress* #f)
;; (set! *db-last-sync* start-time)
;; (with-output-to-file end-file (lambda ()(print (current-process-id))))
;;
;; ;; release lock here
;;
;; (mutex-unlock! *db-multi-sync-mutex*)))
;; (if (and debug-mode
;; (> (- start-time last-time) 60))
;; (begin
;; (set! last-time start-time)
;; (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*))))))
;;
;; ;; keep going unless time to exit
;; ;;
;; (if (not *time-to-exit*)
;; (let delay-loop ((count 0))
;; ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*)
;;
;; (if (and (not *time-to-exit*)
;; (< count 6)) ;; was 11, changing to 4.
;; (begin
;; (thread-sleep! 1)
;; (delay-loop (+ count 1))))
;; (if (not *time-to-exit*) (loop))))
;; ;; time to exit, close the no-sync db here
;; (db:no-sync-close-db no-sync-db)
;; (if (common:low-noise-print 30)
;; (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) ))))))) ;;" this-wd-num="this-wd-num)))))))
;;