Index: Makefile
==================================================================
--- Makefile
+++ Makefile
@@ -36,13 +36,14 @@
artifacts.scm
all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut
# dbmod.import.o is just a hack here
-mofiles/dbfile.o : mofiles/debugprint.o mofiles/commonmod.o dbmod.import.o
+mofiles/dbfile.o : mofiles/debugprint.o mofiles/commonmod.o # dbmod.import.o
db.o : dbmod.import.o
mofiles/debugprint.o : mofiles/mtargs.o
+mofiles/servermod.o : mofiles/artifacts.o
# ftail.scm rmtmod.scm commonmod.scm removed
# MSRCFILES = ducttape-lib.scm pkts.scm stml2.scm cookie.scm mutils.scm \
# mtargs.scm commonmod.scm dbmod.scm adjutant.scm ulex.scm \
# rmtmod.scm apimod.scm
@@ -379,18 +380,18 @@
$(PREFIX)/bin/.$(ARCHSTR)/mtexec $(PREFIX)/bin/mtexec $(PREFIX)/bin/serialize-env \
$(PREFIX)/bin/nbfind $(PREFIX)/bin/mtrunner $(PREFIX)/bin/viewscreen $(PREFIX)/bin/mt_xterm \
$(PREFIX)/bin/mt-old-to-new.sh $(PREFIX)/bin/mt-new-to-old.sh \
$(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun \
$(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun $(PREFIX)/bin/mtutil \
- $(PREFIX)/bin/tcmt $(PREFIX)/share/db/mt-pg.sql \
+ $(PREFIX)/share/db/mt-pg.sql \
$(PREFIX)/share/js/jquery-3.1.0.slim.min.js \
$(PREFIX)/bin/.$(ARCHSTR)/lib/libpangox-1.0.so \
$(PREFIX)/bin/.$(ARCHSTR)/lib/libpangox-1.0.so.0 \
$(PREFIX)/bin/.$(ARCHSTR)/lib/libxcb-xlib.so.0
# $(PREFIX)/bin/.$(ARCHSTR)/ndboard
-# $(PREFIX)/bin/newdashboard
+# $(PREFIX)/bin/newdashboard $(PREFIX)/bin/tcmt
$(PREFIX)/bin/.$(ARCHSTR) :
mkdir -p $(PREFIX)/bin/.$(ARCHSTR)
mkdir -p $(PREFIX)/bin/.$(ARCHSTR)/lib
Index: artifacts/artifacts.scm
==================================================================
--- artifacts/artifacts.scm
+++ artifacts/artifacts.scm
@@ -96,10 +96,12 @@
;; '((foods (fruit . f)
;; (meat . m)))))
;; => "beef"
;;
+;; NOTE: We call artifacts "arfs"
+
(module artifacts
(
;; cards, util and misc
;; sort-cards
;; calc-sha1
@@ -139,10 +141,11 @@
get-value ;; looks up a value given a key in a dartifact
flatten-all ;; merge the list of values from a query which includes a artifact into a flat alist <== really useful!
check-artifact
;; artifact alists
+get-artifact-fname
write-alist->artifact
read-artifact->alist
;; archive database
;; archive-open-db
@@ -1089,15 +1092,18 @@
;;======================================================================
;; Read/write packets to files (convience functions)
;;======================================================================
+(define (get-artifact-fname targdir uuid)
+ (conc targdir "/" uuid ".artifact"))
+
;; write alist to a artifact file
;;
(define (write-alist->artifact targdir dat #!key (artifactspec '())(ptype #f))
(let-values (((uuid artifact)(alist->artifact dat artifactspec ptype: ptype)))
- (with-output-to-file (conc targdir "/" uuid ".artifact")
+ (with-output-to-file (get-artifact-fname targdir uuid)
(lambda ()
(print artifact)))
uuid)) ;; return the uuid
;; read artifact into alist
Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -58,11 +58,12 @@
;; (declare (uses ftail))
;; (import ftail)
(import dbmod
commonmod
- dbfile)
+ dbfile
+ servermod)
(define *db* #f) ;; this is only for the repl, do not use in general!!!!
(include "common_records.scm")
(include "key_records.scm")
@@ -922,13 +923,29 @@
;;======================================================================
;; Server? Start up here.
;;
(if (args:get-arg "-server")
- (let ((tl (launch:setup)))
+ (let* ((tl (launch:setup))
+ (srvdat (server:setup tl))
+ (handler (lambda (dbstruct cmd params)
+ (api:execute-requests dbstruct (if (string? cmd)
+ (string->symbol cmd)
+ cmd)
+ (db:string->obj params)))))
+ (server:set-handler srvdat handler)
+ (srv-obj-to-str-set! srvdat db:obj->string)
+ (srv-str-to-obj-set! srvdat db:string->obj)
+ (srv-dbstruct-set! srvdat (db:setup #t))
+ (thread-join!
+ (thread-start! (make-thread
+ (lambda ()
+ (server:run srvdat)))))
+
;; (server:launch 0 'http)
- (http-transport:launch)
+ ;; (http-transport:launch) ;; NOTE: Need to replace this call
+
(set! *didsomething* #t)))
;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to
;; a specific Megatest area. Detail are being hashed out and this may change.
;;
Index: servermod.scm
==================================================================
--- servermod.scm
+++ servermod.scm
@@ -1,6 +1,6 @@
-;; Copyright 2006-2017, Matthew Welland.
+;; Copyright 2006-2023, Matthew Welland.
;;
;; This file is part of Megatest.
;;
;; Megatest is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
@@ -12,13 +12,14 @@
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with Megatest. If not, see .
-;;
+;;======================================================================
(declare (unit servermod))
+(declare (uses artifacts))
(use md5 message-digest posix typed-records extras)
(module servermod
*
@@ -29,453 +30,163 @@
extras
md5
message-digest
ports
posix
+ srfi-18
typed-records
data-structures
- )
-(define *client-server-id* #f)
+ artifacts
+ )
(defstruct srv
(areapath #f)
(host #f)
(pid #f)
(type #f)
(dir #f)
- )
-;;======================================================================
-;; S E R V E R U T I L I T I E S
-;;======================================================================
-
-;; Generate a unique signature for this server
-(define (mk-signature)
- (message-digest-string (md5-primitive)
- (with-output-to-string
- (lambda ()
- (write (list (current-directory)
- (current-process-id)
- (argv)))))))
-
-(define (get-client-server-id)
- (if *client-server-id* *client-server-id*
- (let ((sig (mk-signature))) ;; clients re-use the server:mk-signature logic
- (set! *client-server-id* sig)
- *client-server-id*)))
-
-;; if srvdat is #f calculate host.pid
-(define (get-host.pid srvdat)
- (if srvdat
- (conc (srv-host srvdat)"."(srv-pid srvdat))
- (conc (get-host-name)"."(current-process-id))))
+ (incoming #f)
+ (dbstruct #f)
+ (handler #f)
+ (obj-to-str #f)
+ (str-to-obj #f)
+ )
;; nearly every process in Megatest (if write access) starts a server so it
;; can receive messages to exit on request
+;; servers have a type, mtserve, dboard, runner, execute? TOO COMPLICATED.
;; one server per run db file would be ideal.
-;; servers have a type, mtserve, dboard, runner, execute
-
-;; mtrah/.servers//./incoming/*.artifact
-;; | `attic
-;; |
-;; (note: not needed? (i)) `outgoing/./*.artifact
-;; | `attic
-;; `.host:port
-
-;; (i) Not needed if it is expected that all processes run a server
+;; mtrah/.servers/./incoming/*.artifact
+;; | `attic
+;; |
+;; `outgoing/./*.artifact
+;; | `attic
+;; `.host:port
;; on exit processes clean up. only mtserv or dboard clean up abandoned records?
+
+;; IDEA: All requests could go into one directory instead of server specific directory - need locking
+;; don't get multiple processing of arfs
;; server:setup - setup the directory
;; server:launch - start a new mtserve process, possibly
;; using a launcher
;; server:run - run the long running thread that monitors
;; the .server area
;; server:exit - shutdown the server and exit
;; server:handle-request - take incoming request, process it, send response
;; back via best or fastest available transport
+
+;; call this with handler that takes dbstruct cmd and params after doing server:setup
+;; and before starting server:run
+;;
+(define (server:set-handler srvdat handler)
+ (srv-handler-set! srvdat handler))
;; set up the server area and return a server struct
;; NOTE: This will need to be gated by write-access
;;
(define (server:setup areapath)
(let* ((srvdat (make-srv
areapath: areapath
host: (get-host-name) ;; likely need to replace with ip address
pid: (current-process-id)
- ;; type: srvtype
))
- (srvdir (conc areapath"/.server/"srvtype"/"(get-host.pid srvdat))))
+ (srvdir (conc areapath"/.server/"(get-host.pid srvdat))))
(srv-dir-set! srvdat srvdir)
+ (srv-incoming-set! srvdat (conc srvdir"/incoming"))
(create-directory srvdir #t)
(for-each (lambda (d)
(create-directory (conc srvdir"/"d)))
'("incoming" "responses"))
srvdat))
+(define *server-keep-running* #f)
+
+;; to cleanly shut the server down set *server-keep-running* to #f
+;;
+(define (server:run srvdat)
+ ;; create server arf
+ ;; put arf in srvdat-dir
+ ;; forever
+ ;; scan incoming dir
+ ;; foreach arf
+ ;; bundle into with-transaction, no-transaction
+ ;; foreach bundle
+ ;; process the request
+ ;; create results arf and write it to clients dir
+ ;; remove in-arf from incoming
+ (let* ((areapath (srv-areapath srvdat))
+ (srvinfod (server:get-servinfo-dir areapath))
+ (myarf (srv->alist srvdat))
+ (myuuid (write-alist->artifact srvinfod myarf ptype: 'S))
+ (arf-fname (get-artifact-fname srvinfod myuuid))
+ (dbstruct (srv-dbstruct srvdat)))
+ (set! *server-keep-running* #t)
+ (let loop ()
+ (let* ((start (current-milliseconds))
+ (res (server:process-incoming srvdat))
+ (delta (- (current-milliseconds) start)))
+ (thread-sleep! (if (> delta 500)
+ 0.1
+ 0.9))
+ (if (or (> res 0) ;; res is the number of requests that were found and processed
+ *server-keep-running*)
+ (loop))))))
+
+;; read arfs from incoming, process them and put result arfs in proper dirs
+;; return number requests found and processed
+;;
+(define (server:process-incoming srvdat)
+ (let* ((srvdir (srv-dir srvdat))
+ (indir (srv-incoming srvdat))
+ (arfs (glob (conc indir"/*.artifacts")))
+ (handler (srv-handler srvdat))
+ (obj->string (srv-obj-to-str srvdat))
+ (dbstruct (srv-dbstruct srvdat)))
+ (let loop ((rem arfs))
+ (if (not (null? arfs))
+ (let* ((arf (car rem))
+ (dat (read-artifact->alist arf))
+ (ruuid (alist-ref 'Z dat))
+ (host (alist-ref 'h dat))
+ (pid (alist-ref 'i dat))
+ (dest (conc srvdir"/"host"."pid"/responses"))
+ (cmd (alist-ref 'c dat))
+ (params (alist-ref 'p dat))
+ (res (handler dbstruct cmd params))
+ (narf `((r . ,(obj->string res))
+ (P . ,ruuid))))
+ (delete-file arf) ;; add ability to save in bundles in archive area
+ (write-alist->artifact dest narf ptype: 'Q)
+ (loop (cdr rem)))))
+ (length arfs)))
+
+;; start a server process (NOT start server in this process)
+;;
;; maybe check load before calling this?
(define (server:launch areapath)
(let* ((logd (conc areapath"/logs"))
(logf (conc logd"/from-"(get-host.pid #f)".log")))
(if (not (file-exists? logd))(create-directory logd #t))
(setenv "NBFAKE_LOG" logf)
(system (conc "nbfake mtserve -start-dir "areapath))))
-;; ;; When using zmq this would send the message back (two step process)
-;; ;; with spiffy or rpc this simply returns the return data to be returned
-;; ;;
-;; (define (server:reply return-addr query-sig success/fail result)
-;; (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result)
-;; ;; (send-message pubsock target send-more: #t)
-;; ;; (send-message pubsock
-;; (case (server:get-transport)
-;; ((rpc) (db:obj->string (vector success/fail query-sig result)))
-;; ((http) (db:obj->string (vector success/fail query-sig result)))
-;; ((fs) result)
-;; (else
-;; (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
-;; result)))
-;;
-;; ;; Given an area path, start a server process ### NOTE ### > file 2>&1
-;; ;; if the target-host is set
-;; ;; try running on that host
-;; ;; incidental: rotate logs in logs/ dir.
-;; ;;
-;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area
-;; (let* ((testsuite (common:get-testsuite-name))
-;; (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
-;; (profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
-;; ""))
-;; (cmdln (conc (common:get-megatest-exe)
-;; " -server - ";; (or target-host "-")
-;; (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
-;; " -daemonize "
-;; "")
-;; ;; " -log " logfile
-;; " -m testsuite:" testsuite
-;; " " profile-mode
-;; )) ;; (conc " >> " logfile " 2>&1 &")))))
-;; (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!?
-;; (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0)))
-;; ;; we want the remote server to start in *toppath* so push there
-;; (push-directory areapath)
-;; (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
-;; (thread-start! log-rotate)
-;;
-;; ;; host.domain.tld match host?
-;; ;; (if (and target-host
-;; ;; ;; look at target host, is it host.domain.tld or ip address and does it
-;; ;; ;; match current ip or hostname
-;; ;; (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
-;; ;; (not (equal? curr-ip target-host)))
-;; ;; (begin
-;; ;; (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
-;; ;; (setenv "TARGETHOST" target-host)))
-;; ;;
-;; (setenv "TARGETHOST_LOGF" logfile)
-;; (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time
-;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
-;; (system (conc "nbfake " cmdln))
-;; (unsetenv "TARGETHOST_LOGF")
-;; ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
-;; (thread-join! log-rotate)
-;; (pop-directory)))
-;;
-;; ;; given a path to a server log return: host port startseconds server-id
-;; ;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let
-;; ;; example of what it's looking for in the log file:
-;; ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4
-;;
-;; (define (server:logf-get-start-info logf)
-;; (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id
-;; (dbprep-rx (regexp "^SERVER: dbprep"))
-;; (dbprep-found 0)
-;; (bad-dat (list #f #f #f #f #f)))
-;; (handle-exceptions
-;; exn
-;; (begin
-;; ;; WARNING: this is potentially dangerous to blanket ignore the errors
-;; (if (file-exists? logf)
-;; (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn))
-;; bad-dat) ;; no idea what went wrong, call it a bad server
-;; (with-input-from-file
-;; logf
-;; (lambda ()
-;; (let loop ((inl (read-line))
-;; (lnum 0))
-;; (if (not (eof-object? inl))
-;; (let ((mlst (string-match server-rx inl))
-;; (dbprep (string-match dbprep-rx inl)))
-;; (if dbprep (set! dbprep-found 1))
-;; (if (not mlst)
-;; (if (< lnum 500) ;; give up if more than 500 lines of server log read
-;; (loop (read-line)(+ lnum 1))
-;; (begin
-;; (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf )
-;; bad-dat))
-;; (match mlst
-;; ((_ host port start server-id pid)
-;; (list host
-;; (string->number port)
-;; (string->number start)
-;; server-id
-;; (string->number pid)))
-;; (else
-;; (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst)
-;; bad-dat))))
-;; (begin
-;; (if dbprep-found
-;; (begin
-;; (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time))
-;; (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting?
-;; (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds))))
-;; bad-dat))))))))
-;;
-;; ;; ;; get a list of servers from the log files, with all relevant data
-;; ;; ;; ( mod-time host port start-time pid )
-;; ;; ;;
-;; ;; (define (server:get-list areapath #!key (limit #f))
-;; ;; (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
-;; ;; (day-seconds (* 24 60 60)))
-;; ;; ;; if the directory exists continue to get the list
-;; ;; ;; otherwise attempt to create the logs dir and then
-;; ;; ;; continue
-;; ;; (if (if (directory-exists? (conc areapath "/logs"))
-;; ;; '()
-;; ;; (if (file-write-access? areapath)
-;; ;; (begin
-;; ;; (condition-case
-;; ;; (create-directory (conc areapath "/logs") #t)
-;; ;; (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
-;; ;; (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn)))
-;; ;; (directory-exists? (conc areapath "/logs")))
-;; ;; '()))
-;; ;;
-;; ;; ;; Get the list of server logs.
-;; ;; (let* (
-;; ;; ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers.
-;; ;; ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'")))
-;; ;; (server-logs (glob (conc areapath "/logs/server-*-*.log")))
-;; ;; (num-serv-logs (length server-logs)))
-;; ;; (if (or (null? server-logs) (= num-serv-logs 0))
-;; ;; (let ()
-;; ;; (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time))
-;; ;; '()
-;; ;; )
-;; ;; (let loop ((hed (string-chomp (car server-logs)))
-;; ;; (tal (cdr server-logs))
-;; ;; (res '()))
-;; ;; (let* ((mod-time (handle-exceptions
-;; ;; exn
-;; ;; (begin
-;; ;; (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn)
-;; ;; (current-seconds)) ;; 0
-;; ;; (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
-;; ;; (down-time (- (current-seconds) mod-time))
-;; ;; (serv-dat (if (or (< num-serv-logs 10)
-;; ;; (< down-time 900)) ;; day-seconds))
-;; ;; (server:logf-get-start-info hed)
-;; ;; '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at
-;; ;; (serv-rec (cons mod-time serv-dat))
-;; ;; (fmatch (string-match fname-rx hed))
-;; ;; (pid (if fmatch (string->number (list-ref fmatch 2)) #f))
-;; ;; (new-res (if (null? serv-dat)
-;; ;; res
-;; ;; (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let
-;; ;; (if (null? tal)
-;; ;; (if (and limit
-;; ;; (> (length new-res) limit))
-;; ;; new-res ;; (take new-res limit) <= need intelligent sorting before this will work
-;; ;; new-res)
-;; ;; (loop (string-chomp (car tal)) (cdr tal) new-res)))))))))
-;;
-;; #;(define (server:get-num-alive srvlst)
-;; (let ((num-alive 0))
-;; (for-each
-;; (lambda (server)
-;; (handle-exceptions
-;; exn
-;; (begin
-;; (debug:print-info 0 *default-log-port* "Unable to get server start-time and/or mod-time from " server ", exn=" exn))
-;; (match-let (((mod-time host port start-time server-id pid)
-;; server))
-;; (let* ((uptime (- (current-seconds) mod-time))
-;; (runtime (if start-time
-;; (- mod-time start-time)
-;; 0)))
-;; (if (< uptime 5)(set! num-alive (+ num-alive 1)))))))
-;; srvlst)
-;; num-alive))
-;;
-;; ;; ;; given a list of servers get a list of valid servers, i.e. at least
-;; ;; ;; 10 seconds old, has started and is less than 1 hour old and is
-;; ;; ;; active (i.e. mod-time < 10 seconds
-;; ;; ;;
-;; ;; ;; mod-time host port start-time pid
-;; ;; ;;
-;; ;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off
-;; ;; ;; and servers should stick around for about two hours or so.
-;; ;; ;;
-;; ;; (define (server:get-best srvlst)
-;; ;; (let* ((nums (server:get-num-servers))
-;; ;; (now (current-seconds))
-;; ;; (slst (sort
-;; ;; (filter (lambda (rec)
-;; ;; (if (and (list? rec)
-;; ;; (> (length rec) 2))
-;; ;; (let ((start-time (list-ref rec 3))
-;; ;; (mod-time (list-ref rec 0)))
-;; ;; ;; (print "start-time: " start-time " mod-time: " mod-time)
-;; ;; (and start-time mod-time
-;; ;; (> (- now start-time) 0) ;; been running at least 0 seconds
-;; ;; (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds
-;; ;; (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set
-;; ;; (< (- now start-time)
-;; ;; (+ (- (string->number (configf:lookup *configdat* "server" "runtime"))
-;; ;; 180)
-;; ;; (random 360)))) ;; under one hour running time +/- 180
-;; ;; ))
-;; ;; #f))
-;; ;; srvlst)
-;; ;; (lambda (a b)
-;; ;; (< (list-ref a 3)
-;; ;; (list-ref b 3))))))
-;; ;; (if (> (length slst) nums)
-;; ;; (take slst nums)
-;; ;; slst)))
-;;
-;; ;; ;; switch from server:get-list to server:get-servers-info
-;; ;; ;;
-;; ;; (define (server:get-first-best areapath)
-;; ;; (let ((srvrs (server:get-best (server:get-list areapath))))
-;; ;; (if (and srvrs
-;; ;; (not (null? srvrs)))
-;; ;; (car srvrs)
-;; ;; #f)))
-;; ;;
-;; ;; (define (server:get-rand-best areapath)
-;; ;; (let ((srvrs (server:get-best (server:get-list areapath))))
-;; ;; (if (and (list? srvrs)
-;; ;; (not (null? srvrs)))
-;; ;; (let* ((len (length srvrs))
-;; ;; (idx (random len)))
-;; ;; (list-ref srvrs idx))
-;; ;; #f)))
-;;
-;; (define (server:record->id servr)
-;; (handle-exceptions
-;; exn
-;; (begin
-;; (debug:print-info 0 *default-log-port* "Unable to get server id from " servr ", exn=" exn)
-;; #f)
-;; (match-let (((host port start-time server-id pid)
-;; servr))
-;; (if server-id
-;; server-id
-;; #f))))
-;;
-;; (define (server:record->url servr)
-;; (handle-exceptions
-;; exn
-;; (begin
-;; (debug:print-info 0 *default-log-port* "Unable to get server url from " servr ", exn=" exn)
-;; #f)
-;; (match-let (((host port start-time server-id pid)
-;; servr))
-;; (if (and host port)
-;; (conc host ":" port)
-;; #f))))
-;;
-;;
-;; ;; if server-start-last exists, and wasn't old enough, wait + 1, then call this function recursively until it is old enough.
-;; ;; if it is old enough, overwrite it and wait 0.25 seconds.
-;; ;; if it then has the wrong server key, wait + 1 and call this function recursively.
-;; ;;
-;; #;(define (server:wait-for-server-start-last-flag areapath)
-;; (let* ((start-flag (conc areapath "/logs/server-start-last"))
-;; ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
-;; (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4))
-;; (server-key (conc (get-host-name) "-" (current-process-id))))
-;; (if (file-exists? start-flag)
-;; (let* ((fmodtime (file-modification-time start-flag))
-;; (delta (- (current-seconds) fmodtime))
-;; (old-enough (> delta idletime))
-;; (new-server-key ""))
-;; ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than seconds, and the new file still has the same server key as you just wrote, return #t.
-;; ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process.
-;; (if (and old-enough
-;; (begin
-;; (debug:print-info 2 *default-log-port* "Writing " start-flag)
-;; (with-output-to-file start-flag (lambda () (print server-key)))
-;; (thread-sleep! 0.25)
-;; (set! new-server-key (with-input-from-file start-flag (lambda () (read-line))))
-;; (equal? server-key new-server-key)))
-;; #t
-;; ;; If either of the above conditions is not true, print a "Gating server start" message, wait + 1, then call this function recursively.
-;; (begin
-;; (debug:print-info 0 *default-log-port* "Gating server start, last start: "
-;; (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server"))
-;;
-;; (thread-sleep! ( + 1 idletime))
-;; (server:wait-for-server-start-last-flag areapath)))))))
-;;
-;; ;; oldest server alive determines host then choose random of youngest
-;; ;; five servers on that host
-;; ;;
-;; (define (server:get-servers-info areapath)
-;; ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.")
-;; (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo")))
-;; (if (not (file-exists? servinfodir))
-;; (create-directory servinfodir))
-;; (let* ((allfiles (glob (conc servinfodir"/*")))
-;; (res (make-hash-table)))
-;; (for-each
-;; (lambda (f)
-;; (let* ((hostport (pathname-strip-directory f))
-;; (serverdat (server:logf-get-start-info f)))
-;; (match serverdat
-;; ((host port start server-id pid)
-;; (if (and host port start server-id pid)
-;; (hash-table-set! res hostport serverdat)
-;; (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat)))
-;; (else
-;; (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat)))))
-;; allfiles)
-;; res)))
-;;
-;; ;; check the .servinfo directory, are there other servers running on this
-;; ;; or another host?
-;; ;;
-;; ;; returns #t => ok to start another server
-;; ;; #f => not ok to start another server
-;; ;;
-;; (define (server:minimal-check areapath)
-;; (server:clean-up-old areapath)
-;; (let* ((srvdir (server:get-servinfo-dir areapath)) ;; (conc areapath"/.servinfo"))
-;; (servrs (glob (conc srvdir"/*")))
-;; (thishostip (server:get-best-guess-address (get-host-name)))
-;; (thisservrs (glob (conc srvdir"/"thishostip":*")))
-;; (homehostinf (server:choose-server areapath 'homehost))
-;; (havehome (car homehostinf))
-;; (wearehome (cdr homehostinf)))
-;; (debug:print-info 0 *default-log-port* thishostip", have homehost: "havehome", we are homehost: "wearehome
-;; ", numservers: "(length thisservrs))
-;; (cond
-;; ((not havehome) #t) ;; no homehost yet, go for it
-;; ((and havehome wearehome (< (length thisservrs) 20)) #t) ;; we are home and less than 20 servers, ok to start another
-;; ((and havehome (not wearehome)) #f) ;; we are not the home host
-;; ((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running
-;; (else
-;; (debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs)
-;; #t))))
-;;
-;;
-;; (define server-last-start 0)
-;;
+;;======================================================================
+;; OLD SERVER STUFF BELOW HERE
+;;======================================================================
+
+;; ;; servers start by setting up fs transport
+;; ;; and put a flag file for that ASAP.
+;; ;; they then set up tcp and put a flag file for
+;; ;; that
+;; ;;
+;; (define *client-server-id* #f)
;;
;; ;; oldest server alive determines host then choose random of youngest
;; ;; five servers on that host
;; ;;
;; ;; mode:
@@ -490,21 +201,20 @@
;; ;; 1. sort by age descending
;; ;; 2. take five
;; ;; 3. check alive, discard if not and repeat
;; ;; first we clean up old server files
;; (server:clean-up-old areapath)
-;; (let* ((since-last (- (current-seconds) server-last-start))
-;; (server-start-delay 10))
-;; (if ( < (- (current-seconds) server-last-start) 10 )
-;; (begin
-;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start))
-;; (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds")
-;; (thread-sleep! server-start-delay)
-;; )
-;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start))
-;; )
-;; )
+;; ;; (let* ((since-last (- (current-seconds) server-last-start))
+;; ;; (server-start-delay 10))
+;; ;; (if ( < (- (current-seconds) server-last-start) 10 )
+;; ;; (begin
+;; ;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start))
+;; ;; (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds")
+;; ;; (thread-sleep! server-start-delay)
+;; ;; )
+;; ;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start))
+;; ;; )
;; (let* ((serversdat (server:get-servers-info areapath))
;; (servkeys (hash-table-keys serversdat))
;; (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last
;; (sort servkeys ;; list of "host:port"
;; (lambda (a b)
@@ -519,15 +229,15 @@
;; (host (list-ref oldest-dat 0))
;; (all-valid (filter (lambda (x)
;; (equal? host (list-ref (hash-table-ref serversdat x) 0)))
;; by-time-asc))
;; (best-ten (lambda ()
-;; (if (> (length all-valid) 11)
-;; (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out
-;; (if (> (length all-valid) 8)
-;; (drop-right all-valid 1)
-;; all-valid))))
+;; (if (> (length all-valid) 11)
+;; (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out
+;; (if (> (length all-valid) 8)
+;; (drop-right all-valid 1)
+;; all-valid))))
;; (names->dats (lambda (names)
;; (map (lambda (x)
;; (hash-table-ref serversdat x))
;; names)))
;; (am-home? (lambda ()
@@ -556,16 +266,29 @@
;; (set! server-last-start (current-seconds))
;; ;; (thread-sleep! 3)
;; (case mode
;; ((homehost) (cons #f #f))
;; (else #f))))))
-;;
-;; (define (server:get-servinfo-dir areapath)
-;; (let* ((spath (conc areapath"/.servinfo")))
-;; (if (not (file-exists? spath))
-;; (create-directory spath #t))
-;; spath))
+
+;;======================================================================
+;; S E R V E R U T I L I T I E S
+;;======================================================================
+
+(define (server:get-servinfo-dir areapath)
+ (let* ((spath (conc areapath"/.servinfo")))
+ (if (not (file-exists? spath))
+ (create-directory spath #t))
+ spath))
+
+;; ;; Generate a unique signature for this server
+;; (define (mk-signature)
+;; (message-digest-string (md5-primitive)
+;; (with-output-to-string
+;; (lambda ()
+;; (write (list (current-directory)
+;; (current-process-id)
+;; (argv)))))))
;;
;; (define (server:clean-up-old areapath)
;; ;; any server file that has not been touched in ten minutes is effectively dead
;; (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*"))))
;; (for-each
@@ -585,309 +308,785 @@
;; exn
;; (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile)
;; (delete-file sfile))))))
;; sfiles)))
;;
-;; ;; would like to eventually get rid of this
-;; ;;
-;; (define (common:on-homehost?)
-;; (server:choose-server *toppath* 'home?))
-;;
-;; ;; kind start up of server, wait before allowing another server for a given
-;; ;; area to be launched
-;; ;;
-;; (define (server:kind-run areapath)
-;; ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
-;; ;; and wait for it to be at least seconds old
-;; ;; (server:wait-for-server-start-last-flag areapath)
-;; (let loop ()
-;; (if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2)
-;; (begin
-;; (if (common:low-noise-print 30 "our-host-load")
-;; (debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server."))
-;; (loop))))
-;; (if (< (server:choose-server areapath 'count) 20)
-;; (server:run areapath))
-;; #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
-;; (let* ((lock-file (conc areapath "/logs/server-start.lock")))
-;; (let* ((start-flag (conc areapath "/logs/server-start-last")))
-;; (common:simple-file-lock-and-wait lock-file expire-time: 25)
-;; (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag)
-;; (system (conc "touch " start-flag)) ;; lazy but safe
-;; (server:run areapath)
-;; (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
-;; (common:simple-file-release-lock lock-file)))
-;; (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")))
-;;
-;; ;; this one seems to be the general entry point
-;; ;;
-;; (define (server:start-and-wait areapath #!key (timeout 60))
-;; (let ((give-up-time (+ (current-seconds) timeout)))
-;; (let loop ((server-info (server:check-if-running areapath))
-;; (try-num 0))
-;; (if (or server-info
-;; (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available.
-;; (server:record->url server-info)
-;; (let* ( (servers (server:choose-server areapath 'all-valid))
-;; (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0)))
-;; (if (and (> try-num 0) ;; first time through simply wait a little while then try again
-;; (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one
-;; (server:run areapath))
-;; (thread-sleep! 5)
-;; (loop (server:check-if-running areapath)
-;; (+ try-num 1)))))))
-;;
-;; (define (server:get-num-servers #!key (numservers 2))
-;; (let ((ns (string->number
-;; (or (configf:lookup *configdat* "server" "numservers") "notanumber"))))
-;; (or ns numservers)))
-;;
-;; ;; no longer care if multiple servers are started by accident. older servers will drop off in time.
-;; ;;
-;; (define (server:check-if-running areapath) ;; #!key (numservers "2"))
-;; (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed
-;; (servers (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath))))
-;; (if (or (and servers
-;; (null? servers))
-;; (not servers))
-;; ;; (and (list? servers)
-;; ;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers
-;; #f
-;; (let loop ((hed (car servers))
-;; (tal (cdr servers)))
-;; (let ((res (server:check-server hed)))
-;; (if res
-;; hed
-;; (if (null? tal)
-;; #f
-;; (loop (car tal)(cdr tal)))))))))
-;;
-;; ;; ping the given server
-;; ;;
-;; (define (server:check-server server-record)
-;; (let* ((server-url (server:record->url server-record))
-;; (server-id (server:record->id server-record))
-;; (res (server:ping server-url server-id)))
-;; (if res
-;; server-url
-;; #f)))
-;;
-;; (define (server:kill servr)
-;; (handle-exceptions
-;; exn
-;; (begin
-;; (debug:print-info 0 *default-log-port* "Unable to get host and/or port from " servr ", exn=" exn)
-;; #f)
-;; (match-let (((mod-time hostname port start-time server-id pid)
-;; servr))
-;; (tasks:kill-server hostname pid))))
-;;
-;; ;; called in megatest.scm, host-port is string hostname:port
-;; ;;
-;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running
-;; ;; in the same process as the server.
-;; ;;
-;; (define (server:ping host:port server-id #!key (do-exit #f))
-;; (let* ((host-port (cond
-;; ((string? host:port)
-;; (let ((slst (string-split host:port ":")))
-;; (if (eq? (length slst) 2)
-;; (list (car slst)(string->number (cadr slst)))
-;; #f)))
-;; (else
-;; #f))))
-;; (cond
-;; ((and (list? host-port)
-;; (eq? (length host-port) 2))
-;; (let* ((myrunremote (make-remote))
-;; (iface (car host-port))
-;; (port (cadr host-port))
-;; (server-dat (client:connect iface port server-id myrunremote))
-;; (login-res (rmt:login-no-auto-client-setup myrunremote)))
-;; (if (and (list? login-res)
-;; (car login-res))
-;; (begin
-;; ;; (print "LOGIN_OK")
-;; (if do-exit (exit 0))
-;; #t)
-;; (begin
-;; ;; (print "LOGIN_FAILED")
-;; (if do-exit (exit 1))
-;; #f))))
-;; (else
-;; (if host:port
-;; (debug:print 0 *default-log-port* "ERROR: bad host:port "host:port))
-;; (if do-exit
-;; (exit 1)
-;; #f)))))
-;;
-;; ;; run ping in separate process, safest way in some cases
-;; ;;
-;; (define (server:ping-server ifaceport)
-;; (with-input-from-pipe
-;; (conc (common:get-megatest-exe) " -ping " ifaceport)
-;; (lambda ()
-;; (let loop ((inl (read-line))
-;; (res "NOREPLY"))
-;; (if (eof-object? inl)
-;; (case (string->symbol res)
-;; ((NOREPLY) #f)
-;; ((LOGIN_OK) #t)
-;; (else #f))
-;; (loop (read-line) inl))))))
-;;
-;; ;; NOT USED (well, ok, reference in rpc-transport but otherwise not used).
-;; ;;
-;; (define (server:login toppath)
-;; (lambda (toppath)
-;; (set! *db-last-access* (current-seconds)) ;; might not be needed.
-;; (if (equal? *toppath* toppath)
-;; #t
-;; #f)))
-;;
-;; ;; timeout is hms string: 1h 5m 3s, default is 1 minute
-;; ;; This is currently broken. Just use the number of hours with no unit.
-;; ;; Default is 60 seconds.
-;; ;;
-;; (define (server:expiration-timeout)
-;; (let ((tmo (configf:lookup *configdat* "server" "timeout")))
-;; (if (and (string? tmo)
-;; (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below
-;; (* 3600 (string->number tmo))
-;; 600)))
-;;
-;; (define (server:get-best-guess-address hostname)
-;; (let ((res #f))
-;; (for-each
-;; (lambda (adr)
-;; (if (not (eq? (u8vector-ref adr 0) 127))
-;; (set! res adr)))
-;; ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME
-;; (vector->list (hostinfo-addresses (hostname->hostinfo hostname))))
-;; (string-intersperse
-;; (map number->string
-;; (u8vector->list
-;; (if res res (hostname->ip hostname)))) ".")))
-;;
-;; ;; (define server:sync-lock-token "SERVER_SYNC_LOCK")
-;; ;; (define (server:release-sync-lock)
-;; ;; (db:no-sync-del! *no-sync-db* server:sync-lock-token))
-;; ;; (define (server:have-sync-lock?)
-;; ;; (let* ((have-lock-pair (db:no-sync-get-lock *no-sync-db* server:sync-lock-token))
-;; ;; (have-lock? (car have-lock-pair))
-;; ;; (lock-time (cdr have-lock-pair))
-;; ;; (lock-age (- (current-seconds) lock-time)))
+;; (define (get-client-server-id)
+;; (if *client-server-id* *client-server-id*
+;; (let ((sig (mk-signature))) ;; clients re-use the server:mk-signature logic
+;; (set! *client-server-id* sig)
+;; *client-server-id*)))
+
+;; if srvdat is #f calculate host.pid
+(define (get-host.pid srvdat)
+ (if srvdat
+ (conc (srv-host srvdat)"."(srv-pid srvdat))
+ (conc (get-host-name)"."(current-process-id))))
+
+;; ;; ;; When using zmq this would send the message back (two step process)
+;; ;; ;; with spiffy or rpc this simply returns the return data to be returned
+;; ;; ;;
+;; ;; (define (server:reply return-addr query-sig success/fail result)
+;; ;; (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result)
+;; ;; ;; (send-message pubsock target send-more: #t)
+;; ;; ;; (send-message pubsock
+;; ;; (case (server:get-transport)
+;; ;; ((rpc) (db:obj->string (vector success/fail query-sig result)))
+;; ;; ((http) (db:obj->string (vector success/fail query-sig result)))
+;; ;; ((fs) result)
+;; ;; (else
+;; ;; (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
+;; ;; result)))
+;; ;;
+;; ;; ;; Given an area path, start a server process ### NOTE ### > file 2>&1
+;; ;; ;; if the target-host is set
+;; ;; ;; try running on that host
+;; ;; ;; incidental: rotate logs in logs/ dir.
+;; ;; ;;
+;; ;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area
+;; ;; (let* ((testsuite (common:get-testsuite-name))
+;; ;; (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
+;; ;; (profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
+;; ;; ""))
+;; ;; (cmdln (conc (common:get-megatest-exe)
+;; ;; " -server - ";; (or target-host "-")
+;; ;; (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
+;; ;; " -daemonize "
+;; ;; "")
+;; ;; ;; " -log " logfile
+;; ;; " -m testsuite:" testsuite
+;; ;; " " profile-mode
+;; ;; )) ;; (conc " >> " logfile " 2>&1 &")))))
+;; ;; (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!?
+;; ;; (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0)))
+;; ;; ;; we want the remote server to start in *toppath* so push there
+;; ;; (push-directory areapath)
+;; ;; (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
+;; ;; (thread-start! log-rotate)
+;; ;;
+;; ;; ;; host.domain.tld match host?
+;; ;; ;; (if (and target-host
+;; ;; ;; ;; look at target host, is it host.domain.tld or ip address and does it
+;; ;; ;; ;; match current ip or hostname
+;; ;; ;; (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
+;; ;; ;; (not (equal? curr-ip target-host)))
+;; ;; ;; (begin
+;; ;; ;; (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
+;; ;; ;; (setenv "TARGETHOST" target-host)))
+;; ;; ;;
+;; ;; (setenv "TARGETHOST_LOGF" logfile)
+;; ;; (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time
+;; ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
+;; ;; (system (conc "nbfake " cmdln))
+;; ;; (unsetenv "TARGETHOST_LOGF")
+;; ;; ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
+;; ;; (thread-join! log-rotate)
+;; ;; (pop-directory)))
+;; ;;
+;; ;; ;; given a path to a server log return: host port startseconds server-id
+;; ;; ;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let
+;; ;; ;; example of what it's looking for in the log file:
+;; ;; ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4
+;; ;;
+;; ;; (define (server:logf-get-start-info logf)
+;; ;; (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id
+;; ;; (dbprep-rx (regexp "^SERVER: dbprep"))
+;; ;; (dbprep-found 0)
+;; ;; (bad-dat (list #f #f #f #f #f)))
+;; ;; (handle-exceptions
+;; ;; exn
+;; ;; (begin
+;; ;; ;; WARNING: this is potentially dangerous to blanket ignore the errors
+;; ;; (if (file-exists? logf)
+;; ;; (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn))
+;; ;; bad-dat) ;; no idea what went wrong, call it a bad server
+;; ;; (with-input-from-file
+;; ;; logf
+;; ;; (lambda ()
+;; ;; (let loop ((inl (read-line))
+;; ;; (lnum 0))
+;; ;; (if (not (eof-object? inl))
+;; ;; (let ((mlst (string-match server-rx inl))
+;; ;; (dbprep (string-match dbprep-rx inl)))
+;; ;; (if dbprep (set! dbprep-found 1))
+;; ;; (if (not mlst)
+;; ;; (if (< lnum 500) ;; give up if more than 500 lines of server log read
+;; ;; (loop (read-line)(+ lnum 1))
+;; ;; (begin
+;; ;; (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf )
+;; ;; bad-dat))
+;; ;; (match mlst
+;; ;; ((_ host port start server-id pid)
+;; ;; (list host
+;; ;; (string->number port)
+;; ;; (string->number start)
+;; ;; server-id
+;; ;; (string->number pid)))
+;; ;; (else
+;; ;; (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst)
+;; ;; bad-dat))))
+;; ;; (begin
+;; ;; (if dbprep-found
+;; ;; (begin
+;; ;; (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time))
+;; ;; (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting?
+;; ;; (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds))))
+;; ;; bad-dat))))))))
+;; ;;
+;; ;; ;; ;; get a list of servers from the log files, with all relevant data
+;; ;; ;; ;; ( mod-time host port start-time pid )
+;; ;; ;; ;;
+;; ;; ;; (define (server:get-list areapath #!key (limit #f))
+;; ;; ;; (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$"))
+;; ;; ;; (day-seconds (* 24 60 60)))
+;; ;; ;; ;; if the directory exists continue to get the list
+;; ;; ;; ;; otherwise attempt to create the logs dir and then
+;; ;; ;; ;; continue
+;; ;; ;; (if (if (directory-exists? (conc areapath "/logs"))
+;; ;; ;; '()
+;; ;; ;; (if (file-write-access? areapath)
+;; ;; ;; (begin
+;; ;; ;; (condition-case
+;; ;; ;; (create-directory (conc areapath "/logs") #t)
+;; ;; ;; (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
+;; ;; ;; (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn)))
+;; ;; ;; (directory-exists? (conc areapath "/logs")))
+;; ;; ;; '()))
+;; ;; ;;
+;; ;; ;; ;; Get the list of server logs.
+;; ;; ;; (let* (
+;; ;; ;; ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers.
+;; ;; ;; ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'")))
+;; ;; ;; (server-logs (glob (conc areapath "/logs/server-*-*.log")))
+;; ;; ;; (num-serv-logs (length server-logs)))
+;; ;; ;; (if (or (null? server-logs) (= num-serv-logs 0))
+;; ;; ;; (let ()
+;; ;; ;; (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time))
+;; ;; ;; '()
+;; ;; ;; )
+;; ;; ;; (let loop ((hed (string-chomp (car server-logs)))
+;; ;; ;; (tal (cdr server-logs))
+;; ;; ;; (res '()))
+;; ;; ;; (let* ((mod-time (handle-exceptions
+;; ;; ;; exn
+;; ;; ;; (begin
+;; ;; ;; (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn)
+;; ;; ;; (current-seconds)) ;; 0
+;; ;; ;; (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
+;; ;; ;; (down-time (- (current-seconds) mod-time))
+;; ;; ;; (serv-dat (if (or (< num-serv-logs 10)
+;; ;; ;; (< down-time 900)) ;; day-seconds))
+;; ;; ;; (server:logf-get-start-info hed)
+;; ;; ;; '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at
+;; ;; ;; (serv-rec (cons mod-time serv-dat))
+;; ;; ;; (fmatch (string-match fname-rx hed))
+;; ;; ;; (pid (if fmatch (string->number (list-ref fmatch 2)) #f))
+;; ;; ;; (new-res (if (null? serv-dat)
+;; ;; ;; res
+;; ;; ;; (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let
+;; ;; ;; (if (null? tal)
+;; ;; ;; (if (and limit
+;; ;; ;; (> (length new-res) limit))
+;; ;; ;; new-res ;; (take new-res limit) <= need intelligent sorting before this will work
+;; ;; ;; new-res)
+;; ;; ;; (loop (string-chomp (car tal)) (cdr tal) new-res)))))))))
+;; ;;
+;; ;; #;(define (server:get-num-alive srvlst)
+;; ;; (let ((num-alive 0))
+;; ;; (for-each
+;; ;; (lambda (server)
+;; ;; (handle-exceptions
+;; ;; exn
+;; ;; (begin
+;; ;; (debug:print-info 0 *default-log-port* "Unable to get server start-time and/or mod-time from " server ", exn=" exn))
+;; ;; (match-let (((mod-time host port start-time server-id pid)
+;; ;; server))
+;; ;; (let* ((uptime (- (current-seconds) mod-time))
+;; ;; (runtime (if start-time
+;; ;; (- mod-time start-time)
+;; ;; 0)))
+;; ;; (if (< uptime 5)(set! num-alive (+ num-alive 1)))))))
+;; ;; srvlst)
+;; ;; num-alive))
+;; ;;
+;; ;; ;; ;; given a list of servers get a list of valid servers, i.e. at least
+;; ;; ;; ;; 10 seconds old, has started and is less than 1 hour old and is
+;; ;; ;; ;; active (i.e. mod-time < 10 seconds
+;; ;; ;; ;;
+;; ;; ;; ;; mod-time host port start-time pid
+;; ;; ;; ;;
+;; ;; ;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off
+;; ;; ;; ;; and servers should stick around for about two hours or so.
+;; ;; ;; ;;
+;; ;; ;; (define (server:get-best srvlst)
+;; ;; ;; (let* ((nums (server:get-num-servers))
+;; ;; ;; (now (current-seconds))
+;; ;; ;; (slst (sort
+;; ;; ;; (filter (lambda (rec)
+;; ;; ;; (if (and (list? rec)
+;; ;; ;; (> (length rec) 2))
+;; ;; ;; (let ((start-time (list-ref rec 3))
+;; ;; ;; (mod-time (list-ref rec 0)))
+;; ;; ;; ;; (print "start-time: " start-time " mod-time: " mod-time)
+;; ;; ;; (and start-time mod-time
+;; ;; ;; (> (- now start-time) 0) ;; been running at least 0 seconds
+;; ;; ;; (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds
+;; ;; ;; (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set
+;; ;; ;; (< (- now start-time)
+;; ;; ;; (+ (- (string->number (configf:lookup *configdat* "server" "runtime"))
+;; ;; ;; 180)
+;; ;; ;; (random 360)))) ;; under one hour running time +/- 180
+;; ;; ;; ))
+;; ;; ;; #f))
+;; ;; ;; srvlst)
+;; ;; ;; (lambda (a b)
+;; ;; ;; (< (list-ref a 3)
+;; ;; ;; (list-ref b 3))))))
+;; ;; ;; (if (> (length slst) nums)
+;; ;; ;; (take slst nums)
+;; ;; ;; slst)))
+;; ;;
+;; ;; ;; ;; switch from server:get-list to server:get-servers-info
+;; ;; ;; ;;
+;; ;; ;; (define (server:get-first-best areapath)
+;; ;; ;; (let ((srvrs (server:get-best (server:get-list areapath))))
+;; ;; ;; (if (and srvrs
+;; ;; ;; (not (null? srvrs)))
+;; ;; ;; (car srvrs)
+;; ;; ;; #f)))
+;; ;; ;;
+;; ;; ;; (define (server:get-rand-best areapath)
+;; ;; ;; (let ((srvrs (server:get-best (server:get-list areapath))))
+;; ;; ;; (if (and (list? srvrs)
+;; ;; ;; (not (null? srvrs)))
+;; ;; ;; (let* ((len (length srvrs))
+;; ;; ;; (idx (random len)))
+;; ;; ;; (list-ref srvrs idx))
+;; ;; ;; #f)))
+;; ;;
+;; ;; (define (server:record->id servr)
+;; ;; (handle-exceptions
+;; ;; exn
+;; ;; (begin
+;; ;; (debug:print-info 0 *default-log-port* "Unable to get server id from " servr ", exn=" exn)
+;; ;; #f)
+;; ;; (match-let (((host port start-time server-id pid)
+;; ;; servr))
+;; ;; (if server-id
+;; ;; server-id
+;; ;; #f))))
+;; ;;
+;; ;; (define (server:record->url servr)
+;; ;; (handle-exceptions
+;; ;; exn
+;; ;; (begin
+;; ;; (debug:print-info 0 *default-log-port* "Unable to get server url from " servr ", exn=" exn)
+;; ;; #f)
+;; ;; (match-let (((host port start-time server-id pid)
+;; ;; servr))
+;; ;; (if (and host port)
+;; ;; (conc host ":" port)
+;; ;; #f))))
+;; ;;
+;; ;;
+;; ;; ;; if server-start-last exists, and wasn't old enough, wait + 1, then call this function recursively until it is old enough.
+;; ;; ;; if it is old enough, overwrite it and wait 0.25 seconds.
+;; ;; ;; if it then has the wrong server key, wait + 1 and call this function recursively.
+;; ;; ;;
+;; ;; #;(define (server:wait-for-server-start-last-flag areapath)
+;; ;; (let* ((start-flag (conc areapath "/logs/server-start-last"))
+;; ;; ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
+;; ;; (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4))
+;; ;; (server-key (conc (get-host-name) "-" (current-process-id))))
+;; ;; (if (file-exists? start-flag)
+;; ;; (let* ((fmodtime (file-modification-time start-flag))
+;; ;; (delta (- (current-seconds) fmodtime))
+;; ;; (old-enough (> delta idletime))
+;; ;; (new-server-key ""))
+;; ;; ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than seconds, and the new file still has the same server key as you just wrote, return #t.
+;; ;; ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process.
+;; ;; (if (and old-enough
+;; ;; (begin
+;; ;; (debug:print-info 2 *default-log-port* "Writing " start-flag)
+;; ;; (with-output-to-file start-flag (lambda () (print server-key)))
+;; ;; (thread-sleep! 0.25)
+;; ;; (set! new-server-key (with-input-from-file start-flag (lambda () (read-line))))
+;; ;; (equal? server-key new-server-key)))
+;; ;; #t
+;; ;; ;; If either of the above conditions is not true, print a "Gating server start" message, wait + 1, then call this function recursively.
+;; ;; (begin
+;; ;; (debug:print-info 0 *default-log-port* "Gating server start, last start: "
+;; ;; (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server"))
+;; ;;
+;; ;; (thread-sleep! ( + 1 idletime))
+;; ;; (server:wait-for-server-start-last-flag areapath)))))))
+;; ;;
+;; ;; ;; oldest server alive determines host then choose random of youngest
+;; ;; ;; five servers on that host
+;; ;; ;;
+;; ;; (define (server:get-servers-info areapath)
+;; ;; ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.")
+;; ;; (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo")))
+;; ;; (if (not (file-exists? servinfodir))
+;; ;; (create-directory servinfodir))
+;; ;; (let* ((allfiles (glob (conc servinfodir"/*")))
+;; ;; (res (make-hash-table)))
+;; ;; (for-each
+;; ;; (lambda (f)
+;; ;; (let* ((hostport (pathname-strip-directory f))
+;; ;; (serverdat (server:logf-get-start-info f)))
+;; ;; (match serverdat
+;; ;; ((host port start server-id pid)
+;; ;; (if (and host port start server-id pid)
+;; ;; (hash-table-set! res hostport serverdat)
+;; ;; (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat)))
+;; ;; (else
+;; ;; (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat)))))
+;; ;; allfiles)
+;; ;; res)))
+;; ;;
+;; ;; ;; check the .servinfo directory, are there other servers running on this
+;; ;; ;; or another host?
+;; ;; ;;
+;; ;; ;; returns #t => ok to start another server
+;; ;; ;; #f => not ok to start another server
+;; ;; ;;
+;; ;; (define (server:minimal-check areapath)
+;; ;; (server:clean-up-old areapath)
+;; ;; (let* ((srvdir (server:get-servinfo-dir areapath)) ;; (conc areapath"/.servinfo"))
+;; ;; (servrs (glob (conc srvdir"/*")))
+;; ;; (thishostip (server:get-best-guess-address (get-host-name)))
+;; ;; (thisservrs (glob (conc srvdir"/"thishostip":*")))
+;; ;; (homehostinf (server:choose-server areapath 'homehost))
+;; ;; (havehome (car homehostinf))
+;; ;; (wearehome (cdr homehostinf)))
+;; ;; (debug:print-info 0 *default-log-port* thishostip", have homehost: "havehome", we are homehost: "wearehome
+;; ;; ", numservers: "(length thisservrs))
+;; ;; (cond
+;; ;; ((not havehome) #t) ;; no homehost yet, go for it
+;; ;; ((and havehome wearehome (< (length thisservrs) 20)) #t) ;; we are home and less than 20 servers, ok to start another
+;; ;; ((and havehome (not wearehome)) #f) ;; we are not the home host
+;; ;; ((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running
+;; ;; (else
+;; ;; (debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs)
+;; ;; #t))))
+;; ;;
+;; ;;
+;; ;; (define server-last-start 0)
+;; ;;
+;; ;;
+;; ;; ;; oldest server alive determines host then choose random of youngest
+;; ;; ;; five servers on that host
+;; ;; ;;
+;; ;; ;; mode:
+;; ;; ;; best - get best server (random of newest five)
+;; ;; ;; home - get home host based on oldest server
+;; ;; ;; info - print info
+;; ;; (define (server:choose-server areapath #!optional (mode 'best))
+;; ;; ;; age is current-starttime
+;; ;; ;; find oldest alive
+;; ;; ;; 1. sort by age ascending and ping until good
+;; ;; ;; find alive rand from youngest
+;; ;; ;; 1. sort by age descending
+;; ;; ;; 2. take five
+;; ;; ;; 3. check alive, discard if not and repeat
+;; ;; ;; first we clean up old server files
+;; ;; (server:clean-up-old areapath)
+;; ;; (let* ((since-last (- (current-seconds) server-last-start))
+;; ;; (server-start-delay 10))
+;; ;; (if ( < (- (current-seconds) server-last-start) 10 )
+;; ;; (begin
+;; ;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start))
+;; ;; (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds")
+;; ;; (thread-sleep! server-start-delay)
+;; ;; )
+;; ;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start))
+;; ;; )
+;; ;; )
+;; ;; (let* ((serversdat (server:get-servers-info areapath))
+;; ;; (servkeys (hash-table-keys serversdat))
+;; ;; (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last
+;; ;; (sort servkeys ;; list of "host:port"
+;; ;; (lambda (a b)
+;; ;; (>= (list-ref (hash-table-ref serversdat a) 2)
+;; ;; (list-ref (hash-table-ref serversdat b) 2))))
+;; ;; '())))
+;; ;; (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat)
+;; ;; (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys)
+;; ;; (if (not (null? by-time-asc))
+;; ;; (let* ((oldest (last by-time-asc))
+;; ;; (oldest-dat (hash-table-ref serversdat oldest))
+;; ;; (host (list-ref oldest-dat 0))
+;; ;; (all-valid (filter (lambda (x)
+;; ;; (equal? host (list-ref (hash-table-ref serversdat x) 0)))
+;; ;; by-time-asc))
+;; ;; (best-ten (lambda ()
+;; ;; (if (> (length all-valid) 11)
+;; ;; (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out
+;; ;; (if (> (length all-valid) 8)
+;; ;; (drop-right all-valid 1)
+;; ;; all-valid))))
+;; ;; (names->dats (lambda (names)
+;; ;; (map (lambda (x)
+;; ;; (hash-table-ref serversdat x))
+;; ;; names)))
+;; ;; (am-home? (lambda ()
+;; ;; (let* ((currhost (get-host-name))
+;; ;; (bestadrs (server:get-best-guess-address currhost)))
+;; ;; (or (equal? host currhost)
+;; ;; (equal? host bestadrs))))))
+;; ;; (case mode
+;; ;; ((info)
+;; ;; (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid)
+;; ;; (print "youngest: "(hash-table-ref serversdat (car all-valid))))
+;; ;; ((home) host)
+;; ;; ((homehost) (cons host (am-home?))) ;; shut up old code
+;; ;; ((home?) (am-home?))
+;; ;; ((best-ten)(names->dats (best-ten)))
+;; ;; ((all-valid)(names->dats all-valid))
+;; ;; ((best) (let* ((best-ten (best-ten))
+;; ;; (len (length best-ten)))
+;; ;; (hash-table-ref serversdat (list-ref best-ten (random len)))))
+;; ;; ((count)(length all-valid))
+;; ;; (else
+;; ;; (debug:print 0 *default-log-port* "ERROR: invalid command "mode)
+;; ;; #f)))
+;; ;; (begin
+;; ;; (server:run areapath)
+;; ;; (set! server-last-start (current-seconds))
+;; ;; ;; (thread-sleep! 3)
+;; ;; (case mode
+;; ;; ((homehost) (cons #f #f))
+;; ;; (else #f))))))
+;; ;;
+;; ;; (define (server:get-servinfo-dir areapath)
+;; ;; (let* ((spath (conc areapath"/.servinfo")))
+;; ;; (if (not (file-exists? spath))
+;; ;; (create-directory spath #t))
+;; ;; spath))
+;; ;;
+;; ;; (define (server:clean-up-old areapath)
+;; ;; ;; any server file that has not been touched in ten minutes is effectively dead
+;; ;; (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*"))))
+;; ;; (for-each
+;; ;; (lambda (sfile)
+;; ;; (let* ((modtime (handle-exceptions
+;; ;; exn
+;; ;; (begin
+;; ;; (debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile)
+;; ;; (current-seconds))
+;; ;; (file-modification-time sfile))))
+;; ;; (if (and (number? modtime)
+;; ;; (> (- (current-seconds) modtime)
+;; ;; 600))
+;; ;; (begin
+;; ;; (debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.")
+;; ;; (handle-exceptions
+;; ;; exn
+;; ;; (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile)
+;; ;; (delete-file sfile))))))
+;; ;; sfiles)))
+;; ;;
+;; ;; ;; would like to eventually get rid of this
+;; ;; ;;
+;; ;; (define (common:on-homehost?)
+;; ;; (server:choose-server *toppath* 'home?))
+;; ;;
+;; ;; ;; kind start up of server, wait before allowing another server for a given
+;; ;; ;; area to be launched
+;; ;; ;;
+;; ;; (define (server:kind-run areapath)
+;; ;; ;; look for $MT_RUN_AREA_HOME/logs/server-start-last
+;; ;; ;; and wait for it to be at least seconds old
+;; ;; ;; (server:wait-for-server-start-last-flag areapath)
+;; ;; (let loop ()
+;; ;; (if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2)
+;; ;; (begin
+;; ;; (if (common:low-noise-print 30 "our-host-load")
+;; ;; (debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server."))
+;; ;; (loop))))
+;; ;; (if (< (server:choose-server areapath 'count) 20)
+;; ;; (server:run areapath))
+;; ;; #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
+;; ;; (let* ((lock-file (conc areapath "/logs/server-start.lock")))
+;; ;; (let* ((start-flag (conc areapath "/logs/server-start-last")))
+;; ;; (common:simple-file-lock-and-wait lock-file expire-time: 25)
+;; ;; (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag)
+;; ;; (system (conc "touch " start-flag)) ;; lazy but safe
+;; ;; (server:run areapath)
+;; ;; (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED".
+;; ;; (common:simple-file-release-lock lock-file)))
+;; ;; (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.")))
+;; ;;
+;; ;; ;; this one seems to be the general entry point
+;; ;; ;;
+;; ;; (define (server:start-and-wait areapath #!key (timeout 60))
+;; ;; (let ((give-up-time (+ (current-seconds) timeout)))
+;; ;; (let loop ((server-info (server:check-if-running areapath))
+;; ;; (try-num 0))
+;; ;; (if (or server-info
+;; ;; (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available.
+;; ;; (server:record->url server-info)
+;; ;; (let* ( (servers (server:choose-server areapath 'all-valid))
+;; ;; (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0)))
+;; ;; (if (and (> try-num 0) ;; first time through simply wait a little while then try again
+;; ;; (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one
+;; ;; (server:run areapath))
+;; ;; (thread-sleep! 5)
+;; ;; (loop (server:check-if-running areapath)
+;; ;; (+ try-num 1)))))))
+;; ;;
+;; ;; (define (server:get-num-servers #!key (numservers 2))
+;; ;; (let ((ns (string->number
+;; ;; (or (configf:lookup *configdat* "server" "numservers") "notanumber"))))
+;; ;; (or ns numservers)))
+;; ;;
+;; ;; ;; no longer care if multiple servers are started by accident. older servers will drop off in time.
+;; ;; ;;
+;; ;; (define (server:check-if-running areapath) ;; #!key (numservers "2"))
+;; ;; (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed
+;; ;; (servers (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath))))
+;; ;; (if (or (and servers
+;; ;; (null? servers))
+;; ;; (not servers))
+;; ;; ;; (and (list? servers)
+;; ;; ;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers
+;; ;; #f
+;; ;; (let loop ((hed (car servers))
+;; ;; (tal (cdr servers)))
+;; ;; (let ((res (server:check-server hed)))
+;; ;; (if res
+;; ;; hed
+;; ;; (if (null? tal)
+;; ;; #f
+;; ;; (loop (car tal)(cdr tal)))))))))
+;; ;;
+;; ;; ;; ping the given server
+;; ;; ;;
+;; ;; (define (server:check-server server-record)
+;; ;; (let* ((server-url (server:record->url server-record))
+;; ;; (server-id (server:record->id server-record))
+;; ;; (res (server:ping server-url server-id)))
+;; ;; (if res
+;; ;; server-url
+;; ;; #f)))
+;; ;;
+;; ;; (define (server:kill servr)
+;; ;; (handle-exceptions
+;; ;; exn
+;; ;; (begin
+;; ;; (debug:print-info 0 *default-log-port* "Unable to get host and/or port from " servr ", exn=" exn)
+;; ;; #f)
+;; ;; (match-let (((mod-time hostname port start-time server-id pid)
+;; ;; servr))
+;; ;; (tasks:kill-server hostname pid))))
+;; ;;
+;; ;; ;; called in megatest.scm, host-port is string hostname:port
+;; ;; ;;
+;; ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running
+;; ;; ;; in the same process as the server.
+;; ;; ;;
+;; ;; (define (server:ping host:port server-id #!key (do-exit #f))
+;; ;; (let* ((host-port (cond
+;; ;; ((string? host:port)
+;; ;; (let ((slst (string-split host:port ":")))
+;; ;; (if (eq? (length slst) 2)
+;; ;; (list (car slst)(string->number (cadr slst)))
+;; ;; #f)))
+;; ;; (else
+;; ;; #f))))
;; ;; (cond
-;; ;; (have-lock? #t)
-;; ;; ((>lock-age
-;; ;; (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180)))
-;; ;; (server:release-sync-lock)
-;; ;; (server:have-sync-lock?))
-;; ;; (else #f))))
-;;
-;; ;; moving this here as it needs access to db and cannot be in common.
-;; ;;
-;;
-;; (define (server:get-bruteforce-syncer dbstruct #!key (fork-to-background #f) (persist-until-sync #f))
-;; (debug:print "WARNING: bruteforce-syncer is called but has been disabled!")
-;; (lambda ()
-;; (debug:print "WARNING: bruteforce-syncer is called but has been disabled!"))
-;; #;(let* ((sqlite-exe (or (get-environment-variable "MT_SQLITE3_EXE"))) ;; defined in cfg.sh
-;; (sync-log (or (args:get-arg "-sync-log") (conc *toppath* "/logs/sync-" (current-process-id) "-" (get-host-name) ".log")))
-;; (tmp-area (common:get-db-tmp-area))
-;; (tmp-db (conc tmp-area "/megatest.db"))
-;; (staging-file (conc *toppath* "/.megatest.db"))
-;; (mtdbfile (conc *toppath* "/megatest.db"))
-;; (lockfile (common:get-sync-lock-filepath))
-;; (sync-cmd-core (conc sqlite-exe" " tmp-db " .dump | "sqlite-exe" " staging-file "&>"sync-log))
-;; (sync-cmd (if fork-to-background
-;; (conc "/usr/bin/env NBFAKE_LOG="*toppath*"/logs/last-server-sync-"(current-process-id)".log nbfake \""sync-cmd-core" && /bin/mv -f " staging-file " " mtdbfile" \"")
-;; sync-cmd-core))
-;; (default-min-intersync-delay 2)
-;; (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: default-min-intersync-delay))
-;; (default-duty-cycle 0.1)
-;; (duty-cycle (configf:lookup-number *configdat* "server" "sync-duty-cycle" default: default-duty-cycle))
-;; (last-sync-seconds 10) ;; we will adjust this to a measurement and delay last-sync-seconds * (1 - duty-cycle)
-;; (calculate-off-time (lambda (work-duration duty-cycle)
-;; (* (/ (- 1 duty-cycle) duty-cycle) last-sync-seconds)))
-;; (off-time min-intersync-delay) ;; adjusted in closure below.
-;; (do-a-sync
-;; (lambda ()
-;; (BB> "Start do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync)
-;; (let* ((finalres
-;; (let retry-loop ((num-tries 0))
-;; (if (common:simple-file-lock lockfile)
-;; (begin
-;; (cond
-;; ((not (or fork-to-background persist-until-sync))
-;; (debug:print 0 *default-log-port* "INFO: syncer thread sleeping for max of (server.minimum-intersync-delay="min-intersync-delay
-;; " , off-time="off-time" seconds ]")
-;; (thread-sleep! (max off-time min-intersync-delay)))
-;; (else
-;; (debug:print 0 *default-log-port* "INFO: syncer thread NOT sleeping ; maybe time-to-exit...")))
-;;
-;; (if (not (configf:lookup *configdat* "server" "disable-db-snapshot"))
-;; (common:snapshot-file mtdbfile subdir: ".db-snapshot"))
-;; (delete-file* staging-file)
-;; (let* ((start-time (current-milliseconds))
-;; (res (system sync-cmd))
-;; (dbbackupfile (conc mtdbfile ".backup"))
-;; (res2
-;; (cond
-;; ((eq? 0 res )
-;; (handle-exceptions
-;; exn
-;; #f
-;; (if (file-exists? dbbackupfile)
-;; (delete-file* dbbackupfile)
-;; )
-;; (if (eq? 0 (file-size sync-log))
-;; (delete-file* sync-log))
-;; (system (conc "/bin/mv " staging-file " " mtdbfile))
-;;
-;; (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000))
-;; (set! off-time (calculate-off-time
-;; last-sync-seconds
-;; (cond
-;; ((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1))
-;; duty-cycle)
-;; (else
-;; (debug:print 0 *default-log-port* "WARNING: ["(common:human-time)"] server.sync-duty-cycle is invalid. Should be a number between 0 and 1, but "duty-cycle" was specified. Using default value: "default-duty-cycle)
-;; default-duty-cycle))))
-;;
-;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec")
-;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec ; with duty-cycle of "duty-cycle" off time is now "off-time)
-;; 'sync-completed))
-;; (else
-;; (system (conc "/bin/cp "sync-log" "sync-log".fail"))
-;; (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail")
-;; (if (file-exists? (conc mtdbfile ".backup"))
-;; (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile)))
-;; #f))))
-;; (common:simple-file-release-lock lockfile)
-;; (BB> "released lockfile: " lockfile)
-;; (when (common:file-exists? lockfile)
-;; (BB> "DID NOT ACTUALLY RELEASE LOCKFILE"))
-;; res2) ;; end let
-;; );; end begin
-;; ;; else
-;; (cond
-;; (persist-until-sync
-;; (thread-sleep! 1)
-;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; we're in a fork-to-background so we need to succeed. Let's wait a jiffy and and try again. num-tries="num-tries" (waiting for lockfile="lockfile" to disappear)")
-;; (retry-loop (add1 num-tries)))
-;; (else
-;; (thread-sleep! (max off-time (+ last-sync-seconds min-intersync-delay)))
-;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; not syncing.")
-;; 'parallel-sync-in-progress))
-;; ) ;; end if got lockfile
-;; )
-;; ))
-;; (BB> "End do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync" and result="finalres)
-;; finalres)
-;; ) ;; end lambda
-;; ))
-;; do-a-sync))
-;;
-;;
+;; ;; ((and (list? host-port)
+;; ;; (eq? (length host-port) 2))
+;; ;; (let* ((myrunremote (make-remote))
+;; ;; (iface (car host-port))
+;; ;; (port (cadr host-port))
+;; ;; (server-dat (client:connect iface port server-id myrunremote))
+;; ;; (login-res (rmt:login-no-auto-client-setup myrunremote)))
+;; ;; (if (and (list? login-res)
+;; ;; (car login-res))
+;; ;; (begin
+;; ;; ;; (print "LOGIN_OK")
+;; ;; (if do-exit (exit 0))
+;; ;; #t)
+;; ;; (begin
+;; ;; ;; (print "LOGIN_FAILED")
+;; ;; (if do-exit (exit 1))
+;; ;; #f))))
+;; ;; (else
+;; ;; (if host:port
+;; ;; (debug:print 0 *default-log-port* "ERROR: bad host:port "host:port))
+;; ;; (if do-exit
+;; ;; (exit 1)
+;; ;; #f)))))
+;; ;;
+;; ;; ;; run ping in separate process, safest way in some cases
+;; ;; ;;
+;; ;; (define (server:ping-server ifaceport)
+;; ;; (with-input-from-pipe
+;; ;; (conc (common:get-megatest-exe) " -ping " ifaceport)
+;; ;; (lambda ()
+;; ;; (let loop ((inl (read-line))
+;; ;; (res "NOREPLY"))
+;; ;; (if (eof-object? inl)
+;; ;; (case (string->symbol res)
+;; ;; ((NOREPLY) #f)
+;; ;; ((LOGIN_OK) #t)
+;; ;; (else #f))
+;; ;; (loop (read-line) inl))))))
+;; ;;
+;; ;; ;; NOT USED (well, ok, reference in rpc-transport but otherwise not used).
+;; ;; ;;
+;; ;; (define (server:login toppath)
+;; ;; (lambda (toppath)
+;; ;; (set! *db-last-access* (current-seconds)) ;; might not be needed.
+;; ;; (if (equal? *toppath* toppath)
+;; ;; #t
+;; ;; #f)))
+;; ;;
+;; ;; ;; timeout is hms string: 1h 5m 3s, default is 1 minute
+;; ;; ;; This is currently broken. Just use the number of hours with no unit.
+;; ;; ;; Default is 60 seconds.
+;; ;; ;;
+;; ;; (define (server:expiration-timeout)
+;; ;; (let ((tmo (configf:lookup *configdat* "server" "timeout")))
+;; ;; (if (and (string? tmo)
+;; ;; (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below
+;; ;; (* 3600 (string->number tmo))
+;; ;; 600)))
+;; ;;
+;; ;; (define (server:get-best-guess-address hostname)
+;; ;; (let ((res #f))
+;; ;; (for-each
+;; ;; (lambda (adr)
+;; ;; (if (not (eq? (u8vector-ref adr 0) 127))
+;; ;; (set! res adr)))
+;; ;; ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME
+;; ;; (vector->list (hostinfo-addresses (hostname->hostinfo hostname))))
+;; ;; (string-intersperse
+;; ;; (map number->string
+;; ;; (u8vector->list
+;; ;; (if res res (hostname->ip hostname)))) ".")))
+;; ;;
+;; ;; ;; (define server:sync-lock-token "SERVER_SYNC_LOCK")
+;; ;; ;; (define (server:release-sync-lock)
+;; ;; ;; (db:no-sync-del! *no-sync-db* server:sync-lock-token))
+;; ;; ;; (define (server:have-sync-lock?)
+;; ;; ;; (let* ((have-lock-pair (db:no-sync-get-lock *no-sync-db* server:sync-lock-token))
+;; ;; ;; (have-lock? (car have-lock-pair))
+;; ;; ;; (lock-time (cdr have-lock-pair))
+;; ;; ;; (lock-age (- (current-seconds) lock-time)))
+;; ;; ;; (cond
+;; ;; ;; (have-lock? #t)
+;; ;; ;; ((>lock-age
+;; ;; ;; (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180)))
+;; ;; ;; (server:release-sync-lock)
+;; ;; ;; (server:have-sync-lock?))
+;; ;; ;; (else #f))))
+;; ;;
+;; ;; ;; moving this here as it needs access to db and cannot be in common.
+;; ;; ;;
+;; ;;
+;; ;; (define (server:get-bruteforce-syncer dbstruct #!key (fork-to-background #f) (persist-until-sync #f))
+;; ;; (debug:print "WARNING: bruteforce-syncer is called but has been disabled!")
+;; ;; (lambda ()
+;; ;; (debug:print "WARNING: bruteforce-syncer is called but has been disabled!"))
+;; ;; #;(let* ((sqlite-exe (or (get-environment-variable "MT_SQLITE3_EXE"))) ;; defined in cfg.sh
+;; ;; (sync-log (or (args:get-arg "-sync-log") (conc *toppath* "/logs/sync-" (current-process-id) "-" (get-host-name) ".log")))
+;; ;; (tmp-area (common:get-db-tmp-area))
+;; ;; (tmp-db (conc tmp-area "/megatest.db"))
+;; ;; (staging-file (conc *toppath* "/.megatest.db"))
+;; ;; (mtdbfile (conc *toppath* "/megatest.db"))
+;; ;; (lockfile (common:get-sync-lock-filepath))
+;; ;; (sync-cmd-core (conc sqlite-exe" " tmp-db " .dump | "sqlite-exe" " staging-file "&>"sync-log))
+;; ;; (sync-cmd (if fork-to-background
+;; ;; (conc "/usr/bin/env NBFAKE_LOG="*toppath*"/logs/last-server-sync-"(current-process-id)".log nbfake \""sync-cmd-core" && /bin/mv -f " staging-file " " mtdbfile" \"")
+;; ;; sync-cmd-core))
+;; ;; (default-min-intersync-delay 2)
+;; ;; (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: default-min-intersync-delay))
+;; ;; (default-duty-cycle 0.1)
+;; ;; (duty-cycle (configf:lookup-number *configdat* "server" "sync-duty-cycle" default: default-duty-cycle))
+;; ;; (last-sync-seconds 10) ;; we will adjust this to a measurement and delay last-sync-seconds * (1 - duty-cycle)
+;; ;; (calculate-off-time (lambda (work-duration duty-cycle)
+;; ;; (* (/ (- 1 duty-cycle) duty-cycle) last-sync-seconds)))
+;; ;; (off-time min-intersync-delay) ;; adjusted in closure below.
+;; ;; (do-a-sync
+;; ;; (lambda ()
+;; ;; (BB> "Start do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync)
+;; ;; (let* ((finalres
+;; ;; (let retry-loop ((num-tries 0))
+;; ;; (if (common:simple-file-lock lockfile)
+;; ;; (begin
+;; ;; (cond
+;; ;; ((not (or fork-to-background persist-until-sync))
+;; ;; (debug:print 0 *default-log-port* "INFO: syncer thread sleeping for max of (server.minimum-intersync-delay="min-intersync-delay
+;; ;; " , off-time="off-time" seconds ]")
+;; ;; (thread-sleep! (max off-time min-intersync-delay)))
+;; ;; (else
+;; ;; (debug:print 0 *default-log-port* "INFO: syncer thread NOT sleeping ; maybe time-to-exit...")))
+;; ;;
+;; ;; (if (not (configf:lookup *configdat* "server" "disable-db-snapshot"))
+;; ;; (common:snapshot-file mtdbfile subdir: ".db-snapshot"))
+;; ;; (delete-file* staging-file)
+;; ;; (let* ((start-time (current-milliseconds))
+;; ;; (res (system sync-cmd))
+;; ;; (dbbackupfile (conc mtdbfile ".backup"))
+;; ;; (res2
+;; ;; (cond
+;; ;; ((eq? 0 res )
+;; ;; (handle-exceptions
+;; ;; exn
+;; ;; #f
+;; ;; (if (file-exists? dbbackupfile)
+;; ;; (delete-file* dbbackupfile)
+;; ;; )
+;; ;; (if (eq? 0 (file-size sync-log))
+;; ;; (delete-file* sync-log))
+;; ;; (system (conc "/bin/mv " staging-file " " mtdbfile))
+;; ;;
+;; ;; (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000))
+;; ;; (set! off-time (calculate-off-time
+;; ;; last-sync-seconds
+;; ;; (cond
+;; ;; ((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1))
+;; ;; duty-cycle)
+;; ;; (else
+;; ;; (debug:print 0 *default-log-port* "WARNING: ["(common:human-time)"] server.sync-duty-cycle is invalid. Should be a number between 0 and 1, but "duty-cycle" was specified. Using default value: "default-duty-cycle)
+;; ;; default-duty-cycle))))
+;; ;;
+;; ;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec")
+;; ;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec ; with duty-cycle of "duty-cycle" off time is now "off-time)
+;; ;; 'sync-completed))
+;; ;; (else
+;; ;; (system (conc "/bin/cp "sync-log" "sync-log".fail"))
+;; ;; (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail")
+;; ;; (if (file-exists? (conc mtdbfile ".backup"))
+;; ;; (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile)))
+;; ;; #f))))
+;; ;; (common:simple-file-release-lock lockfile)
+;; ;; (BB> "released lockfile: " lockfile)
+;; ;; (when (common:file-exists? lockfile)
+;; ;; (BB> "DID NOT ACTUALLY RELEASE LOCKFILE"))
+;; ;; res2) ;; end let
+;; ;; );; end begin
+;; ;; ;; else
+;; ;; (cond
+;; ;; (persist-until-sync
+;; ;; (thread-sleep! 1)
+;; ;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; we're in a fork-to-background so we need to succeed. Let's wait a jiffy and and try again. num-tries="num-tries" (waiting for lockfile="lockfile" to disappear)")
+;; ;; (retry-loop (add1 num-tries)))
+;; ;; (else
+;; ;; (thread-sleep! (max off-time (+ last-sync-seconds min-intersync-delay)))
+;; ;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; not syncing.")
+;; ;; 'parallel-sync-in-progress))
+;; ;; ) ;; end if got lockfile
+;; ;; )
+;; ;; ))
+;; ;; (BB> "End do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync" and result="finalres)
+;; ;; finalres)
+;; ;; ) ;; end lambda
+;; ;; ))
+;; ;; do-a-sync))
+;; ;;
+;; ;;
)