Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -31,22 +31,24 @@ (declare (uses commonmod)) (import commonmod) (include "common_records.scm") -(define (remove-server-files directory-path) - (let ((files (glob (string-append directory-path "/server*")))) +(define (remove-files filespec) + (let ((files (glob filespec))) (for-each delete-file* files))) (define (stop-the-train) (thread-start! (make-thread (lambda () (let loop () (if (and *toppath* (file-exists? (conc *toppath*"/stop-the-train"))) (begin (debug:print 0 *default-log-port* "ERROR: found file "*toppath*"/stop-the-train, exiting immediately") - (remove-server-files (conc *toppath* "/logs")) + (remove-files (conc *toppath* "/logs/server*")) + (remove-files (conc *toppath* "/.servinfo/*")) + (remove-files (conc *toppath* "/.mtdb/*lock")) (exit 1))) (thread-sleep! 5) (loop)))))) ;; execute thunk, return value. If exception thrown, trap exception, return #f, and emit nonfatal condition note to *default-log-port* . @@ -681,10 +683,16 @@ "")))) (define (common:alist-ref/default key alist default) (or (alist-ref key alist) default)) + +;; The `common:low-noise-print` function is a utility that can be used to throttle the +;; frequency of certain operations. It does this by tracking the last time an operation was +;; performed and only allowing it again after a specified interval (`waitval`). This can be useful +;; for reducing noise in logs or limiting the rate of user notifications, among other use cases. + (define (common:low-noise-print waitval . keys) (let* ((key (string-intersperse (map conc keys) "-" )) (lasttime (hash-table-ref/default *common:denoise* key 0)) (currtime (current-seconds))) (if (> (- currtime lasttime) waitval) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -77,11 +77,10 @@ (if ipstr ipstr hostn))) ;; hostname))) (start-port (portlogger:open-run-close portlogger:find-port)) (link-tree-path (common:get-linktree)) (tmp-area (common:get-db-tmp-area)) (start-file (conc tmp-area "/.server-start"))) - (debug:print-info 0 *default-log-port* "portlogger recommended port: " start-port) ;; set some parameters for the server (root-path (if link-tree-path link-tree-path (current-directory))) ;; WARNING: SECURITY HOLE. FIX ASAP! (handle-directory spiffy-directory-listing) @@ -464,10 +463,13 @@ (handle-exceptions exn (debug:print 0 *default-log-port* "Failed to create " started-file ", exn=" exn) (with-output-to-file started-file (lambda ()(print (current-process-id))))) + (debug:print 0 *default-log-port* "Creating servinfo file for " (get-host-name) ":" (cadr *server-info*)) + (http:create-server-registration-file *toppath* (get-host-name) (cadr *server-info*)) + (let loop ((count 0) (server-state 'available) (bad-sync-count 0) (start-time (current-milliseconds))) @@ -535,27 +537,16 @@ (set! *server-id* (server:mk-signature))) (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv)) (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*) (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") - (begin + (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) - (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600))) - (cond - #;((and *server-run* - (> (- (current-seconds) server-start-time) 420)) ;; let's try server replacement - ;; ((adj-proc-load . 0.056875) (adj-core-load . 0.11375) (1m-load . 0.91) (5m-load . 0.77) (15m-load . 1.0) (proc . 16) (core . 8) (phys . 1)) - (let* ((loaddat (common:get-normalized-cpu-load #f)) - (adj-proc-load (alist-ref 'adj-proc-load loaddat)) - (adj-core-load (alist-ref 'adj-core-load loaddat)) - (adj-load (max adj-proc-load adj-core-load))) - (if (< adj-load 2) ;; reduce chance of runaway - (server:run *toppath*)) - (db:all-db-sync *dbstruct-dbs*) - (thread-sleep! 30) - (http-transport:server-shutdown port))) + + (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600))) + (cond ((and *server-run* (> (+ last-access server-timeout) (current-seconds))) (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) @@ -655,16 +646,35 @@ )) "Server run")) (th3 (make-thread (lambda () (debug:print-info 0 *default-log-port* "Server monitor thread started") (http-transport:keep-running) "Keep running")))) + (thread-start! th2) - (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. + (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. (thread-start! th3) (set! *didsomething* #t) (thread-join! th2) (exit)))) + +;; return servid +;; side-effects: +;; ttdat-cleanup-proc is populated with function to remove the serverinfo file +(define (http:create-server-registration-file areapath host port) + (let* ( + (servdir (server:get-servinfo-dir areapath)) + (servinf (conc servdir"/"host":"port"-"(current-process-id))) + (serv-id (server:mk-signature)) + (clean-proc (lambda () + (delete-file* servinf) + ))) + (assert (and host port) "FATAL: tt:create-server-registration-file called with no conn") + (with-output-to-file servinf + (lambda () + (print "SERVER STARTED: "host":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)))) + serv-id)) + ;; (define (http-transport:server-signal-handler signum) ;; (signal-mask! signum) ;; (handle-exceptions ;; exn Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -959,47 +959,116 @@ (if (args:get-arg "-adjutant") (begin (adjutant-run) (set! *didsomething* #t))) -(if (or (args:get-arg "-list-servers") - (args:get-arg "-kill-servers")) - (let ((tl (launch:setup))) - (if tl ;; all roads from here exit - (let* ((servers (server:get-list *toppath*)) - (fmtstr "~33a~22a~20a~20a~8a\n")) - (format #t fmtstr "ID" "host:port" "age (hms)" "Last mod" "State") - (format #t fmtstr "==" "=========" "=========" "========" "=====") - (for-each ;; ( mod-time host port start-time pid ) - (lambda (server) - (let* ((mtm (any->number (car server))) - (mod (if mtm (- (current-seconds) mtm) "unk")) - (age (- (current-seconds)(or (any->number (list-ref server 3)) (current-seconds)))) - (url (conc (cadr server) ":" (caddr server))) - (pid (list-ref server 4)) - (alv (if (number? mod)(< mod 10) #f))) - (format #t - fmtstr - pid - url - (seconds->hr-min-sec age) - (seconds->hr-min-sec mod) - (if alv "alive" "dead")) - (if (and alv - (args:get-arg "-kill-servers")) - (begin - (debug:print-info 0 *default-log-port* "Attempting to kill server with pid " pid) - (server:kill server))))) - (sort servers (lambda (a b) - (let ((ma (or (any->number (car a)) 9e9)) - (mb (or (any->number (car b)) 9e9))) - (> ma mb))))) - ;; (debug:print-info 1 *default-log-port* "Done with listservers") - (set! *didsomething* #t) - (exit)) - (exit)))) - ;; must do, would have to add checks to many/all calls below +(if (args:get-arg "-list-servers") + (let* ((tl (launch:setup)) ;; need this to initialize *toppath* + (servdir (server:get-servinfo-dir *toppath*)) + (servfiles (glob (conc servdir "/*:*.db"))) + (fmtstr "~10a~22a~10a~25a~25a~8a\n") + (dbfiles (append (glob (conc *toppath* "/.mtdb/main.db")) (glob (conc *toppath* "/.mtdb/?.db"))(glob (conc *toppath* "/.mtdb/??.db")))) + ) + (format #t fmtstr "DB" "host:port" "PID" "age" "last mod" "state") + (for-each + (lambda (dbfile) + (let* ( + (dbfname (conc (pathname-file dbfile) ".db")) + ) + (let ( + (sinfos (server:get-server-info-sorted *toppath* dbfname)) + ) + (for-each + (lambda (sinfo) + (let* ( + (db (list-ref sinfo 5)) + (pid (list-ref sinfo 4)) + (host (list-ref sinfo 0)) + (port (list-ref sinfo 1)) + (server-id (list-ref sinfo 3)) + (age (seconds->hr-min-sec (- (current-seconds) (list-ref sinfo 2)))) + (last-mod (seconds->string (list-ref sinfo 2))) + (status (system (conc "ssh " host " ps " pid " > /dev/null"))) + (state (if (> status 0) + "dead" + (tt:ping host port server-id 0) + )) + ) + (format #t fmtstr db (conc host ":" port) pid age last-mod state) + ) + ) + sinfos + ) + ) + ) + ) + dbfiles + ) + (set! *didsomething* #t) + (exit) + ) +) + + +(if (args:get-arg "-kill-servers") + + (let* ((tl (launch:setup)) ;; need this to initialize *toppath* + (servdir (server:get-servinfo-dir *toppath*)) + (servfiles (glob (conc servdir "/*:*.db"))) + (fmtstr "~10a~22a~10a~25a~25a~8a\n") + (dbfiles (append (glob (conc *toppath* "/.mtdb/main.db")) (glob (conc *toppath* "/.mtdb/?.db"))(glob (conc *toppath* "/.mtdb/??.db")))) + ) + (format #t fmtstr "DB" "host:port" "PID" "age" "last mod" "state") + (for-each + (lambda (dbfile) + (let* ( + (dbfname (conc (pathname-file dbfile) ".db")) + (sfiles (server:find-server *toppath* dbfname)) + ) + (for-each + (lambda (sfile) + (let ( + (sinfos (servert:get-server-info-sorted *toppath* dbfname)) + ) + (for-each + (lambda (sinfo) + (let* ( + (db (list-ref sinfo 5)) + (pid (list-ref sinfo 4)) + (host (list-ref sinfo 0)) + (port (list-ref sinfo 1)) + (server-id (list-ref sinfo 3)) + (age (seconds->hr-min-sec (- (current-seconds) (list-ref sinfo 2)))) + (last-mod (seconds->string (list-ref sinfo 2))) + (killed (system (conc "ssh " host " kill " pid " > /dev/null"))) + (dummy2 (sleep 1)) + (state (if (> (system (conc "ssh " host " ps " pid " > /dev/null")) 0) "dead" "alive")) + ) + (format #t fmtstr db (conc host ":" port) pid age last-mod state) + (delete-file* sfile) + ) + ) + sinfos + ) + ) + ) + sfiles + ) + ) + ) + dbfiles + ) + ;; remove this db, because otherwise metadata contains records for old servers, and this causes a problem with db:no-sync-get-lock-with-id. + (if (file-exists? (conc *toppath* "/.mtdb/no-sync.db")) + (delete-file (conc *toppath* "/.mtdb/no-sync.db")) + ) + (set! *didsomething* #t) + (exit) + ) +) + + ;;====================================================================== ;; Weird special calls that need to run *after* the server has started? ;;====================================================================== Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -418,10 +418,252 @@ (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server")) (thread-sleep! ( + 1 idletime)) (server:wait-for-server-start-last-flag areapath))))))) +(define (server:get-servinfo-dir areapath) + (let* ((spath (conc areapath"/.servinfo"))) + (if (not (file-exists? spath)) + (create-directory spath #t)) + spath)) + +;; gets server info and appends path to server file +;; sorts by age, oldest first +;; +;; returns list of (host port startseconds server-id servinfofile) +;; +(define (server:get-server-info-sorted areapath dbfname) + (let* ( + (sfiles (server:find-server areapath dbfname)) + (sdats (filter car (map server:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read + (sorted (sort sdats (lambda (a b) + (let* ((starta (list-ref a 2)) + (startb (list-ref b 2))) + (if (eq? starta startb) + (string>? (list-ref a 3)(list-ref b 3)) ;; if servers started at same time look at server-id + (< starta startb)))))) + (count 0)) + (for-each + (lambda (rec) + (if (or (> (length sorted) 1) + (common:low-noise-print 120 "server info sorted")) + (debug:print 2 *default-log-port* "SERVER #"count": "(string-intersperse (map conc sorted) ", "))) + (set! count (+ count 1))) + sorted) + sorted)) + +(define (server:clean-up-old areapath) + ;; any server file that has not been touched in ten minutes is effectively dead + (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*")))) + (for-each + (lambda (sfile) + (let* ((modtime (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile) + (current-seconds)) + (file-modification-time sfile)))) + (if (and (number? modtime) + (> (- (current-seconds) modtime) + 600)) + (begin + (debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.") + (handle-exceptions + exn + (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile) + (delete-file sfile)))))) + sfiles))) + + + +(define server-last-start 0) + +;; oldest server alive determines host then choose random of youngest +;; five servers on that host +;; +;; mode: +;; best - get best server (random of newest five) +;; home - get home host based on oldest server +;; info - print info +(define (server:choose-server areapath #!optional (mode 'best)) + ;; age is current-starttime + ;; find oldest alive + ;; 1. sort by age ascending and ping until good + ;; find alive rand from youngest + ;; 1. sort by age descending + ;; 2. take five + ;; 3. check alive, discard if not and repeat + ;; first we clean up old server files + (assert (eq? (rmt:transport-mode) 'http) "FATAL: server:run called with rmt:transport-mode="(rmt:transport-mode)) + (server:clean-up-old areapath) + (let* ((since-last (- (current-seconds) server-last-start)) + (server-start-delay 10)) + (if ( < (- (current-seconds) server-last-start) 10 ) + (begin + (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) + (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds") + (thread-sleep! server-start-delay) + ) + (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) + ) + ) + (let* ((serversdat (server:get-servers-info areapath)) + (servkeys (hash-table-keys serversdat)) + (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last + (sort servkeys ;; list of "host:port" + (lambda (a b) + (>= (list-ref (hash-table-ref serversdat a) 2) + (list-ref (hash-table-ref serversdat b) 2)))) + '()))) + (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat) + (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys) + (if (not (null? by-time-asc)) + (let* ((oldest (last by-time-asc)) + (oldest-dat (hash-table-ref serversdat oldest)) + (host (list-ref oldest-dat 0)) + (all-valid (filter (lambda (x) + (equal? host (list-ref (hash-table-ref serversdat x) 0))) + by-time-asc)) + (best-ten (lambda () + (if (> (length all-valid) 11) + (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out + (if (> (length all-valid) 8) + (drop-right all-valid 1) + all-valid)))) + (names->dats (lambda (names) + (map (lambda (x) + (hash-table-ref serversdat x)) + names))) + (am-home? (lambda () + (let* ((currhost (get-host-name)) + (bestadrs (server:get-best-guess-address currhost))) + (or (equal? host currhost) + (equal? host bestadrs)))))) + (case mode + ((info) + (debug:print 0 *default-log-port* "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid) + (debug:print 0 *default-log-port* "youngest: "(hash-table-ref serversdat (car all-valid)))) + ((home) host) + ((homehost) (cons host (am-home?))) ;; shut up old code + ((home?) (am-home?)) + ((best-ten)(names->dats (best-ten))) + ((all-valid)(names->dats all-valid)) + ((best) (let* ((best-ten (best-ten)) + (len (length best-ten))) + (hash-table-ref serversdat (list-ref best-ten (random len))))) + ((count)(length all-valid)) + (else + (debug:print 0 *default-log-port* "ERROR: invalid command "mode) + #f))) + (begin + (server:run areapath) + (set! server-last-start (current-seconds)) + ;; (thread-sleep! 3) + (case mode + ((homehost) (cons #f #f)) + (else #f)))))) + + + + +;; oldest server alive determines host then choose random of youngest +;; five servers on that host +;; +(define (server:get-servers-info areapath) + ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.") + (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo"))) + (if (not (file-exists? servinfodir)) + (create-directory servinfodir)) + (let* ((allfiles (glob (conc servinfodir"/*"))) + (res (make-hash-table))) + (for-each + (lambda (f) + (let* ((hostport (pathname-strip-directory f)) + (serverdat (server:logf-get-start-info f))) + (match serverdat + ((host port start server-id pid) + (if (and host port start server-id pid) + (hash-table-set! res hostport serverdat) + (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))) + (else + (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))))) + allfiles) + res))) + + + +;; given a path to a server info file return: host port startseconds server-id pid dbfname logf +;; example of what it's looking for in the log file: +;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 +;; +(define (server:server-get-info logf) + (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id + (bad-dat (list #f #f #f #f #f #f logf))) + (let ((fdat (handle-exceptions + exn + (begin + ;; WARNING: this is potentially dangerous to blanket ignore the errors + (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn="(condition->list exn)) + '()) ;; no idea what went wrong, call it a bad server, return empty list + (with-input-from-file logf read-lines)))) + (if (null? fdat) ;; bad data, return bad-dat + bad-dat + (let loop ((inl (car fdat)) + (tail (cdr fdat)) + (lnum 0)) + (let ((mlst (string-match server-rx inl))) + (if (not mlst) + (if (> lnum 500) ;; give up if more than 500 lines of server log read + bad-dat + (if (null? tail) + bad-dat + (loop (car tail)(cdr tail)(+ lnum 1)))) + (match mlst ;; have a not null list + ((_ host port start server-id pid dbfname) + (list host + (string->number port) + (string->number start) + server-id + (string->number pid) + dbfname + logf)) + (else + (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst) + bad-dat))))))))) + + + +;; find valid server +;; get servers listed, last part of name must match : +;; if more than one, wait one second and look again +;; future: ping oldest, if alive remove other : files +;; +(define (server:find-server areapath dbfname) + (let* ((servdir (server:get-servinfo-dir areapath)) + (sfiles (glob (conc servdir"/*:"dbfname))) + (good-files '())) + (for-each + (lambda (sfile) + (let* ((sinfo (tt:server-get-info sfile)) + (host (list-ref sinfo 0)) + (port (list-ref sinfo 1)) + (server-id (list-ref sinfo 3)) + (pid (list-ref sinfo 4)) + (status (system (conc "ssh " host " ps " pid " > /dev/null"))) + ) + (if (= status 0) + (set! good-files (cons sfile good-files)) + (delete-file* sfile) + ) + ) + ) + sfiles + ) + (debug:print-info 2 *default-log-port* "server:find-server: good-files: " good-files " sfiles: " sfiles) + good-files)) + + ;; kind start up of server, wait before allowing another server for a given ;; area to be launched ;; @@ -437,10 +679,31 @@ (system (conc "touch " start-flag)) ;; lazy but safe (server:run areapath) (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED". (common:simple-file-release-lock lock-file))) (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another."))) + +;; return servid +;; side-effects: +;; ttdat-cleanup-proc is populated with function to remove the serverinfo file +(define (server:create-server-registration-file areapath host port) + (let* ( + (servdir (server:get-servinfo-dir areapath)) + (servinf (conc servdir"/"host":"port"-"(current-process-id))) + (serv-id (server:mk-signature areapath)) + (clean-proc (lambda () + (delete-file* servinf) + ))) + (assert (and host port) "FATAL: tt:create-server-registration-file called with no conn") + (tt-cleanup-proc-set! ttdat clean-proc) + (tt-servinf-file-set! ttdat servinf) + (with-output-to-file servinf + (lambda () + (print "SERVER STARTED: "host":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)))) + serv-id)) + + ;; this one seems to be the general entry point ;; (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout)))