;; Copyright 2006-2017, Matthew Welland.
;;
;; This file is part of Megatest.
;;
;; Megatest is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;;
;; Megatest is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with Megatest. If not, see <http://www.gnu.org/licenses/>.
;;
(require-extension (srfi 18) extras tcp s11n)
(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest
directory-utils posix-extras matchable)
(use spiffy uri-common intarweb http-client spiffy-request-vars)
(declare (unit server))
(declare (uses common))
(declare (uses db))
(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running.
;; (declare (uses synchash))
(declare (uses http-transport))
;;(declare (uses rpc-transport))
(declare (uses launch))
;; (declare (uses daemon))
(declare (uses commonmod))
(import commonmod)
(declare (uses dbmod))
(import dbmod)
(declare (uses configfmod))
(import configfmod)
(declare (uses servermod))
(import servermod)
(include "common_records.scm")
(include "db_records.scm")
(define *server-loop-heart-beat* (current-seconds))
;;======================================================================
;; P K T S S T U F F
;;======================================================================
;; ???
;;======================================================================
;; P K T S S T U F F
;;======================================================================
;; ???
;;======================================================================
;; S E R V E R
;;======================================================================
;; Call this to start the actual server
;;
;; all routes though here end in exit ...
;;
;; start_server
;;
(define (server:launch run-id transport-type)
(case transport-type
((http)(http-transport:launch))
;;((nmsg)(nmsg-transport:launch run-id))
;;((rpc) (rpc-transport:launch run-id))
(else (debug:print-error 0 *default-log-port* "unknown server type " transport-type))))
;; Given a run id start a server process ### NOTE ### > file 2>&1
;; if the run-id is zero and the target-host is set
;; try running on that host
;; incidental: rotate logs in logs/ dir.
;;
(define (server:run areapath) ;; areapath is *toppath* for a given testsuite area
(let* ((curr-host (get-host-name))
;; (attempt-in-progress (server:start-attempted? areapath))
;; (dot-server-url (server:check-if-running areapath))
(curr-ip (server:get-best-guess-address curr-host))
(curr-pid (current-process-id))
(homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
(target-host (car homehost))
(testsuite (common:get-testsuite-name))
(logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
(profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
""))
(cmdln (conc (common:get-megatest-exe)
" -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
" -daemonize "
"")
;; " -log " logfile
" -m testsuite:" testsuite
" " profile-mode
)) ;; (conc " >> " logfile " 2>&1 &")))))
(log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!?
(load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0)))
;; we want the remote server to start in *toppath* so push there
(push-directory areapath)
(debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
(thread-start! log-rotate)
;; host.domain.tld match host?
(if (and target-host
;; look at target host, is it host.domain.tld or ip address and does it
;; match current ip or hostname
(not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
(not (equal? curr-ip target-host)))
(begin
(debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
(setenv "TARGETHOST" target-host)))
(setenv "TARGETHOST_LOGF" logfile)
(thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
#;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit))
(system (conc "nbfake " cmdln))
(unsetenv "TARGETHOST_LOGF")
(if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
(thread-join! log-rotate)
(pop-directory)))
(define (server:ping host-port-in server-id #!key (do-exit #f))
(let ((host:port (if (not host-port-in) ;; use read-dotserver to find
#f ;; (server:check-if-running *toppath*)
;; (if (number? host-port-in) ;; we were handed a server-id
;; (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in)))
;; ;; (print "srec: " srec " host-port-in: " host-port-in)
;; (if srec
;; (conc (vector-ref srec 3) ":" (vector-ref srec 4))
;; (conc "no such server-id " host-port-in)))
host-port-in))) ;; )
(let* ((host-port (if host:port
(let ((slst (string-split host:port ":")))
(if (eq? (length slst) 2)
(list (car slst)(string->number (cadr slst)))
#f))
#f)))
;; (toppath (launch:setup)))
;; (print "host-port=" host-port)
(if (not host-port)
(begin
(if host-port-in
(debug:print 0 *default-log-port* "ERROR: bad host:port"))
(if do-exit (exit 1))
#f)
(let* ((iface (car host-port))
(port (cadr host-port))
(server-dat (http-transport:client-connect iface port server-id))
(login-res (rmt:login-no-auto-client-setup server-dat)))
(if (and (list? login-res)
(car login-res))
(begin
;; (print "LOGIN_OK")
(if do-exit (exit 0))
#t)
(begin
;; (print "LOGIN_FAILED")
(if do-exit (exit 1))
#f)))))))
;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
;;
(define (server:kind-run areapath)
;; look for $MT_RUN_AREA_HOME/logs/server-start-last
;; and wait for it to be at least 3 seconds old
(server:wait-for-server-start-last-flag areapath)
(if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
(let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
(call-num (car last-run-dat))
(when-run (cadr last-run-dat))
(run-delay (+ (case call-num
((0) 0)
((1) 20)
((2) 300)
(else 600))
(random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
(lock-file (conc areapath "/logs/server-start.lock")))
(if (> (- (current-seconds) when-run) run-delay)
(let* ((start-flag (conc areapath "/logs/server-start-last")))
(common:simple-file-lock-and-wait lock-file expire-time: 15)
(system (conc "touch " start-flag)) ;; lazy but safe
(server:run areapath)
(thread-sleep! 2) ;; don't release the lock for at least a few seconds
(common:simple-file-release-lock lock-file)))
(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))
;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))
(let ((give-up-time (+ (current-seconds) timeout)))
(let loop ((server-info (server:check-if-running areapath))
(try-num 0))
(if (or server-info
(> (current-seconds) give-up-time)) ;; server-url will be #f if no server available.
(server:record->url server-info)
(let ((num-ok (length (server:get-best (server:get-list areapath)))))
(if (and (> try-num 0) ;; first time through simply wait a little while then try again
(< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one
(server:kind-run areapath))
(thread-sleep! 5)
(loop (server:check-if-running areapath)
(+ try-num 1)))))))
;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
(define (server:check-if-running areapath) ;; #!key (numservers "2"))
(let* ((ns (server:get-num-servers))
(servers (server:get-best (server:get-list areapath))))
(if (or (and servers
(null? servers))
(not servers)
(and (list? servers)
(< (length servers) (random ns)))) ;; somewhere between 0 and numservers
#f
(let loop ((hed (car servers))
(tal (cdr servers)))
(let ((res (server:check-server hed)))
(if res
hed
(if (null? tal)
#f
(loop (car tal)(cdr tal)))))))))
;; ping the given server
;;
(define (server:check-server server-record)
(let* ((server-url (server:record->url server-record))
(server-id (server:record->id server-record))
(res (case *transport-type*
((http)(server:ping server-url server-id))
;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
)))
(if res
server-url
#f)))
(define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG.
(define (server:writable-watchdog-bruteforce dbstruct)
(thread-sleep! 1) ;; delay for startup
(let* ((do-a-sync (server:get-bruteforce-syncer dbstruct))
(final-sync (server:get-bruteforce-syncer dbstruct fork-to-background: #t persist-until-sync: #t)))
(when (and (not (args:get-arg "-sync-to-megatest.db")) ;; conditions under which we do not run the sync
(args:get-arg "-server"))
(let loop ()
(do-a-sync)
(if (not *time-to-exit*) (loop))) ;; keep going unless time to exit
;; time to exit, close the no-sync db here
(final-sync)
(if (common:low-noise-print 30)
(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)
)))))
(define (server:writable-watchdog-deltasync dbstruct)
(thread-sleep! 0.05) ;; delay for startup
(let ((legacy-sync (common:run-sync?))
(sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300))
(debug-mode (debug:debug-mode 1))
(last-time (current-seconds))
(no-sync-db (db:open-no-sync-db))
(stmt-cache (dbr:dbstruct-stmt-cache dbstruct))
(sync-duration 0) ;; run time of the sync in milliseconds
;;(this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x)))
)
(set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls
(debug:print-info 2 *default-log-port* "Periodic sync thread started.")
(debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id) );; " this-wd-num="this-wd-num)
(if (and legacy-sync (not *time-to-exit*))
(let* (;;(dbstruct (db:setup))
(mtdb (dbr:dbstruct-mtdb dbstruct))
(mtpath (db:dbdat-get-path mtdb))
(tmp-area (common:get-db-tmp-area))
(start-file (conc tmp-area "/.start-sync"))
(end-file (conc tmp-area "/.end-sync")))
(debug:print-info 0 *default-log-port* "Server running, periodic sync started.")
(let loop ()
;; sync for filesystem local db writes
;;
(mutex-lock! *db-multi-sync-mutex*)
(let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write
(sync-in-progress *db-sync-in-progress*)
(min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 5))
(should-sync (and (not *time-to-exit*)
(> (- (current-seconds) *db-last-sync*) min-intersync-delay))) ;; sync every five seconds minimum, deprecated logic, can probably be removed
(start-time (current-seconds))
(cpu-load-adj (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)))
(mt-mod-time (file-modification-time mtpath))
(last-sync-start (if (common:file-exists? start-file)
(file-modification-time start-file)
0))
(last-sync-end (if (common:file-exists? end-file)
(file-modification-time end-file)
10))
(sync-period (+ 3 (* cpu-load-adj 30))) ;; as adjusted load increases increase the sync period
(recently-synced (and (< (- start-time mt-mod-time) sync-period) ;; not useful if sync didn't modify megatest.db!
(< mt-mod-time last-sync-start)))
(sync-done (<= last-sync-start last-sync-end))
(sync-stale (> start-time (+ last-sync-start sync-stale-seconds)))
(will-sync (and (not *time-to-exit*) ;; do not start a sync if we are in the process of exiting
(or need-sync should-sync)
(or sync-done sync-stale)
(not sync-in-progress)
(not recently-synced))))
(debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop. need-sync="need-sync" sync-in-progress=" sync-in-progress
" should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync
" sync-done=" sync-done " sync-period=" sync-period)
(if (and (> sync-period 5)
(common:low-noise-print 30 "sync-period"))
(debug:print-info 0 *default-log-port* "Increased sync period due to long sync times, sync took: " sync-period " seconds."))
;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced))
;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync)
(if will-sync (set! *db-sync-in-progress* #t))
(mutex-unlock! *db-multi-sync-mutex*)
(if will-sync
(let (;; (max-sync-duration (configf:lookup-number *configdat* "server" "max-sync-duration")) ;; KEEPING THIS AVAILABLE BUT SHOULD NOT USE, I'M PRETTY SURE IT DOES NOT WORK!
(sync-start (current-milliseconds)))
(with-output-to-file start-file (lambda ()(print (current-process-id))))
;; put lock here
;; (if (or (not max-sync-duration)
;; (< sync-duration max-sync-duration)) ;; NOTE: db:sync-to-megatest.db keeps track of time of last sync and syncs incrementally
(let ((res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive
(set! sync-duration (- (current-milliseconds) sync-start))
(if (> res 0) ;; some records were transferred, keep the db alive
(begin
(mutex-lock! *heartbeat-mutex*)
(set! *db-last-access* (current-seconds))
(mutex-unlock! *heartbeat-mutex*)
(debug:print-info 0 *default-log-port* "sync called, " res " records transferred."))
(debug:print-info 2 *default-log-port* "sync called but zero records transferred")))))
;; ;; TODO: factor this next routine out into a function
;; (with-input-from-pipe ;; this should not block other threads but need to verify this
;; (conc "megatest -sync-to-megatest.db -m testsuite:" (common:get-area-name) ":" *toppath*)
;; (lambda ()
;; (let loop ((inl (read-line))
;; (res #f))
;; (if (eof-object? inl)
;; (begin
;; (set! sync-duration (- (current-milliseconds) sync-start))
;; (cond
;; ((not res)
;; (debug:print 0 *default-log-port* "ERROR: sync from /tmp db to megatest.db appears to have failed. Recommended that you stop your runs and run \"megatest -cleanup-db\""))
;; ((> res 0)
;; (mutex-lock! *heartbeat-mutex*)
;; (set! *db-last-access* (current-seconds))
;; (mutex-unlock! *heartbeat-mutex*))))
;; (let ((num-synced (let ((matches (string-match "^Synced (\\d+).*$" inl)))
;; (if matches
;; (string->number (cadr matches))
;; #f))))
;; (loop (read-line)
;; (or num-synced res))))))))))
(if will-sync
(begin
(mutex-lock! *db-multi-sync-mutex*)
(set! *db-sync-in-progress* #f)
(set! *db-last-sync* start-time)
(with-output-to-file end-file (lambda ()(print (current-process-id))))
;; release lock here
(mutex-unlock! *db-multi-sync-mutex*)))
(if (and debug-mode
(> (- start-time last-time) 60))
(begin
(set! last-time start-time)
(debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*))))))
;; keep going unless time to exit
;;
(if (not *time-to-exit*)
(let delay-loop ((count 0))
;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*)
(if (and (not *time-to-exit*)
(< count 6)) ;; was 11, changing to 4.
(begin
(thread-sleep! 1)
(delay-loop (+ count 1))))
(if (not *time-to-exit*) (loop))))
;; time to exit, close the no-sync db here
(db:no-sync-close-db no-sync-db stmt-cache)
(if (common:low-noise-print 30)
(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) ))))))) ;;" this-wd-num="this-wd-num)))))))