;;======================================================================
;; Copyright 2019, Matthew Welland.
;;
;; This file is part of Megatest.
;;
;; Megatest is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;;
;; Megatest is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with Megatest. If not, see <http://www.gnu.org/licenses/>.
;;======================================================================
(declare (unit rmtmod))
;; (declare (uses commonmod))
(declare (uses dbmod))
(module rmtmod
*
(import scheme chicken data-structures extras)
(import (prefix sqlite3 sqlite3:) posix typed-records srfi-18 srfi-69 format ports srfi-1 matchable)
;; (import commonmod) ;;; DO NOT ALLOW rmt*scm TO DEPEND ON common*scm!!!!
(import dbmod)
(use (prefix ulex ulex:))
(include "common_records.scm")
(define (rmt:open-qry-close-locally log-port multi-sync-mutex cmd run-id params alldat #!key (remretries 5))
(let* ((ro-queries (alldat-read-only-queries alldat))
(qry-is-write (not (member cmd ro-queries)))
(db-file-path (common:get-db-tmp-area alldat)) ;; 0))
(dbstruct-local (db:setup #t)) ;; make-dbr:dbstruct path: dbdir local: #t)))
(read-only (not (file-write-access? db-file-path)))
(start (current-milliseconds))
(resdat (if (not (and read-only qry-is-write))
(let ((v (api:execute-requests dbstruct-local (vector (symbol->string cmd) params))))
(handle-exceptions ;; there has been a
;; long history of
;; receiving strange
;; errors from
;; values returned
;; by the client
;; when things go
;; wrong..
exn ;; This is an attempt to detect that situation and recover gracefully
(begin
(debug:print 0 log-port "ERROR: bad data from server " v " message: "
((condition-property-accessor 'exn 'message) exn))
(vector #t '())) ;; should always
;; get a vector but
;; if something
;; goes wrong
;; return a dummy
(if (and (vector? v)
(> (vector-length v) 1))
(let ((newvec (vector (vector-ref v 0)(vector-ref v 1))))
newvec) ;; by copying the vector
;; while inside the
;; error handler we
;; should force the
;; detection of a
;; corrupted record
(vector #t '())))) ;; we could also check that the returned types are valid
(vector #t '())))
(success (vector-ref resdat 0))
(res (vector-ref resdat 1))
(duration (- (current-milliseconds) start)))
(if (and read-only qry-is-write)
(debug:print 0 log-port "ERROR: attempt to write to read-only database ignored. cmd=" cmd))
(if (not success)
(if (> remretries 0)
(begin
(debug:print-error 0 log-port "local query failed. Trying again.")
(thread-sleep! (/ (random 5000) 1000)) ;; some random delay
(rmt:open-qry-close-locally log-port multi-sync-mutex cmd run-id params alldat remretries: (- remretries 1)))
(begin
(debug:print-error 0 log-port "too many retries in rmt:open-qry-close-locally, giving up")
#f))
(begin
;; (rmt:update-db-stats run-id cmd params duration)
;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it
#;(if qry-is-write
(let ((start-time (current-seconds)))
(mutex-lock! multi-sync-mutex)
(set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client)
(mutex-unlock! multi-sync-mutex)))))
res))
(define (rmtmod:calc-ro-mode areadat toppath)
(if (and areadat
(alldat-ro-mode-checked areadat))
(alldat-ro-mode areadat)
(let* ((dbfile (conc toppath "/megatest.db"))
(ro-mode (not (file-write-access? dbfile)))) ;; TODO: use dbstruct or areadat to figure this out in future
(if areadat
(begin
(alldat-ro-mode-set! areadat ro-mode)
(alldat-ro-mode-checked-set! areadat #t)
ro-mode)
ro-mode))))
(define (extras-readonly-mode rmt-mutex log-port cmd params)
;;(mutex-unlock! rmt-mutex)
(debug:print-info 12 log-port "rmt:send-receive, case 3")
(debug:print 0 log-port "WARNING: write transaction requested on a readonly area. cmd="cmd" params="params)
#f)
(define (extras-transport-failed log-port rmt-mutex attemptnum areadat areapath cmd rid params alldat)
(debug:print 0 log-port "WARNING: communication failed. Trying again, try num: " attemptnum)
;;(mutex-lock! rmt-mutex)
(alldat-conndat-set! areadat #f)
(exec-fn 'http-transport:close-connections area-dat: areadat)
(alldat-server-url-set! areadat #f)
;;(mutex-unlock! rmt-mutex)
(debug:print-info 12 log-port "rmt:send-receive, case 9.1")
(rmt:send-receive-orig log-port areadat rmt-mutex areapath cmd rid params alldat attemptnum: (+ attemptnum 1)))
(define (extras-transport-succeded log-port rmt-mutex attemptnum areadat areapath res params rid cmd alldat)
(if (and (vector? res)
(eq? (vector-length res) 2)
(eq? (vector-ref res 1) 'overloaded)) ;; since we are
;; looking at the
;; data to carry the
;; error we'll use a
;; fairly obtuse
;; combo to minimise
;; the chances of
;; some sort of
;; collision. this
;; is the case where
;; the returned data
;; is bad or the
;; server is
;; overloaded and we
;; want to ease off
;; the queries
(let ((wait-delay (+ attemptnum (* attemptnum 10))))
(debug:print 0 log-port "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.")
;;(mutex-lock! rmt-mutex)
(exec-fn 'http-transport:close-connections area-dat: areadat)
;; (set! *areadat* #f) ;; force starting over
(alldat-server-url-set! areadat #f) ;; I am hoping this will force a redo on server connection. NOT TESTED
;;(mutex-unlock! rmt-mutex)
(thread-sleep! wait-delay)
(rmt:send-receive-orig log-port areadat rmt-mutex areapath cmd rid params alldat attemptnum: (+ attemptnum 1)))
res)) ;; All good, return res
;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;
;; add multi-sync-mutex
;;
(define (rmt:send-receive-orig log-port areadat rmt-mutex toppath multi-sync-mutex cmd rid params alldat #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected
#;(common:telemetry-log (conc "rmt:"(->string cmd))
payload: `((rid . ,rid)
(params . ,params)))
;; do all the prep locked under the rmt-mutex
;;(mutex-lock! rmt-mutex)
;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in areadat
;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
;; 3. do the query, if on homehost use local access
;;
(let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value
(readonly-mode (rmtmod:calc-ro-mode areadat toppath)))
;; (assert (not (pair? (alldat-hh-dat areadat))))
;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
(cond
;; give up if more than 15 attempts
((> attemptnum 15)
(debug:print 0 log-port "ERROR: 15 tries to start/connect to server. Giving up.")
(exit 1))
;; readonly mode, read request- handle it - case 2
((and readonly-mode
(member cmd api:read-only-queries))
;; (mutex-unlock! rmt-mutex)
(debug:print-info 12 log-port "rmt:send-receive, case 2")
(rmt:open-qry-close-locally log-port multi-sync-mutex cmd 0 params alldat)
)
;; readonly mode, write request. Do nothing, return #f
(readonly-mode (extras-readonly-mode rmt-mutex log-port cmd params))
;; This block was for pre-emptively resetting the connection if there had been no communication for some time.
;; I don't think it adds any value. If the server is not there, just fail and start a new connection.
;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout)
;;
;; reset the connection if it has been unused too long
((and areadat
(alldat-conndat areadat)
(> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on
(+ (http-transport:server-dat-get-last-access (alldat-conndat areadat))
(alldat-server-timeout areadat))))
(debug:print-info 0 log-port "Connection to " (alldat-server-url areadat) " expired due to no accesses, forcing new connection.")
(exec-fn 'http-transport:close-connections area-dat: areadat)
(alldat-conndat-set! areadat #f) ;; invalidate the connection, thus forcing a new connection.
;; (mutex-unlock! rmt-mutex)
(rmt:send-receive-orig log-port areadat rmt-mutex toppath multi-sync-mutex cmd rid params alldat attemptnum: attemptnum))
;; on homehost and this is a read
((and (not (alldat-force-server areadat)) ;; honor forced use of server, i.e. server NOT required
(pair? (alldat-hh-dat areadat))
(cdr (alldat-hh-dat areadat)) ;; on homehost
(member cmd api:read-only-queries)) ;; this is a read
;; (mutex-unlock! rmt-mutex)
(debug:print-info 12 log-port "rmt:send-receive, case 5")
(rmt:open-qry-close-locally log-port multi-sync-mutex cmd 0 params alldat))
;; on homehost and this is a write, we already have a server, but server has died
((and (cdr (alldat-hh-dat areadat)) ;; on homehost
(not (member cmd api:read-only-queries)) ;; this is a write
(alldat-server-url areadat) ;; have a server
(not (exec-fn 'server:ping (alldat-server-url areadat)))) ;; server has died. NOTE: this is not a cheap call! Need better approach.
;; (set! *areadat* (make-remote)) ;; WARNING - broken this.
(alldat-force-server-set! areadat (exec-fn 'common:force-server?))
;; (mutex-unlock! rmt-mutex)
(debug:print-info 12 log-port "rmt:send-receive, case 6")
(rmt:send-receive-orig log-port areadat rmt-mutex toppath multi-sync-mutex cmd rid params alldat attemptnum: attemptnum))
;; on homehost and this is a write, we already have a server
((and (not (alldat-force-server areadat)) ;; honor forced use of server, i.e. server NOT required
(cdr (alldat-hh-dat areadat)) ;; on homehost
(not (member cmd api:read-only-queries)) ;; this is a write
(alldat-server-url areadat)) ;; have a server
;;(mutex-unlock! rmt-mutex)
(debug:print-info 12 log-port "rmt:send-receive, case 4.1")
(rmt:open-qry-close-locally log-port multi-sync-mutex cmd 0 params alldat))
;; on homehost, no server contact made and this is a write, passively start a server
((and (not (alldat-force-server areadat)) ;; honor forced use of server, i.e. server NOT required
(cdr (alldat-hh-dat areadat)) ;; have homehost
(not (alldat-server-url areadat)) ;; no connection yet
(not (member cmd api:read-only-queries))) ;; not a read-only query
(debug:print-info 12 log-port "rmt:send-receive, case 8")
(let ((server-url (exec-fn 'server:check-if-running toppath))) ;; (server:read-dotserver->url toppath))) ;; (server:check-if-running toppath))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
(if server-url
(alldat-server-url-set! areadat server-url) ;; the string can be consumed by the client setup if needed
(if (exec-fn 'common:force-server?)
(exec-fn 'server:start-and-wait toppath)
(exec-fn 'server:kind-run toppath))))
(alldat-force-server-set! areadat (exec-fn 'common:force-server?))
;; (mutex-unlock! rmt-mutex)
(debug:print-info 12 log-port "rmt:send-receive, case 8.1")
(rmt:open-qry-close-locally log-port multi-sync-mutex cmd 0 params alldat))
((or (and (alldat-force-server areadat) ;; we are forcing a server and don't yet have a connection to one
(not (alldat-conndat areadat)))
(and (not (cdr (alldat-hh-dat areadat))) ;; not on a homehost
(not (alldat-conndat areadat)))) ;; and no connection
(debug:print-info 12 log-port "rmt:send-receive, case 9, hh-dat: " (alldat-hh-dat areadat) " conndat: " (alldat-conndat areadat))
;;(mutex-unlock! rmt-mutex)
(if (not (exec-fn 'server:check-if-running toppath)) ;; who knows, maybe one has started up?
(exec-fn 'server:start-and-wait toppath))
(alldat-conndat-set! areadat (rmt:get-connection-info areadat toppath)) ;; calls client:setup which calls client:setup-http
(rmt:send-receive-orig log-port areadat rmt-mutex toppath multi-sync-mutex cmd rid params alldat attemptnum: attemptnum)) ;; TODO: add back-off timeout as
;; all set up if get this far, dispatch the query
((and (not (alldat-force-server areadat))
(cdr (alldat-hh-dat areadat))) ;; we are on homehost
;;(mutex-unlock! rmt-mutex)
(debug:print-info 12 log-port "rmt:send-receive, case 10")
(rmt:open-qry-close-locally log-port multi-sync-mutex cmd (if rid rid 0) params alldat))
;; not on homehost, do server query
(else (extras-case-11 log-port rmt-mutex areadat toppath cmd params attemptnum rid alldat)))))
(define (extras-case-11 log-port rmt-mutex areadat areapath cmd params attemptnum rid alldat)
;; (mutex-unlock! rmt-mutex)
(debug:print-info 12 log-port "rmt:send-receive, case 9")
;; (mutex-lock! rmt-mutex)
(let* ((conninfo (alldat-conndat areadat))
(dat (case (alldat-transport areadat)
((http) (condition-case ;; handling here has
;; caused a lot of
;; problems. However it
;; is needed to deal with
;; attemtped
;; communication to
;; servers that have gone
;; away
(exec-fn 'http-transport:client-api-send-receive 0 conninfo cmd params)
((commfail)(vector #f "communications fail"))
((exn)(vector #f "other fail" (print-call-chain)))))
(else
(debug:print 0 log-port "ERROR: transport " (alldat-transport areadat) " not supported")
(exit))))
(success (if (vector? dat) (vector-ref dat 0) #f))
(res (if (vector? dat) (vector-ref dat 1) #f)))
(if (and (vector? conninfo) (< 5 (vector-length conninfo)))
(http-transport:server-dat-update-last-access conninfo) ;; refresh access time
(begin
(debug:print 0 log-port "INFO: Should not get here! conninfo=" conninfo)
(set! conninfo #f)
(alldat-conndat-set! areadat #f) ;; NOTE: *areadat* is global copy of areadat. Purpose: factor out global.
(exec-fn 'http-transport:close-connections area-dat: areadat)))
(debug:print-info 13 log-port "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " areadat = " areadat)
;; (mutex-unlock! rmt-mutex)
(if success ;; success only tells us that the transport was
;; successful, have to examine the data to see if
;; there was a detected issue at the other end
(extras-transport-succeded log-port rmt-mutex attemptnum areadat areapath res params rid cmd alldat)
(extras-transport-failed log-port rmt-mutex attemptnum areadat areapath cmd rid params alldat)
)))
;; if a server is either running or in the process of starting call client:setup
;; else return #f to let the calling proc know that there is no server available
;;
(define (rmt:get-connection-info areadat areapath #!key (area-dat #f)) ;; TODO: push areapath down.
(let* (;; (areadat (or area-dat areadat))
(cinfo (if (alldat? areadat)
(alldat-conndat areadat)
#f)))
(if cinfo
cinfo
(if (exec-fn 'server:check-if-running areapath)
(exec-fn 'client:setup areadat areapath)
#f))))
;;======================================================================
;; ulex and steps stuff
;;======================================================================
(define (rmtmod:setup-ulex toppath)
(ulex:make-area
dbdir: (conc toppath "/ulexdb")
pktsdir: (conc toppath "/pkts")
))
(define (rmtmod:send-receive-ulex ulex:conn cmd rid params attemptnum area-dat)
#f)
(use trace)(trace-call-sites #t)
;; (trace member rmtmod:calc-ro-mode rmt:open-qry-close-locally)
;;======================================================================
;; L O C K I N G M E C H A N I S M S
;;======================================================================
;; faux-lock is deprecated. Please use simple-lock below
;;
(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t))
(if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count
(if (> wait-time 0)
(begin
(thread-sleep! 1)
(if (eq? wait-time 1) ;; only one second left, steal the lock
(begin
(debug:print-info 0 *default-log-port* "stealing lock for " keyname)
(common:faux-unlock keyname force: #t)))
(common:faux-lock keyname wait-time: (- wait-time 1)))
#f)
(begin
(rmt:no-sync-set keyname (conc (current-process-id)))
(equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f))))))
(define (common:faux-unlock keyname #!key (force #f))
(if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f))))
(begin
(if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname))
#t)
#f))
;; simple lock. improve and converge on this one.
;;
(define (common:simple-lock keyname)
(rmt:no-sync-get-lock keyname))
(define (common:simple-unlock keyname #!key (force #f))
(rmt:no-sync-del! keyname))
;; ideally put all this info into the db, no need to preserve it across moving homehost
;;
;; return list of
;; ( reachable? cpuload update-time )
(define (common:get-host-info hostname)
(let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data
(load (car loadinfo))
(load-sample-time (cdr loadinfo))
(load-sample-age (- (current-seconds) load-sample-time))
(loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds
(host-last-update-timeout-seconds 4)
(host-rec (hash-table-ref/default *host-loads* hostname #f))
)
(cond
((< load-sample-age loadinfo-timeout-seconds)
(list #t
load-sample-time
load))
((and host-rec
(< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds)))
(list #t
(host-last-update host-rec)
(host-last-cpuload host-rec )))
((common:unix-ping hostname)
(list #t
(current-seconds)
(alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds
(else
(list #f 0 -1) ;; bad host, don't use!
))))
;; see defstruct host at top of file.
;; host: reachable last-update last-used last-cpuload
;;
(define (common:update-host-loads-table hosts-raw)
(let* ((hosts (filter (lambda (x)
(string-match (regexp "^\\S+$") x))
hosts-raw)))
(for-each
(lambda (hostname)
(let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f)))
(if h
h
(let ((h (make-host)))
(hash-table-set! *host-loads* hostname h)
h))))
(host-info (common:get-host-info hostname))
(is-reachable (car host-info))
(last-reached-time (cadr host-info))
(load (caddr host-info)))
(host-reachable-set! rec is-reachable)
(host-last-update-set! rec last-reached-time)
(host-last-cpuload-set! rec load)))
hosts)))
;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the
;; [host-rules] section.
;;
(define (common:get-least-loaded-host hosts-raw host-type configdat)
(let* ((rdat (configf:lookup configdat "host-rules" host-type))
(rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate
(maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load
(maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs
(maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second
(hosts (filter (lambda (x)
(string-match (regexp "^\\S+$") x))
hosts-raw))
;; (best-host #f)
(get-rec (lambda (hostname)
;; (print "get-rec hostname=" hostname)
(let ((h (hash-table-ref/default *host-loads* hostname #f)))
(if h
h
(let ((h (make-host)))
(hash-table-set! *host-loads* hostname h)
h)))))
(best-load 99999)
(curr-time (current-seconds))
(get-hosts-sorted (lambda (hosts)
(sort hosts (lambda (a b)
(let ((a-rec (get-rec a))
(b-rec (get-rec b)))
;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec))
;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec))
(< (host-last-used a-rec)
(host-last-used b-rec))))))))
(debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts))
(if (null? hosts)
#f ;; no hosts to select from. All done and giving up now.
(let ((hosts-sorted (get-hosts-sorted hosts)))
(common:update-host-loads-table hosts)
(let loop ((hostname (car hosts-sorted))
(tal (cdr hosts-sorted))
(best-host #f))
(let* ((rec (get-rec hostname))
(reachable (host-reachable rec))
(load (host-last-cpuload rec))
(last-used (host-last-used rec))
(delta (- curr-time last-used))
(job-rate (if (> delta 0)
(/ 1 delta)
999)) ;; jobs per second
(new-best
(cond
((not reachable)
(debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.")
best-host)
((and (< load maxnload) ;; load is acceptable
(< job-rate maxjobrate)) ;; job rate is acceptable
(set! best-load load)
hostname)
(else best-host))))
(debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." )
(if new-best
(begin ;; found a host, return it
(debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
(host-last-used-set! rec curr-time)
new-best)
(if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))
)