;;======================================================================
;; Copyright 2017, Matthew Welland.
;;
;; This file is part of Megatest.
;;
;; Megatest is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;;
;; Megatest is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with Megatest. If not, see <http://www.gnu.org/licenses/>.
;;======================================================================
(declare (unit rmtmod))
(declare (uses commonmod))
(declare (uses debugprint))
(declare (uses apimod))
(declare (uses dbmod))
(declare (uses configfmod))
(declare (uses margsmod))
(declare (uses portlogger))
(module rmtmod
*
(import scheme chicken data-structures extras ports)
(import (prefix sqlite3 sqlite3:)
directory-utils
intarweb
matchable
md5
message-digest
uri-common
spiffy
spiffy-directory-listing
spiffy-request-vars
http-client
posix
posix-extras
regex
typed-records
srfi-1
srfi-13
srfi-18
srfi-69
tcp)
(import apimod)
(import commonmod)
(import debugprint)
(import dbmod)
(import configfmod)
(import margsmod)
(import portlogger)
(defstruct alldat
(areapath #f)
(ulexdat #f)
)
(define (rmtmod:calc-ro-mode runremote *toppath*)
(if (and runremote
(remote-ro-mode-checked runremote))
(remote-ro-mode runremote)
(let* ((ro-mode (not (db:writeable *toppath* "megatest.db"))))
(if runremote
(begin
(remote-ro-mode-set! runremote ro-mode)
(remote-ro-mode-checked-set! runremote #t)
ro-mode)
ro-mode))))
;;
;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!!
;;
;; generate entries for ~/.megatestrc with the following
;;
;; grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u
;;======================================================================
;; S U P P O R T F U N C T I O N S
;;======================================================================
;; if a server is either running or in the process of starting call client:setup
;; else return #f to let the calling proc know that there is no server available
;;
(define (rmt:get-connection-info areapath #!key (area-dat #f)) ;; TODO: push areapath down.
(let* ((runremote (or area-dat *runremote*))
(cinfo (if (remote? runremote)
(remote-conndat runremote)
#f)))
(if cinfo
cinfo
(if (server:check-if-running areapath)
(client:setup areapath)
#f))))
;;======================================================================
(define (create-remote-record)
(let ((rr (make-remote)))
(rmt:init-remote rr)
rr))
(define (rmt:init-remote rr)
(remote-hh-dat-set! rr (common:get-homehost)) ;
(remote-server-info-set! rr (if *toppath* (server:check-if-running *toppath*) #f))
(remote-transport-set! rr *transport-type*)
(remote-server-timeout-set! rr (server:expiration-timeout))
rr)
;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected
#;(common:telemetry-log (conc "rmt:"(->string cmd))
payload: `((rid . ,rid)
(params . ,params)))
(if (> attemptnum 2)
(debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum))
(cond
((> attemptnum 2) (thread-sleep! 0.05))
((> attemptnum 10) (thread-sleep! 0.5))
((> attemptnum 20) (thread-sleep! 1)))
(if (and (> attemptnum 5) (= 0 (modulo attemptnum 15)))
(begin (server:run *toppath*) (thread-sleep! 3)))
;;DOT digraph megatest_state_status {
;;DOT ranksep=0;
;;DOT // rankdir=LR;
;;DOT node [shape="box"];
;;DOT "rmt:send-receive" -> MUTEXLOCK;
;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; }
;; do all the prep locked under the rmt-mutex
(mutex-lock! *rmt-mutex*)
;; set up runremote record earlier than the loop below
(if (not *runremote*) ;; can remove this one. should never get here.
(begin
(set! *runremote* (create-remote-record))
(let* ((server-info (remote-server-info *runremote*)))
(if server-info
(begin
(remote-server-url-set! *runremote* (server:record->url server-info))
(remote-server-id-set! *runremote* (server:record->id server-info)))))
#;(set! area-dat *runremote*))) ;; new runremote will come from this on next iteration
;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
;; 3. do the query, if on homehost use local access
;;
(let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value
(areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
(runremote (or area-dat
*runremote*))
(attemptnum (+ 1 attemptnum))
(readonly-mode (rmtmod:calc-ro-mode runremote *toppath*)))
;; DOT INIT_RUNREMOTE; // leaving off - doesn't really add to the clarity
;; DOT MUTEXLOCK -> INIT_RUNREMOTE [label="no remote?"];
;; DOT INIT_RUNREMOTE -> MUTEXLOCK;
;; ensure we have a record for our connection for given area
;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity
;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"];
;; DOT SET_HOMEHOST -> MUTEXLOCK;
;; ensure we have a homehost record
(if (not (pair? (remote-hh-dat runremote))) ;; not on homehost
(thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
(remote-hh-dat-set! runremote (common:get-homehost)))
;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
(cond
;;DOT EXIT;
;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" }
;; give up if more than 150 attempts
((> attemptnum 150)
(debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. Giving up.")
(exit 1))
;;DOT CASE2 [label="local\nreadonly\nquery"];
;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2}
;;DOT CASE2 -> "rmt:open-qry-close-locally";
;; readonly mode, read request- handle it - case 2
((and readonly-mode
(member cmd api:read-only-queries))
(mutex-unlock! *rmt-mutex*)
(debug:print-info 12 *default-log-port* "rmt:send-receive, case 2")
(rmt:open-qry-close-locally cmd 0 params)
)
;;DOT CASE3 [label="write in\nread-only mode"];
;;DOT MUTEXLOCK -> CASE3 [label="readonly\nmode?"]; {rank=same "case 3" CASE3}
;;DOT CASE3 -> "#f";
;; readonly mode, write request. Do nothing, return #f
(readonly-mode (extras-readonly-mode *rmt-mutex* *default-log-port* cmd params))
;; This block was for pre-emptively resetting the connection if there had been no communication for some time.
;; I don't think it adds any value. If the server is not there, just fail and start a new connection.
;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout)
;;
;;DOT CASE4 [label="reset\nconnection"];
;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4}
;;DOT CASE4 -> "rmt:send-receive";
;; reset the connection if it has been unused too long
((and runremote
(remote-conndat runremote)
(> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on
(+ (http-transport:server-dat-get-last-access (remote-conndat runremote))
(remote-server-timeout runremote))))
(debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.")
(http-transport:close-connections area-dat: runremote)
(remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection.
(mutex-unlock! *rmt-mutex*)
(rmt:send-receive cmd rid params attemptnum: attemptnum))
;;DOT CASE5 [label="local\nread"];
;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5};
;;DOT CASE5 -> "rmt:open-qry-close-locally";
;; on homehost and this is a read
((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required
(cdr (remote-hh-dat runremote)) ;; on homehost
(member cmd api:read-only-queries)) ;; this is a read
(mutex-unlock! *rmt-mutex*)
(debug:print-info 12 *default-log-port* "rmt:send-receive, case 5")
(rmt:open-qry-close-locally cmd 0 params))
;;DOT CASE6 [label="init\nremote"];
;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6};
;;DOT CASE6 -> "rmt:send-receive";
;; on homehost and this is a write, we already have a server, but server has died
((and (cdr (remote-hh-dat runremote)) ;; on homehost
(not (member cmd api:read-only-queries)) ;; this is a write
(remote-server-url runremote) ;; have a server
(not (server:ping (remote-server-url runremote) (remote-server-id runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach.
(set! *runremote* (create-remote-record))
(let* ((server-info (remote-server-info *runremote*)))
(if server-info
(begin
(remote-server-url-set! *runremote* (server:record->url server-info))
(remote-server-id-set! *runremote* (server:record->id server-info)))))
(remote-force-server-set! runremote (common:force-server?))
(mutex-unlock! *rmt-mutex*)
(debug:print-info 12 *default-log-port* "rmt:send-receive, case 6")
(rmt:send-receive cmd rid params attemptnum: attemptnum))
;;DOT CASE7 [label="homehost\nwrite"];
;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7};
;;DOT CASE7 -> "rmt:open-qry-close-locally";
;; on homehost and this is a write, we already have a server
((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required
(cdr (remote-hh-dat runremote)) ;; on homehost
(not (member cmd api:read-only-queries)) ;; this is a write
(remote-server-url runremote)) ;; have a server
(mutex-unlock! *rmt-mutex*)
(debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1")
(rmt:open-qry-close-locally cmd 0 params))
;;DOT CASE8 [label="force\nserver"];
;;DOT MUTEXLOCK -> CASE8 [label="server not required,\nhave homehost info,\nno connection yet,\nnot a read-only query"]; {rank=same "case 8" CASE8};
;;DOT CASE8 -> "rmt:open-qry-close-locally";
;; on homehost, no server contact made and this is a write, passively start a server
((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required
(cdr (remote-hh-dat runremote)) ;; have homehost
(not (remote-server-url runremote)) ;; no connection yet
(not (member cmd api:read-only-queries))) ;; not a read-only query
(debug:print-info 12 *default-log-port* "rmt:send-receive, case 8")
(let ((server-info (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call
(if server-info
(begin
(remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed
(remote-server-id-set! runremote (server:record->id server-info)))
(if (common:force-server?)
(server:start-and-wait *toppath*)
(server:kind-run *toppath*)))
(remote-force-server-set! runremote (common:force-server?))
(mutex-unlock! *rmt-mutex*)
(debug:print-info 12 *default-log-port* "rmt:send-receive, case 8.1")
(rmt:open-qry-close-locally cmd 0 params)))
;;DOT CASE9 [label="force server\nnot on homehost"];
;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9};
;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive";
((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one
(not (remote-conndat runremote)))
(and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost
(not (remote-conndat runremote)))) ;; and no connection
(debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote))
(mutex-unlock! *rmt-mutex*)
(if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up?
(server:start-and-wait *toppath*))
(remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http
(rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as
;;DOT CASE10 [label="on homehost"];
;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10};
;;DOT CASE10 -> "rmt:open-qry-close-locally";
;; all set up if get this far, dispatch the query
((and (not (remote-force-server runremote))
(cdr (remote-hh-dat runremote))) ;; we are on homehost
(mutex-unlock! *rmt-mutex*)
(debug:print-info 12 *default-log-port* "rmt:send-receive, case 10")
(rmt:open-qry-close-locally cmd (if rid rid 0) params))
;;DOT CASE11 [label="send_receive"];
;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11};
;;DOT CASE11 -> "rmt:send-receive" [label="call failed"];
;;DOT CASE11 -> "RESULT" [label="call succeeded"];
;; not on homehost, do server query
(else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid)))))
;;DOT }
;; bunch of small functions factored out of send-receive to make debug easier
;;
(define (extras-case-11 *default-log-port* runremote cmd params attemptnum rid)
;; (mutex-unlock! *rmt-mutex*)
(debug:print-info 12 *default-log-port* "rmt:send-receive, case 9")
;; (mutex-lock! *rmt-mutex*)
(let* ((conninfo (remote-conndat runremote))
(dat-in (case (remote-transport runremote)
((http) (condition-case ;; handling here has
;; caused a lot of
;; problems. However it
;; is needed to deal with
;; attemtped
;; communication to
;; servers that have gone
;; away
(http-transport:client-api-send-receive 0 conninfo cmd params)
((servermismatch) (vector #f "Server id mismatch" ))
((commfail)(vector #f "communications fail"))
((exn)(vector #f "other fail" (print-call-chain)))))
(else
(debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported")
(exit))))
;; No Title
;; Error: (vector-ref) out of range
;; #(#<condition: (exn type)> (#("db.scm:3740: regex#regexp" #f #f) #("db.scm:3739: regex#string-substitute" #f #f) #("db.scm:3738: base64#base64-decode" #f #f) #("db.scm:3737: z3#z3:decode-buffer" #f #f) #("db.scm:3736: with-input-from-string" #f #f) #("db.scm:3741: s11n#deserialize" #f #f) #("api.scm:374: api:execute-requests" #f #f) #("api.scm:139: call-with-current-continuation" #f #f) #("api.scm:139: with-exception-handler" #f #f) #("api.scm:139: ##sys#call-with-values" #f #f) #("api.scm:158: string->symbol" #f #f) #("api.scm:160: current-milliseconds" #f #f) #("api.scm:161: dbr:dbstruct-read-only" #f #f) #("api.scm:139: k15" #f #f) #("api.scm:139: g19" #f #f) #("api.scm:142: get-call-chain" #f #f)) #("get-test-info-by-id" (1102 507299)))
;; 6
;;
;; Call history:
;;
;; http-transport.scm:306: thread-terminate!
;; http-transport.scm:307: debug:print-info
;; common_records.scm:235: debug:debug-mode
;; rmt.scm:259: k587
;; rmt.scm:259: g591
;; rmt.scm:276: http-transport:server-dat-update-last-access
;; http-transport.scm:364: current-seconds
;; rmt.scm:282: debug:print-info
;; common_records.scm:235: debug:debug-mode
;; rmt.scm:283: mutex-unlock!
;; rmt.scm:287: extras-transport-succeded <--
;; +-----------------------------------------------------------------------------+
;; | Exit Status : 70
;;
(dat (if (and (vector? dat-in) ;; ... check it is a correct size
(> (vector-length dat-in) 1))
dat-in
(vector #f (conc "communications fail (type 2), dat-in=" dat-in))))
(success (if (vector? dat) (vector-ref dat 0) #f))
(res (if (vector? dat) (vector-ref dat 1) #f)))
(if (and (vector? conninfo) (< 5 (vector-length conninfo)))
(http-transport:server-dat-update-last-access conninfo) ;; refresh access time
(begin
(debug:print 0 *default-log-port* "INFO: Should not get here! conninfo=" conninfo)
(set! conninfo #f)
(remote-conndat-set! *runremote* #f) ;; NOTE: *runremote* is global copy of runremote. Purpose: factor out global.
(http-transport:close-connections area-dat: runremote)))
(debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote)
(mutex-unlock! *rmt-mutex*)
(if success ;; success only tells us that the transport was
;; successful, have to examine the data to see if
;; there was a detected issue at the other end
(extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd)
(begin
(debug:print-error 0 *default-log-port* " dat=" dat)
(extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params))
)))
(define (rmt:print-db-stats)
(let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f"
(debug:print 18 *default-log-port* "DB Stats\n========")
(debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg"))
(for-each (lambda (cmd)
(let ((cmd-dat (hash-table-ref *db-stats* cmd)))
(debug:print 18 *default-log-port* (format #f fmtstr cmd (vector-ref cmd-dat 0) (vector-ref cmd-dat 1) (/ (vector-ref cmd-dat 1)(vector-ref cmd-dat 0))))))
(sort (hash-table-keys *db-stats*)
(lambda (a b)
(> (vector-ref (hash-table-ref *db-stats* a) 0)
(vector-ref (hash-table-ref *db-stats* b) 0)))))))
(define (rmt:get-max-query-average run-id)
(mutex-lock! *db-stats-mutex*)
(let* ((runkey (conc "run-id=" run-id " "))
(cmds (filter (lambda (x)
(substring-index runkey x))
(hash-table-keys *db-stats*)))
(res (if (null? cmds)
(cons 'none 0)
(let loop ((cmd (car cmds))
(tal (cdr cmds))
(max-cmd (car cmds))
(res 0))
(let* ((cmd-dat (hash-table-ref *db-stats* cmd))
(tot (vector-ref cmd-dat 0))
(curravg (/ (vector-ref cmd-dat 1) (vector-ref cmd-dat 0))) ;; count is never zero by construction
(currmax (max res curravg))
(newmax-cmd (if (> curravg res) cmd max-cmd)))
(if (null? tal)
(if (> tot 10)
(cons newmax-cmd currmax)
(cons 'none 0))
(loop (car tal)(cdr tal) newmax-cmd currmax)))))))
(mutex-unlock! *db-stats-mutex*)
res))
(define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5))
(let* ((qry-is-write (not (member cmd api:read-only-queries)))
(db-file-path (common:get-db-tmp-area)) ;; db:dbfile-path)) ;; 0))
(dbstruct-local (db:setup #t)) ;; make-dbr:dbstruct path: dbdir local: #t)))
(read-only (not (file-write-access? db-file-path)))
(start (current-milliseconds))
(resdat (if (not (and read-only qry-is-write))
(let ((v (api:execute-requests dbstruct-local (vector (symbol->string cmd) params))))
(handle-exceptions ;; there has been a long history of receiving strange errors from values returned by the client when things go wrong..
exn ;; This is an attempt to detect that situation and recover gracefully
(begin
(debug:print 0 *default-log-port* "ERROR: bad data from server " v " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)
(vector #t '())) ;; should always get a vector but if something goes wrong return a dummy
(if (and (vector? v)
(> (vector-length v) 1))
(let ((newvec (vector (vector-ref v 0)(vector-ref v 1))))
newvec) ;; by copying the vector while inside the error handler we should force the detection of a corrupted record
(vector #t '())))) ;; we could also check that the returned types are valid
(vector #t '())))
(success (vector-ref resdat 0))
(res (vector-ref resdat 1))
(duration (- (current-milliseconds) start)))
(if (and read-only qry-is-write)
(debug:print 0 *default-log-port* "ERROR: attempt to write to read-only database ignored. cmd=" cmd))
(if (not success)
(if (> remretries 0)
(begin
(debug:print-error 0 *default-log-port* "local query failed. Trying again.")
(thread-sleep! (/ (random 5000) 1000)) ;; some random delay
(rmt:open-qry-close-locally cmd run-id params remretries: (- remretries 1)))
(begin
(debug:print-error 0 *default-log-port* "too many retries in rmt:open-qry-close-locally, giving up")
#f))
(begin
;; (rmt:update-db-stats run-id cmd params duration)
;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it
(if qry-is-write
(let ((start-time (current-seconds)))
(mutex-lock! *db-multi-sync-mutex*)
(set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client)
(mutex-unlock! *db-multi-sync-mutex*)))))
res))
(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)
(let* ((run-id (if run-id run-id 0))
(res (handle-exceptions
exn
(begin
(print "transport failed. exn=" exn)
#f)
(http-transport:client-api-send-receive run-id connection-info cmd params))))
(if (and res (vector-ref res 0))
(vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!!
#f)))
;;======================================================================
;;
;; A C T U A L A P I C A L L S
;;
;;======================================================================
;;======================================================================
;; S E R V E R
;;======================================================================
(define (rmt:kill-server run-id)
(rmt:send-receive 'kill-server run-id (list run-id)))
(define (rmt:start-server run-id)
(rmt:send-receive 'start-server 0 (list run-id)))
;;======================================================================
;; M I S C
;;======================================================================
(define (rmt:login run-id)
(assert *my-client-signature* "ERROR: login attempted without first calling (client:get-signature).")
(rmt:send-receive 'login run-id (list *toppath* megatest-version *my-client-signature*)))
;; This login does no retries under the hood - it acts a bit like a ping.
;; Deprecated for nmsg-transport.
;;
(define (rmt:login-no-auto-client-setup connection-info)
(case *transport-type* ;; run-id of 0 is just a placeholder
((http)(rmt:send-receive-no-auto-client-setup connection-info 'login 0 (list *toppath* megatest-version *my-client-signature*)))
;;((nmsg)(nmsg-transport:client-api-send-receive run-id connection-info 'login (list *toppath* megatest-version run-id *my-client-signature*)))
))
;; hand off a call to one of the db:queries statements
;; added run-id to make looking up the correct db possible
;;
(define (rmt:general-call stmtname run-id . params)
(rmt:send-receive 'general-call run-id (append (list stmtname run-id) params)))
;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host
(define (rmt:get-latest-host-load hostname)
(rmt:send-receive 'get-latest-host-load 0 (list hostname)))
(define (rmt:sdb-qry qry val run-id)
;; add caching if qry is 'getid or 'getstr
(rmt:send-receive 'sdb-qry run-id (list qry val)))
;; NOT COMPLETED
(define (rmt:runtests user run-id testpatt params)
(rmt:send-receive 'runtests run-id testpatt))
(define (rmt:get-run-record-ids target run keynames test-patt)
(rmt:send-receive 'get-run-record-ids #f (list target run keynames test-patt)))
(define (rmt:get-changed-record-ids since-time)
(rmt:send-receive 'get-changed-record-ids #f (list since-time)) )
(define (rmt:drop-all-triggers)
(rmt:send-receive 'drop-all-triggers #f '()))
(define (rmt:create-all-triggers)
(rmt:send-receive 'create-all-triggers #f '()))
;;======================================================================
;; T E S T M E T A
;;======================================================================
(define (rmt:get-tests-tags)
(rmt:send-receive 'get-tests-tags #f '()))
;;======================================================================
;; K E Y S
;;======================================================================
;; These require run-id because the values come from the run!
;;
(define (rmt:get-key-val-pairs run-id)
(rmt:send-receive 'get-key-val-pairs run-id (list run-id)))
(define (rmt:get-keys)
(if *db-keys* *db-keys*
(let ((res (rmt:send-receive 'get-keys #f '())))
(set! *db-keys* res)
res)))
(define (rmt:get-keys-write) ;; dummy query to force server start
(let ((res (rmt:send-receive 'get-keys-write #f '())))
(set! *db-keys* res)
res))
;; we don't reuse run-id's (except possibly *after* a db cleanup) so it is safe
;; to cache the resuls in a hash
;;
(define (rmt:get-key-vals run-id)
(or (hash-table-ref/default *keyvals* run-id #f)
(let ((res (rmt:send-receive 'get-key-vals #f (list run-id))))
(hash-table-set! *keyvals* run-id res)
res)))
(define (rmt:get-targets)
(rmt:send-receive 'get-targets #f '()))
(define (rmt:get-target run-id)
(rmt:send-receive 'get-target run-id (list run-id)))
(define (rmt:get-run-times runpatt targetpatt)
(rmt:send-receive 'get-run-times #f (list runpatt targetpatt )))
;;======================================================================
;; T E S T S
;;======================================================================
;; Just some syntatic sugar
(define (rmt:register-test run-id test-name item-path)
(rmt:general-call 'register-test run-id run-id test-name item-path))
(define (rmt:get-test-id run-id testname item-path)
(rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))
;; run-id is NOT used
;;
(define (rmt:get-test-info-by-id run-id test-id)
(if (number? test-id)
(rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
(begin
(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
(print-call-chain (current-error-port))
#f)))
(define (rmt:test-get-rundir-from-test-id run-id test-id)
(rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))
(define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f))
(let* ((test-path (if (string? work-area)
work-area
(rmt:test-get-rundir-from-test-id run-id test-id))))
(debug:print 3 *default-log-port* "TEST PATH: " test-path)
(open-test-db test-path)))
;; WARNING: This currently bypasses the transaction wrapped writes system
(define (rmt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment)
(rmt:send-receive 'test-set-state-status-by-id run-id (list run-id test-id newstate newstatus newcomment)))
(define (rmt:set-tests-state-status run-id testnames currstate currstatus newstate newstatus)
(rmt:send-receive 'set-tests-state-status run-id (list run-id testnames currstate currstatus newstate newstatus)))
(define (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)
;; (if (number? run-id)
(rmt:send-receive 'get-tests-for-run run-id (list run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)))
;; (begin
;; (debug:print-error 0 *default-log-port* "rmt:get-tests-for-run called with bad run-id=" run-id)
;; (print-call-chain (current-error-port))
;; '())))
(define (rmt:get-tests-for-run-state-status run-id testpatt last-update)
(rmt:send-receive 'get-tests-for-run-state-status run-id (list run-id testpatt last-update)))
;; get stuff via synchash
(define (rmt:synchash-get run-id proc synckey keynum params)
(rmt:send-receive 'synchash-get run-id (list run-id proc synckey keynum params)))
(define (rmt:get-tests-for-run-mindata run-id testpatt states status not-in)
(rmt:send-receive 'get-tests-for-run-mindata run-id (list run-id testpatt states status not-in)))
;; IDEA: Threadify these - they spend a lot of time waiting ...
;;
(define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in)
(let ((multi-run-mutex (make-mutex))
(run-id-list (if run-ids
run-ids
(rmt:get-all-run-ids)))
(result '()))
(if (null? run-id-list)
'()
(let loop ((hed (car run-id-list))
(tal (cdr run-id-list))
(threads '()))
(if (> (length threads) 5)
(loop hed tal (filter (lambda (th)(not (member (thread-state th) '(terminated dead)))) threads))
(let* ((newthread (make-thread
(lambda ()
(let ((res (rmt:send-receive 'get-tests-for-run-mindata hed (list hed testpatt states status not-in))))
(if (list? res)
(begin
(mutex-lock! multi-run-mutex)
(set! result (append result res))
(mutex-unlock! multi-run-mutex))
(debug:print-error 0 *default-log-port* "get-tests-for-run-mindata failed for run-id " hed ", testpatt " testpatt ", states " states ", status " status ", not-in " not-in))))
(conc "multi-run-thread for run-id " hed)))
(newthreads (cons newthread threads)))
(thread-start! newthread)
(thread-sleep! 0.05) ;; give that thread some time to start
(if (null? tal)
newthreads
(loop (car tal)(cdr tal) newthreads))))))
result))
;; ;; IDEA: Threadify these - they spend a lot of time waiting ...
;; ;;
;; (define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in)
;; (let ((run-id-list (if run-ids
;; run-ids
;; (rmt:get-all-run-ids))))
;; (apply append (map (lambda (run-id)
;; (rmt:send-receive 'get-tests-for-run-mindata run-id (list run-ids testpatt states status not-in)))
;; run-id-list))))
(define (rmt:delete-test-records run-id test-id)
(rmt:send-receive 'delete-test-records run-id (list run-id test-id)))
(define (rmt:test-set-state-status run-id test-id state status msg)
(rmt:send-receive 'test-set-state-status run-id (list run-id test-id state status msg)))
(define (rmt:test-toplevel-num-items run-id test-name)
(rmt:send-receive 'test-toplevel-num-items run-id (list run-id test-name)))
;; (define (rmt:get-previous-test-run-record run-id test-name item-path)
;; (rmt:send-receive 'get-previous-test-run-record run-id (list run-id test-name item-path)))
(define (rmt:get-matching-previous-test-run-records run-id test-name item-path)
(rmt:send-receive 'get-matching-previous-test-run-records run-id (list run-id test-name item-path)))
(define (rmt:test-get-logfile-info run-id test-name)
(rmt:send-receive 'test-get-logfile-info run-id (list run-id test-name)))
(define (rmt:test-get-records-for-index-file run-id test-name)
(rmt:send-receive 'test-get-records-for-index-file run-id (list run-id test-name)))
(define (rmt:get-testinfo-state-status run-id test-id)
(rmt:send-receive 'get-testinfo-state-status run-id (list run-id test-id)))
(define (rmt:test-set-log! run-id test-id logf)
(if (string? logf)(rmt:general-call 'test-set-log run-id logf test-id)))
(define (rmt:test-set-top-process-pid run-id test-id pid)
(rmt:send-receive 'test-set-top-process-pid run-id (list run-id test-id pid)))
(define (rmt:test-get-top-process-pid run-id test-id)
(rmt:send-receive 'test-get-top-process-pid run-id (list run-id test-id)))
(define (rmt:get-run-ids-matching-target keynames target res runname testpatt statepatt statuspatt)
(rmt:send-receive 'get-run-ids-matching-target #f (list keynames target res runname testpatt statepatt statuspatt)))
;; NOTE: This will open and access ALL run databases.
;;
(define (rmt:test-get-paths-matching-keynames-target-new keynames target res testpatt statepatt statuspatt runname)
(let ((run-ids (rmt:get-run-ids-matching-target keynames target res runname testpatt statepatt statuspatt)))
(apply append
(map (lambda (run-id)
(rmt:send-receive 'test-get-paths-matching-keynames-target-new run-id (list run-id keynames target res testpatt statepatt statuspatt runname)))
run-ids))))
(define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path #!key (mode '(normal))(itemmaps #f))
(rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps)))
(define (rmt:get-count-tests-running-for-run-id run-id)
(if (number? run-id)
(rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id))
0))
(define (rmt:get-not-completed-cnt run-id)
(rmt:send-receive 'get-not-completed-cnt run-id (list run-id)))
;; Statistical queries
(define (rmt:get-count-tests-running run-id)
(rmt:send-receive 'get-count-tests-running run-id (list run-id)))
(define (rmt:get-count-tests-running-for-testname run-id testname)
(rmt:send-receive 'get-count-tests-running-for-testname run-id (list run-id testname)))
(define (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)
(rmt:send-receive 'get-count-tests-running-in-jobgroup run-id (list run-id jobgroup)))
;; state and status are extra hints not usually used in the calculation
;;
(define (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status comment)
(rmt:send-receive 'set-state-status-and-roll-up-items run-id (list run-id test-name item-path state status comment)))
(define (rmt:set-state-status-and-roll-up-run run-id state status)
(rmt:send-receive 'set-state-status-and-roll-up-run run-id (list run-id state status)))
(define (rmt:update-pass-fail-counts run-id test-name)
(rmt:general-call 'update-pass-fail-counts run-id test-name test-name test-name))
(define (rmt:top-test-set-per-pf-counts run-id test-name)
(rmt:send-receive 'top-test-set-per-pf-counts run-id (list run-id test-name)))
(define (rmt:get-raw-run-stats run-id)
(rmt:send-receive 'get-raw-run-stats run-id (list run-id)))
(define (rmt:get-test-times runname target)
(rmt:send-receive 'get-test-times #f (list runname target )))
;;======================================================================
;; R U N S
;;======================================================================
(define (rmt:get-run-info run-id)
(rmt:send-receive 'get-run-info run-id (list run-id)))
(define (rmt:get-num-runs runpatt)
(rmt:send-receive 'get-num-runs #f (list runpatt)))
(define (rmt:get-runs-cnt-by-patt runpatt targetpatt keys)
(rmt:send-receive 'get-runs-cnt-by-patt #f (list runpatt targetpatt keys)))
;; Use the special run-id == #f scenario here since there is no run yet
(define (rmt:register-run keyvals runname state status user contour)
(rmt:send-receive 'register-run #f (list keyvals runname state status user contour)))
(define (rmt:get-run-name-from-id run-id)
(rmt:send-receive 'get-run-name-from-id run-id (list run-id)))
(define (rmt:delete-run run-id)
(rmt:send-receive 'delete-run run-id (list run-id)))
(define (rmt:update-run-stats run-id stats)
(rmt:send-receive 'update-run-stats #f (list run-id stats)))
(define (rmt:delete-old-deleted-test-records)
(rmt:send-receive 'delete-old-deleted-test-records #f '()))
(define (rmt:get-runs runpatt count offset keypatts)
(rmt:send-receive 'get-runs #f (list runpatt count offset keypatts)))
(define (rmt:simple-get-runs runpatt count offset target last-update)
(rmt:send-receive 'simple-get-runs #f (list runpatt count offset target last-update)))
(define (rmt:get-all-run-ids)
(rmt:send-receive 'get-all-run-ids #f '()))
(define (rmt:get-prev-run-ids run-id)
(rmt:send-receive 'get-prev-run-ids #f (list run-id)))
(define (rmt:lock/unlock-run run-id lock unlock user)
(rmt:send-receive 'lock/unlock-run #f (list run-id lock unlock user)))
;; set/get status
(define (rmt:get-run-status run-id)
(rmt:send-receive 'get-run-status #f (list run-id)))
(define (rmt:get-run-state run-id)
(rmt:send-receive 'get-run-state #f (list run-id)))
(define (rmt:set-run-status run-id run-status #!key (msg #f))
(rmt:send-receive 'set-run-status #f (list run-id run-status msg)))
(define (rmt:set-run-state-status run-id state status )
(rmt:send-receive 'set-run-state-status #f (list run-id state status)))
(define (rmt:update-tesdata-on-repilcate-db old-lt new-lt)
(rmt:send-receive 'update-tesdata-on-repilcate-db #f (list old-lt new-lt)))
(define (rmt:update-run-event_time run-id)
(rmt:send-receive 'update-run-event_time #f (list run-id)))
(define (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-runs-update #!key (sort-order "asc")) ;; fields of #f uses default
(rmt:send-receive 'get-runs-by-patt #f (list keys runnamepatt targpatt offset limit fields last-runs-update sort-order)))
(define (rmt:find-and-mark-incomplete run-id ovr-deadtime)
;; (if (rmt:send-receive 'have-incompletes? run-id (list run-id ovr-deadtime))
(rmt:send-receive 'mark-incomplete run-id (list run-id ovr-deadtime))) ;; )
(define (rmt:get-main-run-stats run-id)
(rmt:send-receive 'get-main-run-stats #f (list run-id)))
(define (rmt:get-var varname)
(rmt:send-receive 'get-var #f (list varname)))
(define (rmt:del-var varname)
(rmt:send-receive 'del-var #f (list varname)))
(define (rmt:set-var varname value)
(rmt:send-receive 'set-var #f (list varname value)))
(define (rmt:inc-var varname)
(rmt:send-receive 'inc-var #f (list varname)))
(define (rmt:dec-var varname)
(rmt:send-receive 'dec-var #f (list varname)))
(define (rmt:add-var varname value)
(rmt:send-receive 'add-var #f (list varname value)))
;;======================================================================
;; M U L T I R U N Q U E R I E S
;;======================================================================
;; Need to move this to multi-run section and make associated changes
(define (rmt:find-and-mark-incomplete-all-runs #!key (ovr-deadtime #f))
(let ((run-ids (rmt:get-all-run-ids)))
(for-each (lambda (run-id)
(rmt:find-and-mark-incomplete run-id ovr-deadtime))
run-ids)))
;; get the previous record for when this test was run where all keys match but runname
;; returns #f if no such test found, returns a single test record if found
;;
;; Run this at the client end since we have to connect to multiple run-id dbs
;;
(define (rmt:get-previous-test-run-record run-id test-name item-path)
(let* ((keyvals (rmt:get-key-val-pairs run-id))
(keys (rmt:get-keys))
(selstr (string-intersperse keys ","))
(qrystr (string-intersperse (map (lambda (x)(conc x "=?")) keys) " AND ")))
(if (not keyvals)
#f
(let ((prev-run-ids (rmt:get-prev-run-ids run-id)))
;; for each run starting with the most recent look to see if there is a matching test
;; if found then return that matching test record
(debug:print 4 *default-log-port* "selstr: " selstr ", qrystr: " qrystr ", keyvals: " keyvals ", previous run ids found: " prev-run-ids)
(if (null? prev-run-ids) #f
(let loop ((hed (car prev-run-ids))
(tal (cdr prev-run-ids)))
(let ((results (rmt:get-tests-for-run hed (conc test-name "/" item-path) '() '() ;; run-id testpatt states statuses
#f #f #f ;; offset limit not-in hide/not-hide
#f #f #f #f 'normal))) ;; sort-by sort-order qryvals last-update mode
(debug:print 4 *default-log-port* "Got tests for run-id " run-id ", test-name " test-name ", item-path " item-path ": " results)
(if (and (null? results)
(not (null? tal)))
(loop (car tal)(cdr tal))
(if (null? results) #f
(car results))))))))))
(define (rmt:get-run-stats)
(rmt:send-receive 'get-run-stats #f '()))
;;======================================================================
;; S T E P S
;;======================================================================
;; Getting steps is more complicated.
;;
;; If given work area
;; 1. Find the testdat.db file
;; 2. Open the testdat.db file and do the query
;; If not given the work area
;; 1. Do a remote call to get the test path
;; 2. Continue as above
;;
;;(define (rmt:get-steps-for-test run-id test-id)
;; (rmt:send-receive 'get-steps-data run-id (list test-id)))
(define (rmt:teststep-set-status! run-id test-id teststep-name state-in status-in comment logfile)
(let* ((state (items:check-valid-items "state" state-in))
(status (items:check-valid-items "status" status-in)))
(if (or (not state)(not status))
(debug:print 3 *default-log-port* "WARNING: Invalid " (if status "status" "state")
" value \"" (if status state-in status-in) "\", update your validvalues section in megatest.config"))
(rmt:send-receive 'teststep-set-status! run-id (list run-id test-id teststep-name state-in status-in comment logfile))))
(define (rmt:delete-steps-for-test! run-id test-id)
(rmt:send-receive 'delete-steps-for-test! run-id (list run-id test-id)))
(define (rmt:get-steps-for-test run-id test-id)
(rmt:send-receive 'get-steps-for-test run-id (list run-id test-id)))
(define (rmt:get-steps-info-by-id test-step-id)
(rmt:send-receive 'get-steps-info-by-id #f (list test-step-id)))
;;======================================================================
;; T E S T D A T A
;;======================================================================
(define (rmt:read-test-data run-id test-id categorypatt #!key (work-area #f))
(rmt:send-receive 'read-test-data run-id (list run-id test-id categorypatt)))
(define (rmt:read-test-data-varpatt run-id test-id categorypatt varpatt #!key (work-area #f))
(rmt:send-receive 'read-test-data-varpatt run-id (list run-id test-id categorypatt varpatt)))
(define (rmt:get-data-info-by-id test-data-id)
(rmt:send-receive 'get-data-info-by-id #f (list test-data-id)))
(define (rmt:testmeta-add-record testname)
(rmt:send-receive 'testmeta-add-record #f (list testname)))
(define (rmt:testmeta-get-record testname)
(rmt:send-receive 'testmeta-get-record #f (list testname)))
(define (rmt:testmeta-update-field test-name fld val)
(rmt:send-receive 'testmeta-update-field #f (list test-name fld val)))
(define (rmt:test-data-rollup run-id test-id status)
(rmt:send-receive 'test-data-rollup run-id (list run-id test-id status)))
(define (rmt:csv->test-data run-id test-id csvdata)
(rmt:send-receive 'csv->test-data run-id (list run-id test-id csvdata)))
;;======================================================================
;; T A S K S
;;======================================================================
(define (rmt:tasks-find-task-queue-records target run-name test-patt state-patt action-patt)
(rmt:send-receive 'find-task-queue-records #f (list target run-name test-patt state-patt action-patt)))
(define (rmt:tasks-add action owner target runname testpatt params)
(rmt:send-receive 'tasks-add #f (list action owner target runname testpatt params)))
(define (rmt:tasks-set-state-given-param-key param-key new-state)
(rmt:send-receive 'tasks-set-state-given-param-key #f (list param-key new-state)))
(define (rmt:tasks-get-last target runname)
(rmt:send-receive 'tasks-get-last #f (list target runname)))
;;======================================================================
;; N O S Y N C D B
;;======================================================================
(define (rmt:no-sync-set var val)
(rmt:send-receive 'no-sync-set #f `(,var ,val)))
(define (rmt:no-sync-get/default var default)
(rmt:send-receive 'no-sync-get/default #f `(,var ,default)))
(define (rmt:no-sync-del! var)
(rmt:send-receive 'no-sync-del! #f `(,var)))
(define (rmt:no-sync-get-lock keyname)
(rmt:send-receive 'no-sync-get-lock #f `(,keyname)))
;;======================================================================
;; A R C H I V E S
;;======================================================================
(define (rmt:archive-get-allocations testname itempath dneeded)
(rmt:send-receive 'archive-get-allocations #f (list testname itempath dneeded)))
(define (rmt:archive-register-block-name bdisk-id archive-path)
(rmt:send-receive 'archive-register-block-name #f (list bdisk-id archive-path)))
(define (rmt:archive-allocate-testsuite/area-to-block block-id testsuite-name areakey)
(rmt:send-receive 'archive-allocate-test-to-block #f (list block-id testsuite-name areakey)))
(define (rmt:archive-register-disk bdisk-name bdisk-path df)
(rmt:send-receive 'archive-register-disk #f (list bdisk-name bdisk-path df)))
(define (rmt:test-set-archive-block-id run-id test-id archive-block-id)
(rmt:send-receive 'test-set-archive-block-id run-id (list run-id test-id archive-block-id)))
(define (rmt:test-get-archive-block-info archive-block-id)
(rmt:send-receive 'test-get-archive-block-info #f (list archive-block-id)))
(define (extras-readonly-mode rmt-mutex log-port cmd params)
(mutex-unlock! rmt-mutex)
(debug:print-info 12 log-port "rmt:send-receive, case 3")
(debug:print 0 log-port "WARNING: write transaction requested on a readonly area. cmd="cmd" params="params)
#f)
(define (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params)
(debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
(mutex-lock! *rmt-mutex*)
(remote-conndat-set! runremote #f)
(http-transport:close-connections area-dat: runremote)
(remote-server-url-set! runremote #f)
(mutex-unlock! *rmt-mutex*)
(debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1")
(rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
(define (extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd)
(if (and (vector? res)
(eq? (vector-length res) 2)
(eq? (vector-ref res 1) 'overloaded)) ;; since we are
;; looking at the
;; data to carry the
;; error we'll use a
;; fairly obtuse
;; combo to minimise
;; the chances of
;; some sort of
;; collision. this
;; is the case where
;; the returned data
;; is bad or the
;; server is
;; overloaded and we
;; want to ease off
;; the queries
(let ((wait-delay (+ attemptnum (* attemptnum 10))))
(debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.")
(mutex-lock! *rmt-mutex*)
(http-transport:close-connections area-dat: runremote)
(set! *runremote* #f) ;; force starting over
(mutex-unlock! *rmt-mutex*)
(thread-sleep! wait-delay)
(rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
res)) ;; All good, return res
#;(set-functions rmt:send-receive remote-server-url-set!
http-transport:close-connections remote-conndat-set!
debug:print debug:print-info
remote-ro-mode remote-ro-mode-set!
remote-ro-mode-checked-set! remote-ro-mode-checked)
;;======================================================================
;; EVERYTHING FROM TRANSPORT
;;======================================================================
(define (http-transport:make-server-url hostport)
(if (not hostport)
#f
(conc "http://" (car hostport) ":" (cadr hostport))))
;;======================================================================
;; S E R V E R
;; ======================================================================
;; Call this to start the actual server
;;
;; (define *db:process-queue-mutex* (make-mutex))
(define (http-transport:run hostn)
;; Configurations for server
(tcp-buffer-size 2048)
(max-connections 2048)
(debug:print 2 *default-log-port* "Attempting to start the server ...")
(let* ((db #f) ;; (open-db)) ;; we don't want the server to be opening and closing the db unnecesarily
(hostname (get-host-name))
(ipaddrstr (let ((ipstr (if (string=? "-" hostn)
;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
(server:get-best-guess-address hostname)
#f)))
(if ipstr ipstr hostn))) ;; hostname)))
(start-port (portlogger:open-run-close
(lambda (db)
(portlogger:find-port db))))
(link-tree-path (common:get-linktree))
(tmp-area (common:get-db-tmp-area))
(start-file (conc tmp-area "/.server-start")))
(debug:print-info 0 *default-log-port* "portlogger recommended port: " start-port)
;; set some parameters for the server
(root-path (if link-tree-path
link-tree-path
(current-directory))) ;; WARNING: SECURITY HOLE. FIX ASAP!
(handle-directory spiffy-directory-listing)
(handle-exception (lambda (exn chain)
(signal (make-composite-condition
(make-property-condition
'server
'message "server error")))))
;; http-transport:handle-directory) ;; simple-directory-handler)
;; Setup the web server and a /ctrl interface
;;
(vhost-map `(((* any) . ,(lambda (continue)
;; open the db on the first call
;; This is were we set up the database connections
(let* (($ (request-vars source: 'both))
(dat ($ 'dat))
(res #f))
(cond
((equal? (uri-path (request-uri (current-request)))
'(/ "api"))
(send-response body: (api:process-request *dbstruct-db* $) ;; the $ is the request vars proc
headers: '((content-type text/plain)))
(mutex-lock! *heartbeat-mutex*)
(set! *db-last-access* (current-seconds))
(mutex-unlock! *heartbeat-mutex*))
;; ((equal? (uri-path (request-uri (current-request)))
;; '(/ ""))
;; (send-response body: (http-transport:main-page)))
;;((equal? (uri-path (request-uri (current-request)))
;; '(/ "json_api"))
;; (send-response body: (http-transport:main-page)))
;;((equal? (uri-path (request-uri (current-request)))
;; '(/ "runs"))
;; (send-response body: (http-transport:main-page)))
;;((equal? (uri-path (request-uri (current-request)))
;; '(/ any))
;; (send-response body: "hey there!\n"
;; headers: '((content-type text/plain))))
;;((equal? (uri-path (request-uri (current-request)))
;; '(/ "hey"))
;; (send-response body: "hey there!\n"
;; headers: '((content-type text/plain))))
;;((equal? (uri-path (request-uri (current-request)))
;; '(/ "jquery3.1.0.js"))
;; (send-response body: (http-transport:show-jquery)
;; headers: '((content-type application/javascript))))
;;((equal? (uri-path (request-uri (current-request)))
;; '(/ "test_log"))
;; (send-response body: (http-transport:html-test-log $)
;; headers: '((content-type text/HTML))))
;;((equal? (uri-path (request-uri (current-request)))
;; '(/ "dashboard"))
;; (send-response body: (http-transport:html-dboard $)
;; headers: '((content-type text/HTML))))
(else (continue))))))))
(handle-exceptions
exn
(debug:print 0 *default-log-port* "Failed to create file " start-file ", exn=" exn)
(with-output-to-file start-file (lambda ()(print (current-process-id)))))
(http-transport:try-start-server ipaddrstr start-port)))
;; This is recursively run by http-transport:run until sucessful
;;
(define (http-transport:try-start-server ipaddrstr portnum)
(let ((config-hostname (configf:lookup *configdat* "server" "hostname"))
(config-use-proxy (equal? (configf:lookup *configdat* "client" "use-http_proxy") "yes")))
(if (not config-use-proxy)
(determine-proxy (constantly #f)))
(debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " ipaddrsstr=" ipaddrstr " portnum=" portnum " config-hostname=" config-hostname)
(handle-exceptions
exn
(begin
(print-error-message exn)
(if (< portnum 64000)
(begin
(debug:print 0 *default-log-port* "WARNING: attempt to start server failed. Trying again ...")
(debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
(debug:print 5 *default-log-port* "exn=" (condition->list exn))
(portlogger:open-run-close portlogger:set-failed portnum)
(debug:print 0 *default-log-port* "WARNING: failed to start on portnum: " portnum ", trying next port")
(thread-sleep! 0.1)
;; get_next_port goes here
(http-transport:try-start-server ipaddrstr
(portlogger:open-run-close portlogger:find-port)))
(begin
(print "ERROR: Tried and tried but could not start the server"))))
;; any error in following steps will result in a retry
(set! *server-info* (list ipaddrstr portnum))
(debug:print 0 *default-log-port* "INFO: Trying to start server on " ipaddrstr ":" portnum)
;; This starts the spiffy server
;; NEED WAY TO SET IP TO #f TO BIND ALL
;; (start-server bind-address: ipaddrstr port: portnum)
(if config-hostname ;; this is a hint to bind directly
(start-server port: portnum bind-address: (if (equal? config-hostname "-")
ipaddrstr
config-hostname))
(start-server port: portnum))
(portlogger:open-run-close
(lambda (db)
(portlogger:set-port db portnum "released")))
(debug:print 1 *default-log-port* "INFO: server has been stopped"))))
;;======================================================================
;; EVERYTHING FROM SERVERMOD
;;======================================================================
(define (server:make-server-url hostport)
(if (not hostport)
#f
(conc "http://" (car hostport) ":" (cadr hostport))))
;;======================================================================
;; S E R V E R U T I L I T I E S
;;======================================================================
;; Get the transport
#;(define (server:get-transport)
(if *transport-type*
*transport-type*
(let ((ttype (string->symbol
(or (args:get-arg "-transport")
(configf:lookup *configdat* "server" "transport")
"rpc"))))
(set! *transport-type* ttype)
ttype)))
;; Generate a unique signature for this server
(define (server:mk-signature)
(message-digest-string (md5-primitive)
(with-output-to-string
(lambda ()
(write (list (current-directory)
(current-process-id)
(argv)))))))
;; When using zmq this would send the message back (two step process)
;; with spiffy or rpc this simply returns the return data to be returned
;;
(define (server:reply return-addr query-sig success/fail result)
(debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result)
;; (send-message pubsock target send-more: #t)
;; (send-message pubsock
(db:obj->string (vector success/fail query-sig result)))
;; (case (server:get-transport)
;; ((rpc) (db:obj->string (vector success/fail query-sig result)))
;; ((http) (db:obj->string (vector success/fail query-sig result)))
;; ((fs) result)
;; (else
;; (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
;; result)))
(define (server:kind-run areapath)
;; look for $MT_RUN_AREA_HOME/logs/server-start-last
;; and wait for it to be at least 3 seconds old
(server:wait-for-server-start-last-flag areapath)
(if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
(let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
(call-num (car last-run-dat))
(when-run (cadr last-run-dat))
(run-delay (+ (case call-num
((0) 0)
((1) 20)
((2) 300)
(else 600))
(random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
(lock-file (conc areapath "/logs/server-start.lock")))
(if (> (- (current-seconds) when-run) run-delay)
(let* ((start-flag (conc areapath "/logs/server-start-last")))
(common:simple-file-lock-and-wait lock-file expire-time: 15)
(debug:print-info 0 *default-log-port* "server:kind-run: touching " start-flag)
(system (conc "touch " start-flag)) ;; lazy but safe
(server:run areapath)
(thread-sleep! 2) ;; don't release the lock for at least a few seconds
(common:simple-file-release-lock lock-file)))
(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))
(define (server:get-num-alive srvlst)
(let ((num-alive 0))
(for-each
(lambda (server)
(handle-exceptions
exn
(begin
(debug:print-info 0 *default-log-port* "Unable to get server start-time and/or mod-time from " server ", exn=" exn))
(match-let (((mod-time host port start-time server-id pid)
server))
(let* ((uptime (- (current-seconds) mod-time))
(runtime (if start-time
(- mod-time start-time)
0)))
(if (< uptime 5)(set! num-alive (+ num-alive 1)))))))
srvlst)
num-alive))
;; given a list of servers get a list of valid servers, i.e. at least
;; 10 seconds old, has started and is less than 1 hour old and is
;; active (i.e. mod-time < 10 seconds
;;
;; mod-time host port start-time pid
;;
;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off
;; and servers should stick around for about two hours or so.
;;
(define (server:get-best srvlst)
(let* ((nums (server:get-num-servers))
(now (current-seconds))
(slst (sort
(filter (lambda (rec)
(if (and (list? rec)
(> (length rec) 2))
(let ((start-time (list-ref rec 3))
(mod-time (list-ref rec 0)))
;; (print "start-time: " start-time " mod-time: " mod-time)
(and start-time mod-time
(> (- now start-time) 0) ;; been running at least 0 seconds
(< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds
(or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set
(< (- now start-time)
(+ (- (string->number (configf:lookup *configdat* "server" "runtime"))
180)
(random 360)))) ;; under one hour running time +/- 180
))
#f))
srvlst)
(lambda (a b)
(< (list-ref a 3)
(list-ref b 3))))))
(if (> (length slst) nums)
(take slst nums)
slst)))
(define (server:get-first-best areapath)
(let ((srvrs (server:get-best (server:get-list areapath))))
(if (and srvrs
(not (null? srvrs)))
(car srvrs)
#f)))
(define (server:get-rand-best areapath)
(let ((srvrs (server:get-best (server:get-list areapath))))
(if (and (list? srvrs)
(not (null? srvrs)))
(let* ((len (length srvrs))
(idx (random len)))
(list-ref srvrs idx))
#f)))
(define (server:record->id servr)
(handle-exceptions
exn
(begin
(debug:print-info 0 *default-log-port* "Unable to get server id from " servr ", exn=" exn)
#f)
(match-let (((mod-time host port start-time server-id pid)
servr))
(if server-id
server-id
#f))))
(define (server:record->url servr)
(handle-exceptions
exn
(begin
(debug:print-info 0 *default-log-port* "Unable to get server url from " servr ", exn=" exn)
#f)
(match-let (((mod-time host port start-time server-id pid)
servr))
(if (and host port)
(conc host ":" port)
#f))))
(define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value.
(if *my-client-signature* *my-client-signature*
(let ((sig (server:mk-signature)))
(set! *my-client-signature* sig)
*my-client-signature*)))
;; wait for server=start-last to be three seconds old
;;
(define (server:wait-for-server-start-last-flag areapath)
(let* ((start-flag (conc areapath "/logs/server-start-last"))
;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
(reftime (configf:lookup-number *configdat* "server" "idletime" default: 4))
(server-key (conc (get-host-name) "-" (current-process-id))))
(if (file-exists? start-flag)
(let* ((fmodtime (file-modification-time start-flag))
(delta (- (current-seconds) fmodtime))
(all-go (> delta reftime)))
(if (and all-go
(begin
(with-output-to-file start-flag
(lambda ()
(print server-key)))
(thread-sleep! 0.25)
(let ((res (with-input-from-file start-flag
(lambda ()
(read-line)))))
(equal? server-key res))))
#t ;; (system (conc "touch " start-flag)) ;; lazy but safe
(begin
(debug:print-info 0 *default-log-port* "Gating server start, last start: "
fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
(thread-sleep! reftime)
(server:wait-for-server-start-last-flag areapath)))))))
(define (server:get-num-servers #!key (numservers 2))
(let ((ns (string->number
(or (configf:lookup *configdat* "server" "numservers") "notanumber"))))
(or ns numservers)))
(define (server:kill servr)
(handle-exceptions
exn
(begin
(debug:print-info 0 *default-log-port* "Unable to get host and/or port from " servr ", exn=" exn)
#f)
(match-let (((mod-time hostname port start-time server-id pid)
servr))
(tasks:kill-server hostname pid))))
;; called in megatest.scm, host-port is string hostname:port
;;
;; NOTE: This is NOT called directly from clients as not all transports support a client running
;; in the same process as the server.
;;
;; run ping in separate process, safest way in some cases
;;
(define (server:ping-server ifaceport)
(with-input-from-pipe
(conc (common:get-megatest-exe) " -ping " ifaceport)
(lambda ()
(let loop ((inl (read-line))
(res "NOREPLY"))
(if (eof-object? inl)
(case (string->symbol res)
((NOREPLY) #f)
((LOGIN_OK) #t)
(else #f))
(loop (read-line) inl))))))
;; NOT USED (well, ok, reference in rpc-transport but otherwise not used).
;;
(define (server:login toppath)
(lambda (toppath)
(set! *db-last-access* (current-seconds)) ;; might not be needed.
(if (equal? *toppath* toppath)
#t
#f)))
;; timeout is hms string: 1h 5m 3s, default is 1 minute
;;
(define (server:expiration-timeout)
(let ((tmo (configf:lookup *configdat* "server" "timeout")))
(if (and (string? tmo)
(common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below
(* 3600 (string->number tmo))
60)))
;; (define server:sync-lock-token "SERVER_SYNC_LOCK")
;; (define (server:release-sync-lock)
;; (db:no-sync-del! *no-sync-db* server:sync-lock-token))
;; (define (server:have-sync-lock?)
;; (let* ((have-lock-pair (db:no-sync-get-lock *no-sync-db* server:sync-lock-token))
;; (have-lock? (car have-lock-pair))
;; (lock-time (cdr have-lock-pair))
;; (lock-age (- (current-seconds) lock-time)))
;; (cond
;; (have-lock? #t)
;; ((>lock-age
;; (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180)))
;; (server:release-sync-lock)
;; (server:have-sync-lock?))
;; (else #f))))
;; (define (debug:print . params) #f)
;; (define (debug:print-info . params) #f)
;;
;; (define (set-functions dbgp dbgpinfo)
;; (set! debug:print dbgp)
;; (set! debug:print-info dbgpinfo))
;; (define *server-loop-heart-beat* (current-seconds))
;;======================================================================
;; P K T S S T U F F
;;======================================================================
;; ???
;;======================================================================
;; P K T S S T U F F
;;======================================================================
;; ???
;;======================================================================
;; S E R V E R
;;======================================================================
;; Call this to start the actual server
;;
;; all routes though here end in exit ...
;;
;; start_server
;;
(define (server:launch run-id transport-type)
(case transport-type
((http)(http-transport:launch))
;;((nmsg)(nmsg-transport:launch run-id))
;;((rpc) (rpc-transport:launch run-id))
(else (debug:print-error 0 *default-log-port* "unknown server type " transport-type))))
;; Given a run id start a server process ### NOTE ### > file 2>&1
;; if the run-id is zero and the target-host is set
;; try running on that host
;; incidental: rotate logs in logs/ dir.
;;
(define (server:run areapath) ;; areapath is *toppath* for a given testsuite area
(let* ((curr-host (get-host-name))
;; (attempt-in-progress (server:start-attempted? areapath))
;; (dot-server-url (server:check-if-running areapath))
(curr-ip (server:get-best-guess-address curr-host))
(curr-pid (current-process-id))
(homehost (common:get-homehost)) ;; configf:lookup *configdat* "server" "homehost" ))
(target-host (car homehost))
(testsuite (common:get-testsuite-name))
(logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
(profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
""))
(cmdln (conc (common:get-megatest-exe)
" -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
" -daemonize "
"")
;; " -log " logfile
" -m testsuite:" testsuite
" " profile-mode
)) ;; (conc " >> " logfile " 2>&1 &")))))
(log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!?
(load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0)))
;; we want the remote server to start in *toppath* so push there
(push-directory areapath)
(debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...")
(thread-start! log-rotate)
;; host.domain.tld match host?
(if (and target-host
;; look at target host, is it host.domain.tld or ip address and does it
;; match current ip or hostname
(not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host))
(not (equal? curr-ip target-host)))
(begin
(debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
(setenv "TARGETHOST" target-host)))
(setenv "TARGETHOST_LOGF" logfile)
(thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
#;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit))
(system (conc "nbfake " cmdln))
(unsetenv "TARGETHOST_LOGF")
(if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
(thread-join! log-rotate)
(pop-directory)))
(define (server:ping host-port-in server-id #!key (do-exit #f))
(let ((host:port (if (not host-port-in) ;; use read-dotserver to find
#f ;; (server:check-if-running *toppath*)
;; (if (number? host-port-in) ;; we were handed a server-id
;; (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in)))
;; ;; (print "srec: " srec " host-port-in: " host-port-in)
;; (if srec
;; (conc (vector-ref srec 3) ":" (vector-ref srec 4))
;; (conc "no such server-id " host-port-in)))
host-port-in))) ;; )
(let* ((host-port (if host:port
(let ((slst (string-split host:port ":")))
(if (eq? (length slst) 2)
(list (car slst)(string->number (cadr slst)))
#f))
#f)))
;; (toppath (launch:setup)))
;; (print "host-port=" host-port)
(if (not host-port)
(begin
(if host-port-in
(debug:print 0 *default-log-port* "ERROR: bad host:port"))
(if do-exit (exit 1))
#f)
(let* ((iface (car host-port))
(port (cadr host-port))
(server-dat (http-transport:client-connect iface port server-id))
(login-res (rmt:login-no-auto-client-setup server-dat)))
(if (and (list? login-res)
(car login-res))
(begin
;; (print "LOGIN_OK")
(if do-exit (exit 0))
#t)
(begin
;; (print "LOGIN_FAILED")
(if do-exit (exit 1))
#f)))))))
;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
;;
;; this one seems to be the general entry point
;;
(define (server:start-and-wait areapath #!key (timeout 60))
(let ((give-up-time (+ (current-seconds) timeout)))
(let loop ((server-info (server:check-if-running areapath))
(try-num 0))
(if (or server-info
(> (current-seconds) give-up-time)) ;; server-url will be #f if no server available.
(server:record->url server-info)
(let ((num-ok (length (server:get-best (server:get-list areapath)))))
(if (and (> try-num 0) ;; first time through simply wait a little while then try again
(< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one
(server:kind-run areapath))
(thread-sleep! 5)
(loop (server:check-if-running areapath)
(+ try-num 1)))))))
;; no longer care if multiple servers are started by accident. older servers will drop off in time.
;;
(define (server:check-if-running areapath) ;; #!key (numservers "2"))
(let* ((ns (server:get-num-servers))
(servers (server:get-best (server:get-list areapath))))
(if (or (and servers
(null? servers))
(not servers)
(and (list? servers)
(< (length servers) (random ns)))) ;; somewhere between 0 and numservers
#f
(let loop ((hed (car servers))
(tal (cdr servers)))
(let ((res (server:check-server hed)))
(if res
hed
(if (null? tal)
#f
(loop (car tal)(cdr tal)))))))))
;; ping the given server
;;
(define (server:check-server server-record)
(let* ((server-url (server:record->url server-record))
(server-id (server:record->id server-record))
(res (case *transport-type*
((http)(server:ping server-url server-id))
;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server)
)))
(if res
server-url
#f)))
(define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG.
;;======================================================================
;; REMANANTS OF HTTP_TRANSPORT
;;======================================================================
(define *http-mutex* (make-mutex))
;; NOTE: Large block of code from 32436b426188080f72fceb6894af541fbad9921e removed here
;; I'm pretty sure it is defunct.
;; This next block all imported en-mass from the api branch
(define *http-requests-in-progress* 0)
(define *http-connections-next-cleanup* (current-seconds))
(define (http-transport:get-time-to-cleanup)
(let ((res #f))
(mutex-lock! *http-mutex*)
(set! res (> (current-seconds) *http-connections-next-cleanup*))
(mutex-unlock! *http-mutex*)
res))
(define (http-transport:inc-requests-count)
(mutex-lock! *http-mutex*)
(set! *http-requests-in-progress* (+ 1 *http-requests-in-progress*))
;; Use this opportunity to slow things down iff there are too many requests in flight
(if (> *http-requests-in-progress* 5)
(begin
(debug:print-info 0 *default-log-port* "Whoa there buddy, ease up...")
(thread-sleep! 1)))
(mutex-unlock! *http-mutex*))
(define (http-transport:dec-requests-count proc)
(mutex-lock! *http-mutex*)
(proc)
(set! *http-requests-in-progress* (- *http-requests-in-progress* 1))
(mutex-unlock! *http-mutex*))
(define (http-transport:dec-requests-count-and-close-all-connections)
(set! *http-requests-in-progress* (- *http-requests-in-progress* 1))
(let loop ((etime (+ (current-seconds) 5))) ;; give up in five seconds
(if (> *http-requests-in-progress* 0)
(if (> etime (current-seconds))
(begin
(thread-sleep! 0.05)
(loop etime))
(debug:print-error 0 *default-log-port* "requests still in progress after 5 seconds of waiting. I'm going to pass on cleaning up http connections"))
(close-all-connections!)))
(set! *http-connections-next-cleanup* (+ (current-seconds) 10))
(mutex-unlock! *http-mutex*))
(define (http-transport:inc-requests-and-prep-to-close-all-connections)
(mutex-lock! *http-mutex*)
(set! *http-requests-in-progress* (+ 1 *http-requests-in-progress*)))
;; Send "cmd" with json payload "params" to serverdat and receive result
;;
(define (http-transport:client-api-send-receive run-id serverdat cmd params #!key (numretries 3)(area-dat #f))
(let* ((fullurl (if (vector? serverdat)
(http-transport:server-dat-get-api-req serverdat)
(begin
(debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info")
(exit 1))))
(res (vector #f "uninitialized"))
(success #t)
(sparams (db:obj->string params transport: 'http))
(runremote (or area-dat *runremote*))
(server-id (if (vector? serverdat)
(http-transport:server-dat-get-server-id serverdat)
(begin
(debug:print 0 *default-log-port* "FATAL ERROR: http-transport:client-api-send-receive called with no server info")
(exit 1)))))
(debug:print-info 11 *default-log-port* "cmd=" cmd " fullurl=" fullurl " server-id=" server-id " current time:" (current-seconds))
;; set up the http-client here
(max-retry-attempts 1)
;; consider all requests indempotent
(retry-request? (lambda (request)
#f))
;; send the data and get the response
;; extract the needed info from the http data and
;; process and return it.
(let* ((send-recieve (lambda ()
(mutex-lock! *http-mutex*)
;; (condition-case (with-input-from-request "http://localhost"; #f read-lines)
;; ((exn http client-error) e (print e)))
(set! res (vector ;;; DON'T FORGET - THIS IS THE CLIENT SIDE! NOTE: consider moving this to client.scm since we are only supporting http transport at this time.
success
(db:string->obj
(handle-exceptions
exn
(let ((call-chain (get-call-chain))
(msg ((condition-property-accessor 'exn 'message) exn)))
(set! success #f)
(if (debug:debug-mode 1)
(debug:print-info 0 *default-log-port* "couldn't talk to server, trying again ...")
(begin
(debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".")
(debug:print 0 *default-log-port* " message: " msg ", exn=" exn)
(debug:print 0 *default-log-port* " cmd: " cmd " params: " params " key:" (or server-id "thekey"))
(debug:print 0 *default-log-port* " call-chain: " call-chain)))
(if runremote
(remote-conndat-set! runremote #f))
;; Killing associated server to allow clean retry.")
;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine?
(mutex-unlock! *http-mutex*)
;;; (signal (make-composite-condition
;;; (make-property-condition 'commfail 'message "failed to connect to server")))
;;; "communications failed"
(db:obj->string #f))
(with-input-from-request ;; was dat
fullurl
(list (cons 'key (or server-id "thekey"))
(cons 'cmd cmd)
(cons 'params sparams))
read-string))
transport: 'http)
0)) ;; added this speculatively
;; Shouldn't this be a call to the managed call-all-connections stuff above?
(close-all-connections!)
(mutex-unlock! *http-mutex*)
))
(time-out (lambda ()
(thread-sleep! 45)
(debug:print 0 *default-log-port* "WARNING: send-receive took more than 45 seconds!!")
#f))
(th1 (make-thread send-recieve "with-input-from-request"))
(th2 (make-thread time-out "time out")))
(thread-start! th1)
(thread-start! th2)
(thread-join! th1)
(vector-set! res 0 success)
(thread-terminate! th2)
(if (vector? res)
(if (vector-ref res 0) ;; this is the first flag or the second flag?
(let* ((res-dat (vector-ref res 1)))
(if (and (string? res-dat) (string-contains res-dat "server-id mismatch"))
(signal (make-composite-condition
(make-property-condition
'servermismatch
'message (vector-ref res 1))))
res)) ;; this is the *inner* vector? seriously? why?
(if (debug:debug-mode 11)
(let ((call-chain (get-call-chain))) ;; note: this code also called in nmsg-transport - consider consolidating it
(print-call-chain (current-error-port))
(debug:print-error 11 *default-log-port* "error above occured at server, res=" res) ;; " message: " ((condition-property-accessor 'exn 'message) exn))
(debug:print 11 *default-log-port* " server call chain:")
(pp (vector-ref res 1) (current-error-port))
(signal (vector-ref res 0)))
res))
(signal (make-composite-condition
(make-property-condition
'timeout
'message "nmsg-transport:client-api-send-receive-raw timed out talking to server")))))))
;; careful closing of connections stored in *runremote*
;;
(define (http-transport:close-connections #!key (area-dat #f))
(let* ((runremote (or area-dat *runremote*))
(server-dat (if runremote
(remote-conndat runremote)
#f))) ;; (hash-table-ref/default *runremote* run-id #f)))
(if (vector? server-dat)
(let ((api-dat (http-transport:server-dat-get-api-uri server-dat)))
(handle-exceptions
exn
(begin
(print-call-chain *default-log-port*)
(debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn))
(close-connection! api-dat)
;;(close-idle-connections!)
#t))
#f)))
(define (make-http-transport:server-dat)(make-vector 6))
(define (http-transport:server-dat-get-iface vec) (vector-ref vec 0))
(define (http-transport:server-dat-get-port vec) (vector-ref vec 1))
(define (http-transport:server-dat-get-api-uri vec) (vector-ref vec 2))
(define (http-transport:server-dat-get-api-url vec) (vector-ref vec 3))
(define (http-transport:server-dat-get-api-req vec) (vector-ref vec 4))
(define (http-transport:server-dat-get-last-access vec) (vector-ref vec 5))
;(define (http-transport:server-dat-get-socket vec) (vector-ref vec 6))
(define (http-transport:server-dat-get-server-id vec) (vector-ref vec 6))
(define (http-transport:server-dat-make-url vec)
(if (and (http-transport:server-dat-get-iface vec)
(http-transport:server-dat-get-port vec))
(conc "http://"
(http-transport:server-dat-get-iface vec)
":"
(http-transport:server-dat-get-port vec))
#f))
(define (http-transport:server-dat-update-last-access vec)
(if (vector? vec)
(vector-set! vec 5 (current-seconds))
(begin
(print-call-chain (current-error-port))
(debug:print-error 0 *default-log-port* "call to http-transport:server-dat-update-last-access with non-vector!!"))))
;;
;; connect
;;
(define (http-transport:client-connect iface port server-id)
(let* ((api-url (conc "http://" iface ":" port "/api"))
(api-uri (uri-reference (conc "http://" iface ":" port "/api")))
(api-req (make-request method: 'POST uri: api-uri))
(server-dat (vector iface port api-uri api-url api-req (current-seconds) server-id)))
server-dat))
;; run http-transport:keep-running in a parallel thread to monitor that the db is being
;; used and to shutdown after sometime if it is not.
;;
(define (http-transport:keep-running)
;; if none running or if > 20 seconds since
;; server last used then start shutdown
;; This thread waits for the server to come alive
(debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server")
(let* ((sdat #f)
(tmp-area (common:get-db-tmp-area))
(started-file (conc tmp-area "/.server-started"))
(server-start-time (current-seconds))
(server-info (let loop ((start-time (current-seconds))
(changed #t)
(last-sdat "not this"))
(begin ;; let ((sdat #f))
(thread-sleep! 0.01)
(debug:print-info 0 *default-log-port* "Waiting for server alive signature")
(mutex-lock! *heartbeat-mutex*)
(set! sdat *server-info*)
(mutex-unlock! *heartbeat-mutex*)
(if (and sdat
(not changed)
(> (- (current-seconds) start-time) 2))
(begin
(debug:print-info 0 *default-log-port* "Received server alive signature")
(common:save-pkt `((action . alive)
(T . server)
(pid . ,(current-process-id))
(ipaddr . ,(car sdat))
(port . ,(cadr sdat)))
*configdat* #t)
sdat)
(begin
(debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat)
(sleep 4)
(if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
(begin
(debug:print-error 0 *default-log-port* "transport appears to have died, exiting server")
(common:save-pkt `((action . died)
(T . server)
(pid . ,(current-process-id))
(ipaddr . ,(car sdat))
(port . ,(cadr sdat))
(msg . "Transport died?"))
*configdat* #t)
(exit))
(loop start-time
(equal? sdat last-sdat)
sdat)))))))
(iface (car server-info))
(port (cadr server-info))
(last-access 0)
(server-timeout (server:expiration-timeout))
(server-going #f)
(server-log-file (args:get-arg "-log"))) ;; always set when we are a server
(handle-exceptions
exn
(debug:print 0 *default-log-port* "Failed to create " started-file ", exn=" exn)
(with-output-to-file started-file (lambda ()(print (current-process-id)))))
(let loop ((count 0)
(server-state 'available)
(bad-sync-count 0)
(start-time (current-milliseconds)))
;; Use this opportunity to sync the tmp db to megatest.db
(if (not server-going) ;; *dbstruct-db*
(begin
(debug:print 0 *default-log-port* "SERVER: dbprep")
(set! *dbstruct-db* (db:setup #t)) ;; run-id))
(set! server-going #t)
(debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version)) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine.
(thread-start! *watchdog*)))
;; when things go wrong we don't want to be doing the various queries too often
;; so we strive to run this stuff only every four seconds or so.
(let* ((sync-time (- (current-milliseconds) start-time))
(rem-time (quotient (- 4000 sync-time) 1000)))
(if (and (<= rem-time 4)
(> rem-time 0))
(thread-sleep! rem-time)))
(if (< count 1) ;; 3x3 = 9 secs aprox
(loop (+ count 1) 'running bad-sync-count (current-milliseconds)))
;; Check that iface and port have not changed (can happen if server port collides)
(mutex-lock! *heartbeat-mutex*)
(set! sdat *server-info*)
(mutex-unlock! *heartbeat-mutex*)
(if (not (equal? sdat (list iface port)))
(let ((new-iface (car sdat))
(new-port (cadr sdat)))
(debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info")
(set! iface new-iface)
(set! port new-port)
(if (not *server-id*)
(set! *server-id* (server:mk-signature)))
(debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv))
(debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*)
(flush-output *default-log-port*)))
;; Transfer *db-last-access* to last-access to use in checking that we are still alive
(mutex-lock! *heartbeat-mutex*)
(set! last-access *db-last-access*)
(mutex-unlock! *heartbeat-mutex*)
(if (common:low-noise-print 120 (conc "server running on " iface ":" port))
(begin
(if (not *server-id*)
(set! *server-id* (server:mk-signature)))
(debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv))
(debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*)
(flush-output *default-log-port*)))
(if (common:low-noise-print 60 "dbstats")
(begin
(debug:print 0 *default-log-port* "Server stats:")
(db:print-current-query-stats)))
(let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600)))
(cond
((and *server-run*
(> (+ last-access server-timeout)
(current-seconds)))
(if (common:low-noise-print 120 "server continuing")
(debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
(let ((curr-time (current-seconds)))
(handle-exceptions
exn
(debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on log file " server-log-file ". Are you out of space on that disk? exn=" exn)
(if (not *server-overloaded*)
(change-file-times server-log-file curr-time curr-time)))))
(loop 0 server-state bad-sync-count (current-milliseconds)))
(else
(debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access))
(http-transport:server-shutdown port)))))))
(define (http-transport:server-shutdown port)
(begin
;;(BB> "http-transport:server-shutdown called")
(debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id))
;;
;; start_shutdown
;;
(set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up
(portlogger:open-run-close portlogger:set-port port "released")
(thread-sleep! 1)
;; (debug:print-info 0 *default-log-port* "Max cached queries was " *max-cache-size*)
;; (debug:print-info 0 *default-log-port* "Number of cached writes " *number-of-writes*)
;; (debug:print-info 0 *default-log-port* "Average cached write time "
;; (if (eq? *number-of-writes* 0)
;; "n/a (no writes)"
;; (/ *writes-total-delay*
;; *number-of-writes*))
;; " ms")
;; (debug:print-info 0 *default-log-port* "Number non-cached queries " *number-non-write-queries*)
;; (debug:print-info 0 *default-log-port* "Average non-cached time "
;; (if (eq? *number-non-write-queries* 0)
;; "n/a (no queries)"
;; (/ *total-non-write-delay*
;; *number-non-write-queries*))
;; " ms")
(db:print-current-query-stats)
(common:save-pkt `((action . exit)
(T . server)
(pid . ,(current-process-id)))
*configdat* #t)
(debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting")
(exit)))
;; all routes though here end in exit ...
;;
;; start_server?
;;
(define (http-transport:launch)
;; check that a server start is in progress, pause or exit if so
(let* ((tmp-area (common:get-db-tmp-area))
(server-start (conc tmp-area "/.server-start"))
(server-started (conc tmp-area "/.server-started"))
(start-time (common:lazy-modification-time server-start))
(started-time (common:lazy-modification-time server-started))
(server-starting (< start-time started-time)) ;; if start-time is less than started-time then a server is still starting
(start-time-old (> (- (current-seconds) start-time) 5))
(cleanup-proc (lambda (msg)
(let* ((serv-fname (conc "server-" (current-process-id) "-" (get-host-name) ".log"))
(full-serv-fname (conc *toppath* "/logs/" serv-fname))
(new-serv-fname (conc *toppath* "/logs/" "defunct-" serv-fname)))
(debug:print 0 *default-log-port* msg)
(if (common:file-exists? full-serv-fname)
(system (conc "sleep 1;mv -f " full-serv-fname " " new-serv-fname))
(debug:print 0 *default-log-port* "INFO: cannot move " full-serv-fname " to " new-serv-fname))
(exit)))))
#;(if (and (not start-time-old) ;; last server start try was less than five seconds ago
(not server-starting))
(begin
(cleanup-proc "NOT starting server, there is either a recently started server or a server in process of starting")
(exit)))
;; lets not even bother to start if there are already three or more server files ready to go
#;(let* ((num-alive (server:get-num-alive (server:get-list *toppath*))))
(if (> num-alive 3)
(begin
(cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up"))
(exit))))
(common:save-pkt `((action . start)
(T . server)
(pid . ,(current-process-id)))
*configdat* #t)
(let* ((th2 (make-thread (lambda ()
(debug:print-info 0 *default-log-port* "Server run thread started")
(http-transport:run
(if (args:get-arg "-server")
(args:get-arg "-server")
"-")
)) "Server run"))
(th3 (make-thread (lambda ()
(debug:print-info 0 *default-log-port* "Server monitor thread started")
(http-transport:keep-running)
"Keep running"))))
(thread-start! th2)
(thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor.
(thread-start! th3)
(set! *didsomething* #t)
(thread-join! th2)
(exit))))
;; (define (http-transport:server-signal-handler signum)
;; (signal-mask! signum)
;; (handle-exceptions
;; exn
;; (debug:print 0 *default-log-port* " ... exiting ...")
;; (let ((th1 (make-thread (lambda ()
;; (thread-sleep! 1))
;; "eat response"))
;; (th2 (make-thread (lambda ()
;; (debug:print-error 0 *default-log-port* "Received ^C, attempting clean exit. Please be patient and wait a few seconds before hitting ^C again.")
;; (thread-sleep! 3) ;; give the flush three seconds to do it's stuff
;; (debug:print 0 *default-log-port* " Done.")
;; (exit 4))
;; "exit on ^C timer")))
;; (thread-start! th2)
;; (thread-start! th1)
;; (thread-join! th2))))
;;======================================================================
;; faux-lock is deprecated. Please use simple-lock below
;;======================================================================
;;======================================================================
;;
(define (common:faux-lock keyname #!key (wait-time 8)(allow-lock-steal #t))
(if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count
(if (> wait-time 0)
(begin
(thread-sleep! 1)
(if (eq? wait-time 1) ;; only one second left, steal the lock
(begin
(debug:print-info 0 *default-log-port* "stealing lock for " keyname)
(common:faux-unlock keyname force: #t)))
(common:faux-lock keyname wait-time: (- wait-time 1)))
#f)
(begin
(rmt:no-sync-set keyname (conc (current-process-id)))
(equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f))))))
(define (common:faux-unlock keyname #!key (force #f))
(if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f))))
(begin
(if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname))
#t)
#f))
;;======================================================================
;;======================================================================
;; simple lock. improve and converge on this one.
;;
(define (common:simple-lock keyname)
(rmt:no-sync-get-lock keyname))
(define (common:simple-unlock keyname #!key (force #f))
(rmt:no-sync-del! keyname))
;;======================================================================
;; ideally put all this info into the db, no need to preserve it across moving homehost
;;
;; return list of
;; ( reachable? cpuload update-time )
(define (common:get-host-info hostname)
(let* ((loadinfo (rmt:get-latest-host-load hostname)) ;; if this host happens to have been recently used by a test reuse the load data
(load (car loadinfo))
(load-sample-time (cdr loadinfo))
(load-sample-age (- (current-seconds) load-sample-time))
(loadinfo-timeout-seconds 6) ;; this was 20 seconds, seems way too lax. Switch to 6 seconds
(host-last-update-timeout-seconds 4)
(host-rec (hash-table-ref/default *host-loads* hostname #f))
)
(cond
((< load-sample-age loadinfo-timeout-seconds)
(list #t
load-sample-time
load))
((and host-rec
(< (current-seconds) (+ (host-last-update host-rec) host-last-update-timeout-seconds)))
(list #t
(host-last-update host-rec)
(host-last-cpuload host-rec )))
((common:unix-ping hostname)
(list #t
(current-seconds)
(alist-ref 'adj-core-load (common:get-normalized-cpu-load hostname)))) ;; this is cheaper than you might think. get-normalized-cpu-load is cached for up to 5 seconds
(else
(list #f 0 -1) ;; bad host, don't use!
))))
(define (std-exit-procedure)
;;(common:telemetry-log-close)
(on-exit (lambda () 0))
;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
(let ((no-hurry (if *time-to-exit* ;; hurry up
#f
(begin
(set! *time-to-exit* #t)
#t))))
(debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.")
(if (and no-hurry (debug:debug-mode 18))
(rmt:print-db-stats))
(let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds
(if *dbstruct-db* (db:close-all *dbstruct-db*)) ;; one second allocated
(if *task-db*
(let ((db (cdr *task-db*)))
(if (sqlite3:database? db)
(begin
(sqlite3:interrupt! db)
(sqlite3:finalize! db #t)
;; (vector-set! *task-db* 0 #f)
(set! *task-db* #f)))))
(http-client#close-all-connections!)
;; (if (and *runremote*
;; (remote-conndat *runremote*))
;; (begin
;; (http-client#close-all-connections!))) ;; for http-client
(if (not (eq? *default-log-port* (current-error-port)))
(close-output-port *default-log-port*))
(set! *default-log-port* (current-error-port))) "Cleanup db exit thread"))
(th2 (make-thread (lambda ()
(debug:print 4 *default-log-port* "Attempting clean exit. Please be patient and wait a few seconds...")
(if no-hurry
(begin
(thread-sleep! 5)) ;; give the clean up few seconds to do it's stuff
(begin
(thread-sleep! 2)))
(debug:print 4 *default-log-port* " ... done")
)
"clean exit")))
(thread-start! th1)
(thread-start! th2)
(thread-join! th1)
)
)
0)
;;======================================================================
;; from metadat lookup MEGATEST_VERSION
;;
(define (common:get-last-run-version) ;; RADT => How does this work in send-receive function??; assume it is the value saved in some DB
(rmt:get-var "MEGATEST_VERSION"))
(define (common:set-last-run-version)
(rmt:set-var "MEGATEST_VERSION" (common:version-signature)))
(define (common:get-last-run-version-number)
(string->number
(substring (common:get-last-run-version) 0 6)))
(define (common:version-db-delta)
(- megatest-version (common:get-last-run-version-number)))
(define (common:version-changed?)
(not (equal? (common:get-last-run-version)
(common:version-signature))))
(define (common:api-changed? dbstruct)
(not (equal? (substring (->string megatest-version) 0 4)
(substring (conc (common:get-last-run-version)) 0 4))))
;;======================================================================
;; see defstruct host at top of file.
;; host: reachable last-update last-used last-cpuload
;;
(define (common:update-host-loads-table hosts-raw)
(let* ((hosts (filter (lambda (x)
(string-match (regexp "^\\S+$") x))
hosts-raw)))
(for-each
(lambda (hostname)
(let* ((rec (let ((h (hash-table-ref/default *host-loads* hostname #f)))
(if h
h
(let ((h (make-host)))
(hash-table-set! *host-loads* hostname h)
h))))
(host-info (common:get-host-info hostname))
(is-reachable (car host-info))
(last-reached-time (cadr host-info))
(load (caddr host-info)))
(host-reachable-set! rec is-reachable)
(host-last-update-set! rec last-reached-time)
(host-last-cpuload-set! rec load)))
hosts)))
;;======================================================================
;; go through the hosts from least recently used to most recently used, pick the first that meets the load criteral from the
;; [host-rules] section.
;;
(define (common:get-least-loaded-host hosts-raw host-type configdat)
(let* ((rdat (configf:lookup configdat "host-rules" host-type))
(rules (common:val->alist (or rdat "") convert: #t)) ;; maxnload, maxnjobs, maxjobrate
(maxnload (common:alist-ref/default 'maxnload rules 1.5)) ;; max normalized load
(maxnjobs (common:alist-ref/default 'maxnjobs rules 1.5)) ;; max normalized number of jobs
(maxjobrate (common:alist-ref/default 'maxjobrate rules (/ 1 6))) ;; max rate of submitting jobs to a given host in jobs/second
(hosts (filter (lambda (x)
(string-match (regexp "^\\S+$") x))
hosts-raw))
;; (best-host #f)
(get-rec (lambda (hostname)
;; (print "get-rec hostname=" hostname)
(let ((h (hash-table-ref/default *host-loads* hostname #f)))
(if h
h
(let ((h (make-host)))
(hash-table-set! *host-loads* hostname h)
h)))))
(best-load 99999)
(curr-time (current-seconds))
(get-hosts-sorted (lambda (hosts)
(sort hosts (lambda (a b)
(let ((a-rec (get-rec a))
(b-rec (get-rec b)))
;; (print "a=" a " a-rec=" a-rec " host-last-used=" (host-last-used a-rec))
;; (print "b=" b " b-rec=" b-rec " host-last-used=" (host-last-used b-rec))
(< (host-last-used a-rec)
(host-last-used b-rec))))))))
(debug:print 0 *default-log-port* "INFO: hosts-sorted=" (get-hosts-sorted hosts))
(if (null? hosts)
#f ;; no hosts to select from. All done and giving up now.
(let ((hosts-sorted (get-hosts-sorted hosts)))
(common:update-host-loads-table hosts)
(let loop ((hostname (car hosts-sorted))
(tal (cdr hosts-sorted))
(best-host #f))
(let* ((rec (get-rec hostname))
(reachable (host-reachable rec))
(load (host-last-cpuload rec))
(last-used (host-last-used rec))
(delta (- curr-time last-used))
(job-rate (if (> delta 0)
(/ 1 delta)
999)) ;; jobs per second
(new-best
(cond
((not reachable)
(debug:print 0 *default-log-port* "Skipping host " hostname " as it cannot be reached.")
best-host)
((and (< load maxnload) ;; load is acceptable
(< job-rate maxjobrate)) ;; job rate is acceptable
(set! best-load load)
hostname)
(else best-host))))
(debug:print 0 *default-log-port* "INFO: Trying host " hostname " with load " load ", last used " delta " seconds ago, with job-rate " job-rate " for running a test." )
(if new-best
(begin ;; found a host, return it
(debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
(host-last-used-set! rec curr-time)
new-best)
(if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))
;;======================================================================
;; T E S T L A U N C H I N G P E R I T E M W I T H H O S T T Y P E S
;;======================================================================
;;
;; [hosts]
;; arm cubie01 cubie02
;; x86_64 zeus xena myth01
;; allhosts #{g hosts arm} #{g hosts x86_64}
;;
;; [host-types]
;; general #MTLOWESTLOAD #{g hosts allhosts}
;; arm #MTLOWESTLOAD #{g hosts arm}
;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo
;;
;; [host-rules]
;; # maxnload => max normalized load
;; # maxnjobs => max jobs per cpu
;; # maxjobrate => max jobs per second
;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1
;;
;; [launchers]
;; envsetup general
;; xor/%/n 4C16G
;; % nbgeneral
;;
;; [jobtools]
;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match.
;; flexi-launcher yes
;; launcher nbfake
;;
(define (common:get-launcher configdat testname itempath)
(let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher")))
(if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher
(not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no")))
(let* ((launchers (hash-table-ref/default configdat "launchers" '())))
(if (null? launchers)
fallback-launcher
(let loop ((hed (car launchers))
(tal (cdr launchers)))
(let ((patt (car hed))
(host-type (cadr hed)))
(if (tests:match patt testname itempath)
(begin
(debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type)
(let ((launcher (configf:lookup configdat "host-types" host-type)))
(if launcher
(let* ((launcher-parts (string-split launcher))
(launcher-exe (car launcher-parts)))
(if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline
(let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat))
(count 100))
(if targ-host
(conc "remrun " targ-host)
(if (> count 0)
(begin
(debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type)
(thread-sleep! (- 101 count))
(host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)
(- count 1)))
(begin
(debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type)
(exit)))))
launcher))
(begin
(debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type)
(if (null? tal)
fallback-launcher
(loop (car tal)(cdr tal)))))))
;; no match, try again
(if (null? tal)
fallback-launcher
(loop (car tal)(cdr tal))))))))
fallback-launcher)))
;;======================================================================
;; everything from client moved here
;;======================================================================
;; client:get-signature
(define (client:get-signature)
(if *my-client-signature* *my-client-signature*
(let ((sig (conc (get-host-name) " " (current-process-id))))
(set! *my-client-signature* sig)
*my-client-signature*)))
;; Not currently used! But, I think it *should* be used!!!
#;(define (client:logout serverdat)
(let ((ok (and (socket? serverdat)
(cdb:logout serverdat *toppath* (client:get-signature)))))
ok))
#;(define (client:connect iface port)
(http-transport:client-connect iface port)
#;(case (server:get-transport)
((rpc) (rpc:client-connect iface port))
((http) (http:client-connect iface port))
((zmq) (zmq:client-connect iface port))
(else (rpc:client-connect iface port))))
(define (client:setup areapath #!key (remaining-tries 100) (failed-connects 0))
(client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects)
#;(case (server:get-transport)
((rpc) (rpc-transport:client-setup remaining-tries: remaining-tries failed-connects: failed-connects)) ;;(client:setup-rpc run-id))
((http)(client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects))
(else (rpc-transport:client-setup remaining-tries: remaining-tries failed-connects: failed-connects)))) ;; (client:setup-rpc run-id))))
;; Do all the connection work, look up the transport type and set up the
;; connection if required.
;;
;; There are two scenarios.
;; 1. We are a test manager and we received *transport-type* and *runremote* via cmdline
;; 2. We are a run tests, list runs or other interactive process and we must figure out
;; *transport-type* and *runremote* from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove *runremote* stuff
;;
(define (client:setup-http areapath #!key (remaining-tries 100) (failed-connects 0)(area-dat #f))
(debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries)
(server:start-and-wait areapath)
(if (<= remaining-tries 0)
(begin
(debug:print-error 0 *default-log-port* "failed to start or connect to server")
(exit 1))
;;
;; Alternatively here, we can get the list of candidate servers and work our way
;; through them searching for a good one.
;;
(let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath))
(runremote (or area-dat *runremote*)))
(if (not server-dat) ;; no server found
(client:setup-http areapath remaining-tries: (- remaining-tries 1))
(let ((host (cadr server-dat))
(port (caddr server-dat))
(server-id (caddr (cddr server-dat))))
(debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
(if (and (not area-dat)
(not *runremote*))
(begin
;; POSSIBLE BUG. I removed the full initialization call. mrw
(set! *runremote* (create-remote-record))
(let* ((server-info (remote-server-info *runremote*)))
(if server-info
(begin
(remote-server-url-set! *runremote* (server:record->url server-info))
(remote-server-id-set! *runremote* (server:record->id server-info)))))))
(if (and host port server-id)
(let* ((start-res (http-transport:client-connect host port server-id))
(ping-res (rmt:login-no-auto-client-setup start-res)))
(if (and start-res
ping-res)
(let ((runremote (or area-dat *runremote*))) ;; it might have been generated only a few statements ago
(remote-conndat-set! runremote start-res) ;; (hash-table-set! runremote run-id start-res)
(debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res))
start-res)
(begin ;; login failed but have a server record, clean out the record and try again
(debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... start-res=" start-res ", server-dat=" server-dat) ;; had runid. Fixes part of Randy;s ticket 1405717332
(case *transport-type*
((http)(http-transport:close-connections)))
(remote-conndat-set! runremote #f) ;; (hash-table-delete! runremote run-id)
(thread-sleep! 1)
(client:setup-http areapath remaining-tries: (- remaining-tries 1))
)))
(begin ;; no server registered
;; (server:kind-run areapath)
(server:start-and-wait areapath)
(debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries)
(thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms.
(client:setup-http areapath remaining-tries: (- remaining-tries 1)))))))))
)