Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -20,26 +20,30 @@ SHELL=/bin/bash PREFIX=$(PWD) CSCOPTS= INSTALL=install SRCFILES = common.scm items.scm launch.scm ods.scm runconfig.scm \ - server.scm configf.scm db.scm keys.scm margs.scm \ - process.scm runs.scm tasks.scm tests.scm genexample.scm \ - http-transport.scm tdb.scm client.scm mt.scm \ - ezsteps.scm lock-queue.scm rmt.scm api.scm \ - subrun.scm portlogger.scm archive.scm env.scm \ - diff-report.scm cgisetup/models/pgdb.scm + configf.scm db.scm keys.scm margs.scm process.scm runs.scm \ + tasks.scm tests.scm genexample.scm tdb.scm mt.scm \ + ezsteps.scm lock-queue.scm api.scm subrun.scm \ + portlogger.scm archive.scm env.scm diff-report.scm \ + cgisetup/models/pgdb.scm + +# server.scm http-transport.scm client.scm rmt.scm # module source files -MSRCFILES = dbfile.scm debugprint.scm mtargs.scm commonmod.scm dbmod.scm +MSRCFILES = dbfile.scm debugprint.scm mtargs.scm commonmod.scm dbmod.scm \ + configfmod.scm servermod.scm clientmod.scm rmtmod.scm \ + artifacts.scm -all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut tcmt +all : $(PREFIX)/bin/.$(ARCHSTR) mtest dboard mtut # dbmod.import.o is just a hack here -mofiles/dbfile.o : mofiles/debugprint.o mofiles/commonmod.o dbmod.import.o +mofiles/dbfile.o : mofiles/debugprint.o mofiles/commonmod.o # dbmod.import.o db.o : dbmod.import.o mofiles/debugprint.o : mofiles/mtargs.o +mofiles/servermod.o : mofiles/artifacts.o # ftail.scm rmtmod.scm commonmod.scm removed # MSRCFILES = ducttape-lib.scm pkts.scm stml2.scm cookie.scm mutils.scm \ # mtargs.scm commonmod.scm dbmod.scm adjutant.scm ulex.scm \ # rmtmod.scm apimod.scm @@ -258,10 +262,22 @@ $(PREFIX)/bin/mtexec : $(PREFIX)/bin/.$(ARCHSTR)/mtexec utils/mk_wrapper utils/mk_wrapper $(PREFIX) mtexec $(PREFIX)/bin/mtexec chmod a+x $(PREFIX)/bin/mtexec +# mtserv + +mtserv: $(OFILES) $(MOFILES) megatest-fossil-hash.scm mtserv.scm + csc $(CSCOPTS) $(OFILES) 
$(MOFILES) mtserv.scm -o mtserv + +$(PREFIX)/bin/.$(ARCHSTR)/mtserv : mtserv + $(INSTALL) mtserv $(PREFIX)/bin/.$(ARCHSTR)/mtserv + +$(PREFIX)/bin/mtserv : $(PREFIX)/bin/.$(ARCHSTR)/mtserv utils/mk_wrapper + utils/mk_wrapper $(PREFIX) mtserv $(PREFIX)/bin/mtserv + chmod a+x $(PREFIX)/bin/mtserv + # tcmt $(PREFIX)/bin/.$(ARCHSTR)/tcmt : tcmt $(INSTALL) tcmt $(PREFIX)/bin/.$(ARCHSTR)/tcmt @@ -364,18 +380,18 @@ $(PREFIX)/bin/.$(ARCHSTR)/mtexec $(PREFIX)/bin/mtexec $(PREFIX)/bin/serialize-env \ $(PREFIX)/bin/nbfind $(PREFIX)/bin/mtrunner $(PREFIX)/bin/viewscreen $(PREFIX)/bin/mt_xterm \ $(PREFIX)/bin/mt-old-to-new.sh $(PREFIX)/bin/mt-new-to-old.sh \ $(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun \ $(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun $(PREFIX)/bin/mtutil \ - $(PREFIX)/bin/tcmt $(PREFIX)/share/db/mt-pg.sql \ + $(PREFIX)/share/db/mt-pg.sql \ $(PREFIX)/share/js/jquery-3.1.0.slim.min.js \ $(PREFIX)/bin/.$(ARCHSTR)/lib/libpangox-1.0.so \ $(PREFIX)/bin/.$(ARCHSTR)/lib/libpangox-1.0.so.0 \ $(PREFIX)/bin/.$(ARCHSTR)/lib/libxcb-xlib.so.0 # $(PREFIX)/bin/.$(ARCHSTR)/ndboard -# $(PREFIX)/bin/newdashboard +# $(PREFIX)/bin/newdashboard $(PREFIX)/bin/tcmt $(PREFIX)/bin/.$(ARCHSTR) : mkdir -p $(PREFIX)/bin/.$(ARCHSTR) mkdir -p $(PREFIX)/bin/.$(ARCHSTR)/lib Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -21,18 +21,19 @@ ;;====================================================================== (use srfi-69 posix) (declare (unit api)) -(declare (uses rmt)) +(declare (uses rmtmod)) (declare (uses db)) (declare (uses dbmod)) (declare (uses dbfile)) (declare (uses tasks)) (import dbmod) (import dbfile) +(import rmtmod) ;; allow these queries through without starting a server ;; (define api:read-only-queries '(get-key-val-pairs @@ -394,43 +395,5 @@ #;(common:telemetry-log (conc "api-out:"(->string cmd)) payload: `((params . ,params) (ok-res . 
#f))) (vector #t res)))))))) -;; http-server send-response -;; api:process-request -;; db:* -;; -;; NB// Runs on the server as part of the server loop -;; -(define (api:process-request dbstruct $) ;; the $ is the request vars proc - (debug:print 4 *default-log-port* "server-id:" *server-id*) - (let* ((cmd ($ 'cmd)) - (paramsj ($ 'params)) - (key ($ 'key)) - (params (db:string->obj paramsj transport: 'http))) ;; incoming data from the POST (or is it a GET?) - (debug:print 4 *default-log-port* "cmd:" cmd " with params " params "key " key) - (if (equal? key *server-id*) - (begin - (set! *api-process-request-count* (+ *api-process-request-count* 1)) - (let* ((resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) - (success (vector-ref resdat 0)) - (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?) - (debug:print 4 *default-log-port* "res:" res) - (if (not success) - (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params)) - (if (> *api-process-request-count* *max-api-process-requests*) - (set! *max-api-process-requests* *api-process-request-count*)) - (set! *api-process-request-count* (- *api-process-request-count* 1)) - ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds - ;; (rmt:dat->json-str - ;; (if (or (string? res) - ;; (list? res) - ;; (number? res) - ;; (boolean? res)) - ;; res - ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res))))) - (db:obj->string res transport: 'http))) - (begin - (debug:print 0 *default-log-port* "Server refused to process request. Server id mismatch. 
recived " key " expected: " *server-id* ".\nOther arguments recived: cmd=" cmd " params = " params) - (db:obj->string (conc "Server refused to process request server-id mismatch: " key ", " *server-id*) transport: 'http))))) - Index: artifacts/artifacts.scm ================================================================== --- artifacts/artifacts.scm +++ artifacts/artifacts.scm @@ -96,10 +96,12 @@ ;; '((foods (fruit . f) ;; (meat . m))))) ;; => "beef" ;; +;; NOTE: We call artifacts "arfs" + (module artifacts ( ;; cards, util and misc ;; sort-cards ;; calc-sha1 @@ -139,10 +141,11 @@ get-value ;; looks up a value given a key in a dartifact flatten-all ;; merge the list of values from a query which includes a artifact into a flat alist <== really useful! check-artifact ;; artifact alists +get-artifact-fname write-alist->artifact read-artifact->alist ;; archive database ;; archive-open-db @@ -199,22 +202,41 @@ ;; new artifacts db with-todays-adb get-all-artifacts refresh-artifacts-db - ) -(import (chicken base) scheme (chicken process) (chicken time posix) +(import scheme) + +(cond-expand + (chicken-5 + (import (chicken base) + (chicken process) (chicken time posix) (chicken io) (chicken file) (chicken pathname) chicken.process-context.posix (chicken string) - (chicken time) (chicken sort) (chicken file posix) (chicken condition) srfi-1 - regex srfi-13 srfi-69 (chicken port) (chicken process-context) - crypt sha1 matchable message-digest sqlite3 typed-records - directory-utils - scsh-process) + (chicken time) (chicken sort) (chicken file posix) (chicken condition) + (chicken port) (chicken process-context) + )) + (chicken-4 + (import chicken + posix + data-structures + extras + ports + files + setup-api + ) + (define file-executable? 
file-execute-access?)) + (else)) + (import srfi-69 srfi-1 + regex srfi-13 srfi-69 + crypt sha1 matchable message-digest sqlite3 typed-records + directory-utils + scsh-process) + ;;====================================================================== ;; DATA MANIPULATION UTILS ;;====================================================================== (define-inline (unescape-data data) @@ -1070,15 +1092,18 @@ ;;====================================================================== ;; Read/write packets to files (convience functions) ;;====================================================================== +(define (get-artifact-fname targdir uuid) + (conc targdir "/" uuid ".artifact")) + ;; write alist to a artifact file ;; (define (write-alist->artifact targdir dat #!key (artifactspec '())(ptype #f)) (let-values (((uuid artifact)(alist->artifact dat artifactspec ptype: ptype))) - (with-output-to-file (conc targdir "/" uuid ".artifact") + (with-output-to-file (get-artifact-fname targdir uuid) (lambda () (print artifact))) uuid)) ;; return the uuid ;; read artifact into alist Index: client.scm ================================================================== --- client.scm +++ client.scm @@ -28,135 +28,136 @@ (declare (uses common)) (declare (uses db)) (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. -(module client -* - -) - -(import client) - -(include "common_records.scm") -(include "db_records.scm") - -;; client:get-signature -(define (client:get-signature) - (if *my-client-signature* *my-client-signature* - (let ((sig (conc (get-host-name) " " (current-process-id)))) - (set! *my-client-signature* sig) - *my-client-signature*))) - -;; Not currently used! But, I think it *should* be used!!! -#;(define (client:logout serverdat) - (let ((ok (and (socket? 
serverdat) - (cdb:logout serverdat *toppath* (client:get-signature))))) - ok)) - -;; Do all the connection work, look up the transport type and set up the -;; connection if required. -;; -;; There are two scenarios. -;; 1. We are a test manager and we received *transport-type* and *runremote* via cmdline -;; 2. We are a run tests, list runs or other interactive process and we must figure out -;; *transport-type* and *runremote* from the monitor.db -;; -;; client:setup -;; -;; lookup_server, need to remove *runremote* stuff -;; - -;;(define (http-transport:server-dat-make-url runremote) -(define (client:get-url runremote) - (if (and (remote-iface runremote) - (remote-port runremote)) - (conc "http://" - (remote-iface runremote) - ":" - (remote-port runremote)) - #f)) - -(define (client:setup areapath runremote #!key (remaining-tries 100) (failed-connects 0)(area-dat #f)) - (mutex-lock! *rmt-mutex*) - (let ((res (client:setup-http areapath runremote remaining-tries: remaining-tries failed-connects: failed-connects area-dat: area-dat))) - (mutex-unlock! *rmt-mutex*) - res)) - -(define (client:setup-http areapath runremote #!key (remaining-tries 100) (failed-connects 0)) - (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) - (server:start-and-wait areapath) - (if (<= remaining-tries 0) - (begin - (debug:print-error 0 *default-log-port* "failed to start or connect to server") - (exit 1)) - ;; - ;; Alternatively here, we can get the list of candidate servers and work our way - ;; through them searching for a good one. - ;; - (let* ((server-dat (server:choose-server areapath 'best))) ;; list host port start-time server-id pid -;; (runremote (or area-dat *runremote*))) - (if (not server-dat) ;; no server found - (begin - (if (< remaining-tries 99)(thread-sleep! 
1)) ;; obviously it needs time - (client:setup-http areapath runremote remaining-tries: (- remaining-tries 1))) - (match server-dat - ((host port start-time server-id pid) - (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) - (if (not runremote) - (begin - ;; Here we are creating a runremote where there was none or it was clobbered with #f - ;; - (set! runremote (make-remote)) - (let* ((server-info (server:check-if-running areapath))) - (remote-server-info-set! runremote server-info) - (if server-info - (begin - (remote-server-url-set! runremote (server:record->url server-info)) - (remote-server-id-set! runremote (server:record->id server-info))))))) - ;; at this point we have a runremote - (if (and host port server-id) - (let* ((nada (client:connect host port server-id runremote)) - (ping-res (rmt:login-no-auto-client-setup runremote))) - (if ping-res - (if runremote - (begin - (debug:print-info 2 *default-log-port* "connected to " (client:get-url runremote)) - runremote) - (client:setup-http areapath runremote remaining-tries: (- remaining-tries 1))) - (begin ;; login failed but have a server record, clean out the record and try again - (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... ping-res=" ping-res ", server-dat=" server-dat) ;; had runid. Fixes part of Randy;s ticket 1405717332 - (http-transport:close-connections runremote) - (thread-sleep! 1) - (client:setup-http areapath runremote remaining-tries: (- remaining-tries 1)) - ))) - (begin ;; no server registered - ;; (server:kind-run areapath) - (server:start-and-wait areapath) - (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) - (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. 
- (client:setup-http areapath runremote remaining-tries: (- remaining-tries 1))))) - (else - (debug:print 0 *default-log-port* "ERROR: malformed server-dat="server-dat))))))) - -;; -;; connect - stored in remote-condat -;; -;; (define (http-transport:client-connect iface port server-id runremote) -(define (client:connect iface port server-id runremote-in) - (let* ((runremote (or runremote-in - (make-runremote)))) - (debug:print-info 2 *default-log-port* "Connecting to server at "iface":"port", id "server-id) - (let* ((api-url (conc "http://" iface ":" port "/api")) - (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) - (api-req (make-request method: 'POST uri: api-uri))) - ;; (server-dat (vector iface port api-uri api-url api-req (current-seconds) server-id))) - (remote-iface-set! runremote iface) - (remote-port-set! runremote port) - (remote-server-id-set! runremote server-id) - (remote-connect-time-set! runremote (current-seconds)) - (remote-last-access-set! runremote (current-seconds)) - (remote-api-url-set! runremote api-url) - (remote-api-uri-set! runremote api-uri) - (remote-api-req-set! runremote api-req) - runremote))) - +;; (module client +;; * +;; +;; ) +;; +;; (import client) +;; +;; (include "common_records.scm") +;; (include "db_records.scm") +;; +;; ;; client:get-signature +;; (define (client:get-signature) +;; (if *my-client-signature* *my-client-signature* +;; (let ((sig (conc (get-host-name) " " (current-process-id)))) +;; (set! *my-client-signature* sig) +;; *my-client-signature*))) +;; +;; ;; Not currently used! But, I think it *should* be used!!! +;; #;(define (client:logout serverdat) +;; (let ((ok (and (socket? serverdat) +;; (cdb:logout serverdat *toppath* (client:get-signature))))) +;; ok)) +;; +;; ;; Do all the connection work, look up the transport type and set up the +;; ;; connection if required. +;; ;; +;; ;; There are two scenarios. +;; ;; 1. 
We are a test manager and we received *transport-type* and *runremote* via cmdline +;; ;; 2. We are a run tests, list runs or other interactive process and we must figure out +;; ;; *transport-type* and *runremote* from the monitor.db +;; ;; +;; ;; client:setup +;; ;; +;; ;; lookup_server, need to remove *runremote* stuff +;; ;; +;; +;; ;;(define (http-transport:server-dat-make-url runremote) +;; (define (client:get-url runremote) +;; (if (and (remote-iface runremote) +;; (remote-port runremote)) +;; (conc "http://" +;; (remote-iface runremote) +;; ":" +;; (remote-port runremote)) +;; #f)) +;; +;; (define (client:setup areapath runremote #!key (remaining-tries 100) (failed-connects 0)(area-dat #f)) +;; (mutex-lock! *rmt-mutex*) +;; (let ((res (client:setup-http areapath runremote remaining-tries: remaining-tries failed-connects: failed-connects area-dat: area-dat))) +;; (mutex-unlock! *rmt-mutex*) +;; res)) +;; +;; (define (client:setup-http areapath runremote #!key (remaining-tries 100) (failed-connects 0)) +;; (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) +;; (server:start-and-wait areapath) +;; (if (<= remaining-tries 0) +;; (begin +;; (debug:print-error 0 *default-log-port* "failed to start or connect to server") +;; (exit 1)) +;; ;; +;; ;; Alternatively here, we can get the list of candidate servers and work our way +;; ;; through them searching for a good one. +;; ;; +;; (let* ((server-dat (server:choose-server areapath 'best))) ;; list host port start-time server-id pid +;; ;; (runremote (or area-dat *runremote*))) +;; (if (not server-dat) ;; no server found +;; (begin +;; (if (< remaining-tries 99)(thread-sleep! 
1)) ;; obviously it needs time +;; (client:setup-http areapath runremote remaining-tries: (- remaining-tries 1))) +;; (match server-dat +;; ((host port start-time server-id pid) +;; (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) +;; (if (not runremote) +;; (begin +;; ;; Here we are creating a runremote where there was none or it was clobbered with #f +;; ;; +;; (set! runremote (make-remote)) +;; (let* ((server-info (server:check-if-running areapath))) +;; (remote-server-info-set! runremote server-info) +;; (if server-info +;; (begin +;; (remote-server-url-set! runremote (server:record->url server-info)) +;; (remote-server-id-set! runremote (server:record->id server-info))))))) +;; ;; at this point we have a runremote +;; (if (and host port server-id) +;; (let* ((nada (client:connect host port server-id runremote)) +;; (ping-res (rmt:login-no-auto-client-setup runremote))) +;; (if ping-res +;; (if runremote +;; (begin +;; (debug:print-info 2 *default-log-port* "connected to " (client:get-url runremote)) +;; runremote) +;; (client:setup-http areapath runremote remaining-tries: (- remaining-tries 1))) +;; (begin ;; login failed but have a server record, clean out the record and try again +;; (debug:print-info 0 *default-log-port* "client:setup, login unsuccessful, will attempt to start server ... ping-res=" ping-res ", server-dat=" server-dat) ;; had runid. Fixes part of Randy;s ticket 1405717332 +;; (http-transport:close-connections runremote) +;; (thread-sleep! 1) +;; (client:setup-http areapath runremote remaining-tries: (- remaining-tries 1)) +;; ))) +;; (begin ;; no server registered +;; ;; (server:kind-run areapath) +;; (server:start-and-wait areapath) +;; (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) +;; (thread-sleep! 
1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. +;; (client:setup-http areapath runremote remaining-tries: (- remaining-tries 1))))) +;; (else +;; (debug:print 0 *default-log-port* "ERROR: malformed server-dat="server-dat))))))) +;; +;; ;; +;; ;; connect - stored in remote-condat +;; ;; +;; ;; (define (http-transport:client-connect iface port server-id runremote) +;; (define (client:connect iface port server-id runremote-in) +;; (let* ((runremote (or runremote-in +;; (make-runremote)))) +;; (debug:print-info 2 *default-log-port* "Connecting to server at "iface":"port", id "server-id) +;; (let* ((api-url (conc "http://" iface ":" port "/api")) +;; (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) +;; (api-req (make-request method: 'POST uri: api-uri))) +;; ;; (server-dat (vector iface port api-uri api-url api-req (current-seconds) server-id))) +;; (remote-iface-set! runremote iface) +;; (remote-port-set! runremote port) +;; (remote-server-id-set! runremote server-id) +;; (remote-connect-time-set! runremote (current-seconds)) +;; (remote-last-access-set! runremote (current-seconds)) +;; (remote-api-url-set! runremote api-url) +;; (remote-api-uri-set! runremote api-uri) +;; (remote-api-req-set! runremote api-req) +;; runremote))) +;; +;; ADDED clientmod.scm Index: clientmod.scm ================================================================== --- /dev/null +++ clientmod.scm @@ -0,0 +1,80 @@ + +;; Copyright 2006-2012, Matthew Welland. +;; +;; This file is part of Megatest. +;; +;; Megatest is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. 
+;;
+;; Megatest is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
+
+;;======================================================================
+;; C L I E N T S
+;;======================================================================
+
+;;(use srfi-18 extras tcp s11n srfi-1 posix regex srfi-69 hostinfo md5
+;;     message-digest matchable spiffy uri-common intarweb http-client
+;;     spiffy-request-vars uri-common intarweb directory-utils)
+
+(declare (unit clientmod))
+(declare (uses servermod))
+(declare (uses artifacts))
+
+(module clientmod
+*
+
+(import scheme
+        posix
+        data-structures
+        srfi-18
+        typed-records
+
+        artifacts
+        servermod
+        )
+
+;; Client-side connection record. Every field defaults to #f.
+;;
+(defstruct con ;; client connection
+  (hdir #f)
+  (obj-to-str #f)
+  (host #f)
+  (pid #f)
+  (sdat #f) ;; server artifact data
+  )
+
+;; Locate a server for the area at areapath.
+;;
+;; Globs <areapath>/.server/*.artifact. When at least one server artifact
+;; exists, the first one is read with read-artifact->alist and a `con` is
+;; returned with hdir set to the artifact's 'd entry and sdat set to the
+;; whole alist.
+;;
+;; When no artifact exists, server:launch is called and the directory is
+;; polled once per second. Unlike the first version of this procedure the
+;; polling is bounded: after max-tries polls it returns #f instead of
+;; recursing forever, and the launch is re-issued only every relaunch-every
+;; polls instead of on every poll, so a slow-starting server is not
+;; launched again each second.
+;;
+;; Returns: a con record, or #f if no server artifact appeared in time.
+;;
+;; NOTE(review): obj-to-str, host and pid are left at their #f defaults
+;; here, while client:send-receive calls the value of con-obj-to-str as a
+;; procedure - callers presumably must fill those fields in first; confirm.
+;; NOTE(review): max-tries and relaunch-every are provisional values.
+;;
+(define (client:find-server areapath)
+  (let ((pattern        (conc areapath "/.server/*.artifact"))
+        (max-tries      60)  ;; about one minute at one poll per second
+        (relaunch-every 10)) ;; polls between launch attempts
+    (let loop ((tries 0))
+      (let ((sarfs (glob pattern))) ;; server artifacts
+        (cond
+         ((not (null? sarfs))
+          (let* ((sdat (read-artifact->alist (car sarfs)))
+                 (hdir (alist-ref 'd sdat)))
+            (make-con hdir: hdir sdat: sdat)))
+         ((>= tries max-tries)
+          #f) ;; gave up, no server artifact showed up
+         (else
+          ;; launch on the first miss, then only every relaunch-every polls
+          (if (zero? (modulo tries relaunch-every))
+              (server:launch areapath))
+          (thread-sleep! 1)
+          (loop (+ tries 1))))))))
+
+(define (client:send-receive con cmd params)
+  (let* ((obj->string (con-obj-to-str con))
+         (arf `((c . ,cmd)
+                (p . ,(obj->string params))
+                (h . ,(con-host con))
+                (i . 
,(con-pid con)))) + (hdir (con-hdir con)) + (uuid (write-alist->artifact hdir arf ptype: 'Q))) + + ;; wait for a response here + + #f + )) + +) + Index: dashboard-context-menu.scm ================================================================== --- dashboard-context-menu.scm +++ dashboard-context-menu.scm @@ -34,11 +34,11 @@ (declare (unit dashboard-context-menu)) (declare (uses common)) (declare (uses db)) (declare (uses gutils)) -(declare (uses rmt)) +(declare (uses rmtmod)) (declare (uses ezsteps)) ;; (declare (uses sdb)) ;; (declare (uses filedb)) (declare (uses subrun)) Index: dashboard-tests.scm ================================================================== --- dashboard-tests.scm +++ dashboard-tests.scm @@ -33,11 +33,11 @@ (declare (unit dashboard-tests)) (declare (uses common)) (declare (uses db)) (declare (uses gutils)) -(declare (uses rmt)) +(declare (uses rmtmod)) (declare (uses ezsteps)) ;; (declare (uses sdb)) ;; (declare (uses filedb)) (declare (uses subrun)) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -51,11 +51,11 @@ (declare (uses dbmod)) ;; (declare (uses debugprint)) (declare (uses dbfile)) (declare (uses keys)) (declare (uses ods)) -(declare (uses client)) +;; (declare (uses client)) (declare (uses mt)) (include "common_records.scm") (include "db_records.scm") (include "key_records.scm") Index: diff-report.scm ================================================================== --- diff-report.scm +++ diff-report.scm @@ -16,11 +16,11 @@ ;; along with Megatest. If not, see . ;; (declare (unit diff-report)) (declare (uses common)) -(declare (uses rmt)) +(declare (uses rmtmod)) (include "common_records.scm") (use matchable) (use fmt) (use ducttape-lib) Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -1,6 +1,5 @@ - ;; Copyright 2006-2012, Matthew Welland. ;; ;; This file is part of Megatest. 
;; ;; Megatest is free software: you can redistribute it and/or modify @@ -14,680 +13,720 @@ ;; GNU General Public License for more details. ;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see . -(require-extension (srfi 18) extras tcp s11n) - - -(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest posix-extras) - -(use spiffy uri-common intarweb http-client spiffy-request-vars intarweb spiffy-directory-listing) - -;; Configurations for server -(tcp-buffer-size 2048) -(max-connections 2048) - +;; (require-extension (srfi 18) extras tcp s11n) +;; +;; +;; (use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest posix-extras) +;; +;; (use spiffy uri-common intarweb http-client spiffy-request-vars intarweb spiffy-directory-listing) +;; +;; ;; Configurations for server +;; (tcp-buffer-size 2048) +;; (max-connections 2048) +;; (declare (unit http-transport)) - -(declare (uses common)) -(declare (uses db)) -(declare (uses tests)) -(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. 
-(declare (uses server)) -;; (declare (uses daemon)) -(declare (uses portlogger)) -(declare (uses rmt)) -(declare (uses dbfile)) -(declare (uses commonmod)) - -(include "common_records.scm") -(include "db_records.scm") -(include "js-path.scm") - -(import dbfile commonmod) - -(require-library stml) -(define (http-transport:make-server-url hostport) - (if (not hostport) - #f - (conc "http://" (car hostport) ":" (cadr hostport)))) - -(define *server-loop-heart-beat* (current-seconds)) - -;;====================================================================== -;; S E R V E R -;; ====================================================================== - -;; Call this to start the actual server -;; - -(define *db:process-queue-mutex* (make-mutex)) - -(define (http-transport:run hostn) - ;; Configurations for server - (tcp-buffer-size 2048) - (max-connections 2048) - (debug:print 2 *default-log-port* "Attempting to start the server ...") - (let* ((db #f) ;; (open-db)) ;; we don't want the server to be opening and closing the db unnecesarily - (hostname (get-host-name)) - (ipaddrstr (let ((ipstr (if (string=? "-" hostn) - ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") - (server:get-best-guess-address hostname) - #f))) - (if ipstr ipstr hostn))) ;; hostname))) - (start-port (portlogger:open-run-close portlogger:find-port)) - (link-tree-path (common:get-linktree)) - (tmp-area (common:get-db-tmp-area)) - (start-file (conc tmp-area "/.server-start"))) - (debug:print-info 0 *default-log-port* "portlogger recommended port: " start-port) - ;; set some parameters for the server - (root-path (if link-tree-path - link-tree-path - (current-directory))) ;; WARNING: SECURITY HOLE. FIX ASAP! 
- (handle-directory spiffy-directory-listing) - (handle-exception (lambda (exn chain) - (signal (make-composite-condition - (make-property-condition - 'server - 'message "server error"))))) - - ;; http-transport:handle-directory) ;; simple-directory-handler) - ;; Setup the web server and a /ctrl interface - ;; - (vhost-map `(((* any) . ,(lambda (continue) - ;; open the db on the first call - ;; This is were we set up the database connections - (let* (($ (request-vars source: 'both)) - (dat ($ 'dat)) - (res #f)) - (cond - ((equal? (uri-path (request-uri (current-request))) - '(/ "api")) - (send-response body: (api:process-request *dbstruct-dbs* $) ;; the $ is the request vars proc - headers: '((content-type text/plain))) - (mutex-lock! *heartbeat-mutex*) - (set! *db-last-access* (current-seconds)) - (mutex-unlock! *heartbeat-mutex*)) - ((equal? (uri-path (request-uri (current-request))) - '(/ "")) - (send-response body: (http-transport:main-page))) - ((equal? (uri-path (request-uri (current-request))) - '(/ "json_api")) - (send-response body: (http-transport:main-page))) - ((equal? (uri-path (request-uri (current-request))) - '(/ "runs")) - (send-response body: (http-transport:main-page))) - ((equal? (uri-path (request-uri (current-request))) - '(/ any)) - (send-response body: "hey there!\n" - headers: '((content-type text/plain)))) - ((equal? (uri-path (request-uri (current-request))) - '(/ "hey")) - (send-response body: "hey there!\n" - headers: '((content-type text/plain)))) - ((equal? (uri-path (request-uri (current-request))) - '(/ "jquery3.1.0.js")) - (send-response body: (http-transport:show-jquery) - headers: '((content-type application/javascript)))) - ((equal? (uri-path (request-uri (current-request))) - '(/ "test_log")) - (send-response body: (http-transport:html-test-log $) - headers: '((content-type text/HTML)))) - ((equal? 
(uri-path (request-uri (current-request))) - '(/ "dashboard")) - (send-response body: (http-transport:html-dboard $) - headers: '((content-type text/HTML)))) - (else (continue)))))))) - (handle-exceptions - exn - (debug:print 0 *default-log-port* "Failed to create file " start-file ", exn=" exn) - (with-output-to-file start-file (lambda ()(print (current-process-id))))) - (http-transport:try-start-server ipaddrstr start-port))) - -;; This is recursively run by http-transport:run until sucessful -;; -(define (http-transport:try-start-server ipaddrstr portnum) - (let ((config-hostname (configf:lookup *configdat* "server" "hostname")) - (config-use-proxy (equal? (configf:lookup *configdat* "client" "use-http_proxy") "yes"))) - (if (not config-use-proxy) - (determine-proxy (constantly #f))) - (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " ipaddrsstr=" ipaddrstr " portnum=" portnum " config-hostname=" config-hostname) - (handle-exceptions - exn - (begin - ;; (print-error-message exn) - (if (< portnum 64000) - (begin - (debug:print 0 *default-log-port* "WARNING: attempt to start server failed. Trying again ...") - (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 5 *default-log-port* "exn=" (condition->list exn)) - (portlogger:open-run-close portlogger:set-failed portnum) - (debug:print 0 *default-log-port* "WARNING: failed to start on portnum: " portnum ", trying next port") - (thread-sleep! 0.1) - - ;; get_next_port goes here - (http-transport:try-start-server ipaddrstr - (portlogger:open-run-close portlogger:find-port))) - (begin - (debug:print 0 *default-log-port* "ERROR: Tried and tried but could not start the server")))) - ;; any error in following steps will result in a retry - (set! 
*server-info* (list ipaddrstr portnum)) - (debug:print 0 *default-log-port* "INFO: Trying to start server on " ipaddrstr ":" portnum) - ;; This starts the spiffy server - ;; NEED WAY TO SET IP TO #f TO BIND ALL - ;; (start-server bind-address: ipaddrstr port: portnum) - (if config-hostname ;; this is a hint to bind directly - (start-server port: portnum) ;; bind-address: (if (equal? config-hostname "-") - ;; ipaddrstr - ;; config-hostname)) - (start-server port: portnum)) - (portlogger:open-run-close portlogger:set-port portnum "released") - (debug:print 1 *default-log-port* "INFO: server has been stopped")))) - -;;====================================================================== -;; S E R V E R U T I L I T I E S -;;====================================================================== - -;;====================================================================== -;; C L I E N T S -;;====================================================================== - -(define *http-mutex* (make-mutex)) - -;; NOTE: Large block of code from 32436b426188080f72fceb6894af541fbad9921e removed here -;; I'm pretty sure it is defunct. - -;; This next block all imported en-mass from the api branch -(define *http-requests-in-progress* 0) -(define *http-connections-next-cleanup* (current-seconds)) - -(define (http-transport:get-time-to-cleanup) - (let ((res #f)) - (mutex-lock! *http-mutex*) - (set! res (> (current-seconds) *http-connections-next-cleanup*)) - (mutex-unlock! *http-mutex*) - res)) - -(define (http-transport:inc-requests-count) - (mutex-lock! *http-mutex*) - (set! *http-requests-in-progress* (+ 1 *http-requests-in-progress*)) - ;; Use this opportunity to slow things down iff there are too many requests in flight - (if (> *http-requests-in-progress* 5) - (begin - (debug:print-info 0 *default-log-port* "Whoa there buddy, ease up...") - (thread-sleep! 1))) - (mutex-unlock! *http-mutex*)) - -(define (http-transport:dec-requests-count proc) - (mutex-lock! 
*http-mutex*) - (proc) - (set! *http-requests-in-progress* (- *http-requests-in-progress* 1)) - (mutex-unlock! *http-mutex*)) - -(define (http-transport:dec-requests-count-and-close-all-connections) - (set! *http-requests-in-progress* (- *http-requests-in-progress* 1)) - (let loop ((etime (+ (current-seconds) 5))) ;; give up in five seconds - (if (> *http-requests-in-progress* 0) - (if (> etime (current-seconds)) - (begin - (thread-sleep! 0.05) - (loop etime)) - (debug:print-error 0 *default-log-port* "requests still in progress after 5 seconds of waiting. I'm going to pass on cleaning up http connections")) - (close-all-connections!))) - (set! *http-connections-next-cleanup* (+ (current-seconds) 10)) - (mutex-unlock! *http-mutex*)) - -(define (http-transport:inc-requests-and-prep-to-close-all-connections) - (mutex-lock! *http-mutex*) - (set! *http-requests-in-progress* (+ 1 *http-requests-in-progress*))) - -;; Send "cmd" with json payload "params" to serverdat and receive result -;; -(define (http-transport:client-api-send-receive run-id runremote cmd params #!key (numretries 3)) - (assert (remote? runremote) "FATAL: http-transport:client-api-send-receive called with serverdat="serverdat) - (let* ((fullurl (remote-api-req runremote)) - (res (vector #f "uninitialized")) - (success #t) - (sparams (db:obj->string params transport: 'http)) - (server-id (remote-server-id runremote))) - (debug:print-info 11 *default-log-port* "cmd=" cmd " fullurl=" fullurl " server-id=" server-id " current time:" (current-seconds)) - - ;; set up the http-client here - (max-retry-attempts 1) - ;; consider all requests indempotent - (retry-request? (lambda (request) - #f)) - ;; send the data and get the response - ;; extract the needed info from the http data and - ;; process and return it. - (let* ((send-recieve (lambda () - (mutex-lock! *http-mutex*) - ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines) - ;; ((exn http client-error) e (print e))) - (set! 
res (vector ;;; DON'T FORGET - THIS IS THE CLIENT SIDE! NOTE: consider moving this to client.scm since we are only supporting http transport at this time. - success - (db:string->obj - (handle-exceptions - exn - (let ((call-chain (get-call-chain)) - (msg ((condition-property-accessor 'exn 'message) exn))) - (set! success #f) - (if (debug:debug-mode 1) - (debug:print-info 0 *default-log-port* "couldn't talk to server, trying again ...") - (begin - (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") - (debug:print 0 *default-log-port* " message: " msg ", exn=" exn) - (debug:print 0 *default-log-port* " cmd: " cmd " params: " params " key:" (or server-id "thekey")) - (debug:print 0 *default-log-port* " call-chain: " call-chain))) - ;; what if another thread is communicating ok? Can't happen due to mutex - (http-transport:close-connections runremote) - (mutex-unlock! *http-mutex*) - ;; (close-connection! fullurl) - (db:obj->string #f)) - (with-input-from-request ;; was dat - fullurl - (list (cons 'key (or server-id "thekey")) - (cons 'cmd cmd) - (cons 'params sparams)) - read-string)) - transport: 'http) - 0)) ;; added this speculatively - ;; Shouldn't this be a call to the managed call-all-connections stuff above? - ;; (close-all-connections!) ;; BUG? WHY IS THIS HERE? Are we failing to reuse connections? - (mutex-unlock! *http-mutex*) - )) - (time-out (lambda () - (thread-sleep! 45) - (debug:print 0 *default-log-port* "WARNING: send-receive took more than 45 seconds!!") - #f)) - (th1 (make-thread send-recieve "with-input-from-request")) - (th2 (make-thread time-out "time out"))) - (thread-start! th1) - (thread-start! th2) - (thread-join! th1) - (vector-set! res 0 success) - (thread-terminate! th2) - (if (vector? res) - (if (vector-ref res 0) ;; this is the first flag or the second flag? - (let* ((res-dat (vector-ref res 1))) - (if (and (string? 
res-dat) (string-contains res-dat "server-id mismatch")) - (signal (make-composite-condition - (make-property-condition - 'servermismatch - 'message (vector-ref res 1)))) - res)) ;; this is the *inner* vector? seriously? why? - (if (debug:debug-mode 11) - (let ((call-chain (get-call-chain))) ;; note: this code also called in nmsg-transport - consider consolidating it - (print-call-chain (current-error-port)) - (debug:print-error 11 *default-log-port* "error above occured at server, res=" res) ;; " message: " ((condition-property-accessor 'exn 'message) exn)) - (debug:print 11 *default-log-port* " server call chain:") - (pp (vector-ref res 1) (current-error-port)) - (signal (vector-ref res 0))) - res)) - (signal (make-composite-condition - (make-property-condition - 'timeout - 'message "nmsg-transport:client-api-send-receive-raw timed out talking to server"))))))) - -;; careful closing of connections stored in *runremote* -;; -(define (http-transport:close-connections runremote) - (if (remote? runremote) - (let ((api-dat (remote-api-uri runremote))) - (handle-exceptions - exn - (begin - (print-call-chain *default-log-port*) - (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)) - (if (args:any-defined? "-server" "-execute" "-run") - (debug:print-info 0 *default-log-port* "Closing connections to "api-dat)) - (if api-dat (close-connection! api-dat)) - (remote-conndat-set! runremote #f) - #t)) - #f)) - -;; run http-transport:keep-running in a parallel thread to monitor that the db is being -;; used and to shutdown after sometime if it is not. 
-;; -(define (http-transport:keep-running) - ;; if none running or if > 20 seconds since - ;; server last used then start shutdown - ;; This thread waits for the server to come alive - (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server") - (let* ((servinfofile #f) - (sdat #f) - (no-sync-db (db:open-no-sync-db)) - (tmp-area (common:get-db-tmp-area)) - (started-file (conc tmp-area "/.server-started")) - (server-start-time (current-seconds)) - (server-info (let loop ((start-time (current-seconds)) - (changed #t) - (last-sdat "not this")) - (begin ;; let ((sdat #f)) - (thread-sleep! 0.01) - (debug:print-info 0 *default-log-port* "Waiting for server alive signature") - (mutex-lock! *heartbeat-mutex*) - (set! sdat *server-info*) - (mutex-unlock! *heartbeat-mutex*) - (if (and sdat - (not changed) - (> (- (current-seconds) start-time) 2)) - (let* ((servinfodir (server:get-servinfo-dir *toppath*)) ;; (conc *toppath*"/.servinfo")) - (ipaddr (car sdat)) - (port (cadr sdat)) - (servinf (conc servinfodir"/"ipaddr":"port))) - (set! servinfofile servinf) - (if (not (file-exists? servinfodir)) - (create-directory servinfodir #t)) - (with-output-to-file servinf - (lambda () - (let* ((serv-id (server:mk-signature))) - (set! *server-id* serv-id) - (print "SERVER STARTED: "ipaddr":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)) - (print "started: "(seconds->year-week/day-time (current-seconds)))))) - (set! 
*on-exit-procs* (cons - (lambda () - (delete-file* servinf)) - *on-exit-procs*)) - ;; put data about this server into a simple flat file host.port - (debug:print-info 0 *default-log-port* "Received server alive signature") - sdat) - (begin - (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat) - (sleep 4) - (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes - (if sdat - (let* ((ipaddr (car sdat)) - (port (cadr sdat)) - (servinf (conc (server:get-servinfo-dir *toppath*)"/"ipaddr":"port))) - (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server") - (exit)) - (exit) - ) - (loop start-time - (equal? sdat last-sdat) - sdat))))))) - (iface (car server-info)) - (port (cadr server-info)) - (last-access 0) - (server-timeout (server:expiration-timeout)) - (server-going #f) - (server-log-file (args:get-arg "-log"))) ;; always set when we are a server - - (handle-exceptions - exn - (debug:print 0 *default-log-port* "Failed to create " started-file ", exn=" exn) - (with-output-to-file started-file (lambda ()(print (current-process-id))))) - - (let loop ((count 0) - (server-state 'available) - (bad-sync-count 0) - (start-time (current-milliseconds))) - - ;; Use this opportunity to sync the tmp db to megatest.db - (if (not server-going) ;; *dbstruct-dbs* - (begin - (debug:print 0 *default-log-port* "SERVER: dbprep") - (set! *dbstruct-dbs* (db:setup #t)) ;; run-id)) FIXME!!! - (set! server-going #t) - (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version))) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. 
- (if (and no-sync-db - (common:low-noise-print 10 "sync-all")) ;; cheesy way to reduce frequency of running sync :) - (begin - (if (common:low-noise-print 120 "sync-all-print") - (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S"))) - (db:all-db-sync *dbstruct-dbs*) - ))) - - ;; when things go wrong we don't want to be doing the various queries too often - ;; so we strive to run this stuff only every four seconds or so. - (let* ((sync-time (- (current-milliseconds) start-time)) - (rem-time (quotient (- 4000 sync-time) 1000))) - (if (and (<= rem-time 4) - (> rem-time 0)) - (thread-sleep! rem-time))) - - (if (< count 1) ;; 3x3 = 9 secs aprox - (loop (+ count 1) 'running bad-sync-count (current-milliseconds))) - - ;; Check that iface and port have not changed (can happen if server port collides) - (mutex-lock! *heartbeat-mutex*) - (set! sdat *server-info*) - (mutex-unlock! *heartbeat-mutex*) - - (if (not (equal? sdat (list iface port))) - (let ((new-iface (car sdat)) - (new-port (cadr sdat))) - (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info") - (set! iface new-iface) - (set! port new-port) - (if (not *server-id*) - (set! *server-id* (server:mk-signature))) - (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*) - (flush-output *default-log-port*))) - - ;; Transfer *db-last-access* to last-access to use in checking that we are still alive - (mutex-lock! *heartbeat-mutex*) - (set! last-access *db-last-access*) - (mutex-unlock! *heartbeat-mutex*) - - (if (common:low-noise-print 120 (conc "server running on " iface ":" port)) - (begin - (if (not *server-id*) - (set! 
*server-id* (server:mk-signature))) - (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv)) - (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*) - (flush-output *default-log-port*))) - (if (common:low-noise-print 60 "dbstats") - (begin - (debug:print 0 *default-log-port* "Server stats:") - (db:print-current-query-stats))) - (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600))) - (cond - ((and *server-run* - (> (+ last-access server-timeout) - (current-seconds))) - (if (common:low-noise-print 120 "server continuing") - (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) - (let ((curr-time (current-seconds))) - (handle-exceptions - exn - (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on info file " servinfofile ". Are you out of space on that disk? exn=" exn) - (if (and ;; (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter - (not *server-overloaded*) - (file-exists? servinfofile)) - (change-file-times servinfofile curr-time curr-time))) - (if (and (common:low-noise-print 120 "start new server") - (> *api-process-request-count* 50)) ;; if this server is kind of busy start up another - (begin - (debug:print-info 0 *default-log-port* "Server is busy, api-count "*api-process-request-count*", start another if possible...") - (server:kind-run *toppath*) - (if (> *api-process-request-count* 100) - (begin - (debug:print-info 0 *default-log-port* "Server is overloaded at api-count=" *api-process-request-count*", removing "servinfofile) - (delete-file* servinfofile))))))) - (loop 0 server-state bad-sync-count (current-milliseconds))) - (else - (debug:print-info 0 *default-log-port* "Server timed out. 
seconds since last db access: " (- (current-seconds) last-access)) - (http-transport:server-shutdown port))))))) - -(define (http-transport:server-shutdown port) - (begin - ;;(BB> "http-transport:server-shutdown called") - (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id)) - ;; - ;; start_shutdown - ;; - (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up - (portlogger:open-run-close portlogger:set-port port "released") - (thread-sleep! 1) - - ;; (debug:print-info 0 *default-log-port* "Max cached queries was " *max-cache-size*) - ;; (debug:print-info 0 *default-log-port* "Number of cached writes " *number-of-writes*) - ;; (debug:print-info 0 *default-log-port* "Average cached write time " - ;; (if (eq? *number-of-writes* 0) - ;; "n/a (no writes)" - ;; (/ *writes-total-delay* - ;; *number-of-writes*)) - ;; " ms") - ;; (debug:print-info 0 *default-log-port* "Number non-cached queries " *number-non-write-queries*) - ;; (debug:print-info 0 *default-log-port* "Average non-cached time " - ;; (if (eq? *number-non-write-queries* 0) - ;; "n/a (no queries)" - ;; (/ *total-non-write-delay* - ;; *number-non-write-queries*)) - ;; " ms") - - (db:print-current-query-stats) - #;(common:save-pkt `((action . exit) - (T . server) - (pid . ,(current-process-id))) - *configdat* #t) - - ;; remove .servinfo file(s) here - - (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting") - (exit))) - -;; all routes though here end in exit ... -;; -;; start_server? -;; -(define (http-transport:launch) - ;; check the .servinfo directory, are there other servers running on this - ;; or another host? 
- (let* ((server-start-is-ok (server:minimal-check *toppath*))) - (if (not server-start-is-ok) - (begin - (debug:print 0 *default-log-port* "ERROR: server start not ok, exiting now.") - (exit 1)))) - - ;; check that a server start is in progress, pause or exit if so - (let* ((th2 (make-thread (lambda () - (debug:print-info 0 *default-log-port* "Server run thread started") - (http-transport:run - (if (args:get-arg "-server") - (args:get-arg "-server") - "-") - )) "Server run")) - (th3 (make-thread (lambda () - (debug:print-info 0 *default-log-port* "Server monitor thread started") - (http-transport:keep-running) - "Keep running")))) - (thread-start! th2) - (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. - (thread-start! th3) - (set! *didsomething* #t) - (thread-join! th2) - (exit))) - -;; (define (http-transport:server-signal-handler signum) -;; (signal-mask! signum) -;; (handle-exceptions -;; exn -;; (debug:print 0 *default-log-port* " ... exiting ...") -;; (let ((th1 (make-thread (lambda () -;; (thread-sleep! 1)) -;; "eat response")) -;; (th2 (make-thread (lambda () -;; (debug:print-error 0 *default-log-port* "Received ^C, attempting clean exit. Please be patient and wait a few seconds before hitting ^C again.") -;; (thread-sleep! 3) ;; give the flush three seconds to do it's stuff -;; (debug:print 0 *default-log-port* " Done.") -;; (exit 4)) -;; "exit on ^C timer"))) -;; (thread-start! th2) -;; (thread-start! th1) -;; (thread-join! 
th2)))) - -;;=============================================== -;; Java script -;;=============================================== -(define (http-transport:show-jquery) - (let* ((data (tests:readlines *java-script-lib*))) -(string-join data "\n"))) - - - -;;====================================================================== -;; web pages -;;====================================================================== - -(define (http-transport:html-test-log $) - (let* ((run-id ($ 'runid)) - (test-item ($ 'testname)) - (parts (string-split test-item ":")) - (test-name (car parts)) - - (item-name (if (equal? (length parts) 1) - "" - (cadr parts)))) - ;(print $) -(tests:get-test-log run-id test-name item-name))) - - -(define (http-transport:html-dboard $) - (let* ((page ($ 'page)) - (oup (open-output-string)) - (bdy "--------------------------") - - (ret (tests:dynamic-dboard page))) - (s:output-new oup ret) - (close-output-port oup) - - (set! bdy (get-output-string oup)) - (conc "

Dashboard

" bdy "

" ))) - -(define (http-transport:main-page) - (let ((linkpath (root-path))) - (conc "

" (pathname-strip-directory *toppath*) "

" - "" - "Run area: " *toppath* - "

Server Stats

" - (http-transport:stats-table) - "
" - (http-transport:runs linkpath) - "
" - ;; (http-transport:run-stats) - "" - ))) - -(define (http-transport:stats-table) - (mutex-lock! *heartbeat-mutex*) - (let ((res - (conc "" - ;; "" - "" - "" - "" - ;; "" - "" - "
Max cached queries " *max-cache-size* "
Number of cached writes " *number-of-writes* "
Average cached write time " (if (eq? *number-of-writes* 0) - "n/a (no writes)" - (/ *writes-total-delay* - *number-of-writes*)) - " ms
Number non-cached queries " *number-non-write-queries* "
Average non-cached time " (if (eq? *number-non-write-queries* 0) - ;; "n/a (no queries)" - ;; (/ *total-non-write-delay* - ;; *number-non-write-queries*)) - " ms
Last access" (seconds->time-string *db-last-access*) "
"))) - (mutex-unlock! *heartbeat-mutex*) - res)) - -(define (http-transport:runs linkpath) - (conc "

Runs

" - (string-intersperse - (let ((files (map pathname-strip-directory (glob (conc linkpath "/*"))))) - (map (lambda (p) - (conc "" p "
")) - files)) - " "))) - -#;(define (http-transport:run-stats) - (let ((stats (open-run-close db:get-running-stats #f))) - (conc "" - (string-intersperse - (map (lambda (stat) - (conc "")) - stats) - " ") - "
" (car stat) "" (cadr stat) "
"))) +;; +;; (declare (uses common)) +;; (declare (uses db)) +;; (declare (uses tests)) +;; (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. +;; (declare (uses server)) +;; ;; (declare (uses daemon)) +;; (declare (uses portlogger)) +;; (declare (uses rmt)) +;; (declare (uses dbfile)) +;; (declare (uses commonmod)) +;; +;; (include "common_records.scm") +;; (include "db_records.scm") +;; (include "js-path.scm") +;; +;; (import dbfile commonmod) +;; +;; (require-library stml) +;; (define (http-transport:make-server-url hostport) +;; (if (not hostport) +;; #f +;; (conc "http://" (car hostport) ":" (cadr hostport)))) +;; +;; (define *server-loop-heart-beat* (current-seconds)) +;; +;; ;;====================================================================== +;; ;; S E R V E R +;; ;; ====================================================================== +;; +;; ;; Call this to start the actual server +;; ;; +;; +;; (define *db:process-queue-mutex* (make-mutex)) +;; +;; (define (http-transport:run hostn) +;; ;; Configurations for server +;; (tcp-buffer-size 2048) +;; (max-connections 2048) +;; (debug:print 2 *default-log-port* "Attempting to start the server ...") +;; (let* ((db #f) ;; (open-db)) ;; we don't want the server to be opening and closing the db unnecesarily +;; (hostname (get-host-name)) +;; (ipaddrstr (let ((ipstr (if (string=? 
"-" hostn) +;; ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") +;; (server:get-best-guess-address hostname) +;; #f))) +;; (if ipstr ipstr hostn))) ;; hostname))) +;; (start-port (portlogger:open-run-close portlogger:find-port)) +;; (link-tree-path (common:get-linktree)) +;; (tmp-area (common:get-db-tmp-area)) +;; (start-file (conc tmp-area "/.server-start"))) +;; (debug:print-info 0 *default-log-port* "portlogger recommended port: " start-port) +;; ;; set some parameters for the server +;; (root-path (if link-tree-path +;; link-tree-path +;; (current-directory))) ;; WARNING: SECURITY HOLE. FIX ASAP! +;; (handle-directory spiffy-directory-listing) +;; (handle-exception (lambda (exn chain) +;; (signal (make-composite-condition +;; (make-property-condition +;; 'server +;; 'message "server error"))))) +;; +;; ;; http-transport:handle-directory) ;; simple-directory-handler) +;; ;; Setup the web server and a /ctrl interface +;; ;; +;; (vhost-map `(((* any) . ,(lambda (continue) +;; ;; open the db on the first call +;; ;; This is were we set up the database connections +;; (let* (($ (request-vars source: 'both)) +;; (dat ($ 'dat)) +;; (res #f)) +;; (cond +;; ((equal? (uri-path (request-uri (current-request))) +;; '(/ "api")) +;; (send-response body: (api:process-request *dbstruct-dbs* $) ;; the $ is the request vars proc +;; headers: '((content-type text/plain))) +;; (mutex-lock! *heartbeat-mutex*) +;; (set! *db-last-access* (current-seconds)) +;; (mutex-unlock! *heartbeat-mutex*)) +;; ((equal? (uri-path (request-uri (current-request))) +;; '(/ "")) +;; (send-response body: (http-transport:main-page))) +;; ((equal? (uri-path (request-uri (current-request))) +;; '(/ "json_api")) +;; (send-response body: (http-transport:main-page))) +;; ((equal? (uri-path (request-uri (current-request))) +;; '(/ "runs")) +;; (send-response body: (http-transport:main-page))) +;; ((equal? 
(uri-path (request-uri (current-request))) +;; '(/ any)) +;; (send-response body: "hey there!\n" +;; headers: '((content-type text/plain)))) +;; ((equal? (uri-path (request-uri (current-request))) +;; '(/ "hey")) +;; (send-response body: "hey there!\n" +;; headers: '((content-type text/plain)))) +;; ((equal? (uri-path (request-uri (current-request))) +;; '(/ "jquery3.1.0.js")) +;; (send-response body: (http-transport:show-jquery) +;; headers: '((content-type application/javascript)))) +;; ((equal? (uri-path (request-uri (current-request))) +;; '(/ "test_log")) +;; (send-response body: (http-transport:html-test-log $) +;; headers: '((content-type text/HTML)))) +;; ((equal? (uri-path (request-uri (current-request))) +;; '(/ "dashboard")) +;; (send-response body: (http-transport:html-dboard $) +;; headers: '((content-type text/HTML)))) +;; (else (continue)))))))) +;; (handle-exceptions +;; exn +;; (debug:print 0 *default-log-port* "Failed to create file " start-file ", exn=" exn) +;; (with-output-to-file start-file (lambda ()(print (current-process-id))))) +;; (http-transport:try-start-server ipaddrstr start-port))) +;; +;; ;; This is recursively run by http-transport:run until sucessful +;; ;; +;; (define (http-transport:try-start-server ipaddrstr portnum) +;; (let ((config-hostname (configf:lookup *configdat* "server" "hostname")) +;; (config-use-proxy (equal? (configf:lookup *configdat* "client" "use-http_proxy") "yes"))) +;; (if (not config-use-proxy) +;; (determine-proxy (constantly #f))) +;; (debug:print-info 0 *default-log-port* "http-transport:try-start-server time=" (seconds->time-string (current-seconds)) " ipaddrsstr=" ipaddrstr " portnum=" portnum " config-hostname=" config-hostname) +;; (handle-exceptions +;; exn +;; (begin +;; ;; (print-error-message exn) +;; (if (< portnum 64000) +;; (begin +;; (debug:print 0 *default-log-port* "WARNING: attempt to start server failed. 
Trying again ...") +;; (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) +;; (debug:print 5 *default-log-port* "exn=" (condition->list exn)) +;; (portlogger:open-run-close portlogger:set-failed portnum) +;; (debug:print 0 *default-log-port* "WARNING: failed to start on portnum: " portnum ", trying next port") +;; (thread-sleep! 0.1) +;; +;; ;; get_next_port goes here +;; (http-transport:try-start-server ipaddrstr +;; (portlogger:open-run-close portlogger:find-port))) +;; (begin +;; (debug:print 0 *default-log-port* "ERROR: Tried and tried but could not start the server")))) +;; ;; any error in following steps will result in a retry +;; (set! *server-info* (list ipaddrstr portnum)) +;; (debug:print 0 *default-log-port* "INFO: Trying to start server on " ipaddrstr ":" portnum) +;; ;; This starts the spiffy server +;; ;; NEED WAY TO SET IP TO #f TO BIND ALL +;; ;; (start-server bind-address: ipaddrstr port: portnum) +;; (if config-hostname ;; this is a hint to bind directly +;; (start-server port: portnum) ;; bind-address: (if (equal? config-hostname "-") +;; ;; ipaddrstr +;; ;; config-hostname)) +;; (start-server port: portnum)) +;; (portlogger:open-run-close portlogger:set-port portnum "released") +;; (debug:print 1 *default-log-port* "INFO: server has been stopped")))) +;; +;; ;;====================================================================== +;; ;; S E R V E R U T I L I T I E S +;; ;;====================================================================== +;; +;; ;;====================================================================== +;; ;; C L I E N T S +;; ;;====================================================================== +;; +;; (define *http-mutex* (make-mutex)) +;; +;; ;; NOTE: Large block of code from 32436b426188080f72fceb6894af541fbad9921e removed here +;; ;; I'm pretty sure it is defunct. 
+;; +;; ;; This next block all imported en-mass from the api branch +;; (define *http-requests-in-progress* 0) +;; (define *http-connections-next-cleanup* (current-seconds)) +;; +;; (define (http-transport:get-time-to-cleanup) +;; (let ((res #f)) +;; (mutex-lock! *http-mutex*) +;; (set! res (> (current-seconds) *http-connections-next-cleanup*)) +;; (mutex-unlock! *http-mutex*) +;; res)) +;; +;; (define (http-transport:inc-requests-count) +;; (mutex-lock! *http-mutex*) +;; (set! *http-requests-in-progress* (+ 1 *http-requests-in-progress*)) +;; ;; Use this opportunity to slow things down iff there are too many requests in flight +;; (if (> *http-requests-in-progress* 5) +;; (begin +;; (debug:print-info 0 *default-log-port* "Whoa there buddy, ease up...") +;; (thread-sleep! 1))) +;; (mutex-unlock! *http-mutex*)) +;; +;; (define (http-transport:dec-requests-count proc) +;; (mutex-lock! *http-mutex*) +;; (proc) +;; (set! *http-requests-in-progress* (- *http-requests-in-progress* 1)) +;; (mutex-unlock! *http-mutex*)) +;; +;; (define (http-transport:dec-requests-count-and-close-all-connections) +;; (set! *http-requests-in-progress* (- *http-requests-in-progress* 1)) +;; (let loop ((etime (+ (current-seconds) 5))) ;; give up in five seconds +;; (if (> *http-requests-in-progress* 0) +;; (if (> etime (current-seconds)) +;; (begin +;; (thread-sleep! 0.05) +;; (loop etime)) +;; (debug:print-error 0 *default-log-port* "requests still in progress after 5 seconds of waiting. I'm going to pass on cleaning up http connections")) +;; (close-all-connections!))) +;; (set! *http-connections-next-cleanup* (+ (current-seconds) 10)) +;; (mutex-unlock! *http-mutex*)) +;; +;; (define (http-transport:inc-requests-and-prep-to-close-all-connections) +;; (mutex-lock! *http-mutex*) +;; (set! 
*http-requests-in-progress* (+ 1 *http-requests-in-progress*))) +;; +;; ;; Send "cmd" with json payload "params" to serverdat and receive result +;; ;; +;; (define (http-transport:client-api-send-receive run-id runremote cmd params #!key (numretries 3)) +;; (assert (remote? runremote) "FATAL: http-transport:client-api-send-receive called with serverdat="serverdat) +;; (let* ((fullurl (remote-api-req runremote)) +;; (res (vector #f "uninitialized")) +;; (success #t) +;; (sparams (db:obj->string params transport: 'http)) +;; (server-id (remote-server-id runremote))) +;; (debug:print-info 11 *default-log-port* "cmd=" cmd " fullurl=" fullurl " server-id=" server-id " current time:" (current-seconds)) +;; +;; ;; set up the http-client here +;; (max-retry-attempts 1) +;; ;; consider all requests indempotent +;; (retry-request? (lambda (request) +;; #f)) +;; ;; send the data and get the response +;; ;; extract the needed info from the http data and +;; ;; process and return it. +;; (let* ((send-recieve (lambda () +;; (mutex-lock! *http-mutex*) +;; ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines) +;; ;; ((exn http client-error) e (print e))) +;; (set! res (vector ;;; DON'T FORGET - THIS IS THE CLIENT SIDE! NOTE: consider moving this to client.scm since we are only supporting http transport at this time. +;; success +;; (db:string->obj +;; (handle-exceptions +;; exn +;; (let ((call-chain (get-call-chain)) +;; (msg ((condition-property-accessor 'exn 'message) exn))) +;; (set! 
success #f) +;; (if (debug:debug-mode 1) +;; (debug:print-info 0 *default-log-port* "couldn't talk to server, trying again ...") +;; (begin +;; (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") +;; (debug:print 0 *default-log-port* " message: " msg ", exn=" exn) +;; (debug:print 0 *default-log-port* " cmd: " cmd " params: " params " key:" (or server-id "thekey")) +;; (debug:print 0 *default-log-port* " call-chain: " call-chain))) +;; ;; what if another thread is communicating ok? Can't happen due to mutex +;; (http-transport:close-connections runremote) +;; (mutex-unlock! *http-mutex*) +;; ;; (close-connection! fullurl) +;; (db:obj->string #f)) +;; (with-input-from-request ;; was dat +;; fullurl +;; (list (cons 'key (or server-id "thekey")) +;; (cons 'cmd cmd) +;; (cons 'params sparams)) +;; read-string)) +;; transport: 'http) +;; 0)) ;; added this speculatively +;; ;; Shouldn't this be a call to the managed call-all-connections stuff above? +;; ;; (close-all-connections!) ;; BUG? WHY IS THIS HERE? Are we failing to reuse connections? +;; (mutex-unlock! *http-mutex*) +;; )) +;; (time-out (lambda () +;; (thread-sleep! 45) +;; (debug:print 0 *default-log-port* "WARNING: send-receive took more than 45 seconds!!") +;; #f)) +;; (th1 (make-thread send-recieve "with-input-from-request")) +;; (th2 (make-thread time-out "time out"))) +;; (thread-start! th1) +;; (thread-start! th2) +;; (thread-join! th1) +;; (vector-set! res 0 success) +;; (thread-terminate! th2) +;; (if (vector? res) +;; (if (vector-ref res 0) ;; this is the first flag or the second flag? +;; (let* ((res-dat (vector-ref res 1))) +;; (if (and (string? res-dat) (string-contains res-dat "server-id mismatch")) +;; (signal (make-composite-condition +;; (make-property-condition +;; 'servermismatch +;; 'message (vector-ref res 1)))) +;; res)) ;; this is the *inner* vector? seriously? why? 
+;; (if (debug:debug-mode 11) +;; (let ((call-chain (get-call-chain))) ;; note: this code also called in nmsg-transport - consider consolidating it +;; (print-call-chain (current-error-port)) +;; (debug:print-error 11 *default-log-port* "error above occured at server, res=" res) ;; " message: " ((condition-property-accessor 'exn 'message) exn)) +;; (debug:print 11 *default-log-port* " server call chain:") +;; (pp (vector-ref res 1) (current-error-port)) +;; (signal (vector-ref res 0))) +;; res)) +;; (signal (make-composite-condition +;; (make-property-condition +;; 'timeout +;; 'message "nmsg-transport:client-api-send-receive-raw timed out talking to server"))))))) +;; +;; ;; careful closing of connections stored in *runremote* +;; ;; +;; (define (http-transport:close-connections runremote) +;; (if (remote? runremote) +;; (let ((api-dat (remote-api-uri runremote))) +;; (handle-exceptions +;; exn +;; (begin +;; (print-call-chain *default-log-port*) +;; (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)) +;; (if (args:any-defined? "-server" "-execute" "-run") +;; (debug:print-info 0 *default-log-port* "Closing connections to "api-dat)) +;; (if api-dat (close-connection! api-dat)) +;; (remote-conndat-set! runremote #f) +;; #t)) +;; #f)) +;; +;; ;; run http-transport:keep-running in a parallel thread to monitor that the db is being +;; ;; used and to shutdown after sometime if it is not. 
+;; ;; +;; (define (http-transport:keep-running) +;; ;; if none running or if > 20 seconds since +;; ;; server last used then start shutdown +;; ;; This thread waits for the server to come alive +;; (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server") +;; (let* ((servinfofile #f) +;; (sdat #f) +;; (no-sync-db (db:open-no-sync-db)) +;; (tmp-area (common:get-db-tmp-area)) +;; (started-file (conc tmp-area "/.server-started")) +;; (server-start-time (current-seconds)) +;; (server-info (let loop ((start-time (current-seconds)) +;; (changed #t) +;; (last-sdat "not this")) +;; (begin ;; let ((sdat #f)) +;; (thread-sleep! 0.01) +;; (debug:print-info 0 *default-log-port* "Waiting for server alive signature") +;; (mutex-lock! *heartbeat-mutex*) +;; (set! sdat *server-info*) +;; (mutex-unlock! *heartbeat-mutex*) +;; (if (and sdat +;; (not changed) +;; (> (- (current-seconds) start-time) 2)) +;; (let* ((servinfodir (server:get-servinfo-dir *toppath*)) ;; (conc *toppath*"/.servinfo")) +;; (ipaddr (car sdat)) +;; (port (cadr sdat)) +;; (servinf (conc servinfodir"/"ipaddr":"port))) +;; (set! servinfofile servinf) +;; (if (not (file-exists? servinfodir)) +;; (create-directory servinfodir #t)) +;; (with-output-to-file servinf +;; (lambda () +;; (let* ((serv-id (server:mk-signature))) +;; (set! *server-id* serv-id) +;; (print "SERVER STARTED: "ipaddr":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)) +;; (print "started: "(seconds->year-week/day-time (current-seconds)))))) +;; (set! 
*on-exit-procs* (cons +;; (lambda () +;; (delete-file* servinf)) +;; *on-exit-procs*)) +;; ;; put data about this server into a simple flat file host.port +;; (debug:print-info 0 *default-log-port* "Received server alive signature") +;; sdat) +;; (begin +;; (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat) +;; (sleep 4) +;; (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes +;; (if sdat +;; (let* ((ipaddr (car sdat)) +;; (port (cadr sdat)) +;; (servinf (conc (server:get-servinfo-dir *toppath*)"/"ipaddr":"port))) +;; (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server") +;; (exit)) +;; (exit) +;; ) +;; (loop start-time +;; (equal? sdat last-sdat) +;; sdat))))))) +;; (iface (car server-info)) +;; (port (cadr server-info)) +;; (last-access 0) +;; (server-timeout (server:expiration-timeout)) +;; (server-going #f) +;; (server-log-file (args:get-arg "-log"))) ;; always set when we are a server +;; +;; (handle-exceptions +;; exn +;; (debug:print 0 *default-log-port* "Failed to create " started-file ", exn=" exn) +;; (with-output-to-file started-file (lambda ()(print (current-process-id))))) +;; +;; (let loop ((count 0) +;; (server-state 'available) +;; (bad-sync-count 0) +;; (start-time (current-milliseconds))) +;; +;; ;; Use this opportunity to sync the tmp db to megatest.db +;; (if (not server-going) ;; *dbstruct-dbs* +;; (begin +;; (debug:print 0 *default-log-port* "SERVER: dbprep") +;; (set! *dbstruct-dbs* (db:setup #t)) ;; run-id)) FIXME!!! +;; (set! server-going #t) +;; (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version))) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. 
+;; (if (and no-sync-db +;; (common:low-noise-print 10 "sync-all")) ;; cheesy way to reduce frequency of running sync :) +;; (begin +;; (if (common:low-noise-print 120 "sync-all-print") +;; (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S"))) +;; (db:all-db-sync *dbstruct-dbs*) +;; ))) +;; +;; ;; when things go wrong we don't want to be doing the various queries too often +;; ;; so we strive to run this stuff only every four seconds or so. +;; (let* ((sync-time (- (current-milliseconds) start-time)) +;; (rem-time (quotient (- 4000 sync-time) 1000))) +;; (if (and (<= rem-time 4) +;; (> rem-time 0)) +;; (thread-sleep! rem-time))) +;; +;; (if (< count 1) ;; 3x3 = 9 secs aprox +;; (loop (+ count 1) 'running bad-sync-count (current-milliseconds))) +;; +;; ;; Check that iface and port have not changed (can happen if server port collides) +;; (mutex-lock! *heartbeat-mutex*) +;; (set! sdat *server-info*) +;; (mutex-unlock! *heartbeat-mutex*) +;; +;; (if (not (equal? sdat (list iface port))) +;; (let ((new-iface (car sdat)) +;; (new-port (cadr sdat))) +;; (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info") +;; (set! iface new-iface) +;; (set! port new-port) +;; (if (not *server-id*) +;; (set! *server-id* (server:mk-signature))) +;; (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*) +;; (flush-output *default-log-port*))) +;; +;; ;; Transfer *db-last-access* to last-access to use in checking that we are still alive +;; (mutex-lock! *heartbeat-mutex*) +;; (set! last-access *db-last-access*) +;; (mutex-unlock! *heartbeat-mutex*) +;; +;; (if (common:low-noise-print 120 (conc "server running on " iface ":" port)) +;; (begin +;; (if (not *server-id*) +;; (set! 
*server-id* (server:mk-signature))) +;; (debug:print 0 *default-log-port* (current-seconds) (current-directory) (current-process-id) (argv)) +;; (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds) " server-id: " *server-id*) +;; (flush-output *default-log-port*))) +;; (if (common:low-noise-print 60 "dbstats") +;; (begin +;; (debug:print 0 *default-log-port* "Server stats:") +;; (db:print-current-query-stats))) +;; (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600))) +;; (cond +;; ((and *server-run* +;; (> (+ last-access server-timeout) +;; (current-seconds))) +;; (if (common:low-noise-print 120 "server continuing") +;; (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) +;; (let ((curr-time (current-seconds))) +;; (handle-exceptions +;; exn +;; (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on info file " servinfofile ". Are you out of space on that disk? exn=" exn) +;; (if (and ;; (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter +;; (not *server-overloaded*) +;; (file-exists? servinfofile)) +;; (change-file-times servinfofile curr-time curr-time))) +;; (if (and (common:low-noise-print 120 "start new server") +;; (> *api-process-request-count* 50)) ;; if this server is kind of busy start up another +;; (begin +;; (debug:print-info 0 *default-log-port* "Server is busy, api-count "*api-process-request-count*", start another if possible...") +;; (server:kind-run *toppath*) +;; (if (> *api-process-request-count* 100) +;; (begin +;; (debug:print-info 0 *default-log-port* "Server is overloaded at api-count=" *api-process-request-count*", removing "servinfofile) +;; (delete-file* servinfofile))))))) +;; (loop 0 server-state bad-sync-count (current-milliseconds))) +;; (else +;; (debug:print-info 0 *default-log-port* "Server timed out. 
seconds since last db access: " (- (current-seconds) last-access)) +;; (http-transport:server-shutdown port))))))) +;; +;; (define (http-transport:server-shutdown port) +;; (begin +;; ;;(BB> "http-transport:server-shutdown called") +;; (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id)) +;; ;; +;; ;; start_shutdown +;; ;; +;; (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up +;; (portlogger:open-run-close portlogger:set-port port "released") +;; (thread-sleep! 1) +;; +;; ;; (debug:print-info 0 *default-log-port* "Max cached queries was " *max-cache-size*) +;; ;; (debug:print-info 0 *default-log-port* "Number of cached writes " *number-of-writes*) +;; ;; (debug:print-info 0 *default-log-port* "Average cached write time " +;; ;; (if (eq? *number-of-writes* 0) +;; ;; "n/a (no writes)" +;; ;; (/ *writes-total-delay* +;; ;; *number-of-writes*)) +;; ;; " ms") +;; ;; (debug:print-info 0 *default-log-port* "Number non-cached queries " *number-non-write-queries*) +;; ;; (debug:print-info 0 *default-log-port* "Average non-cached time " +;; ;; (if (eq? *number-non-write-queries* 0) +;; ;; "n/a (no queries)" +;; ;; (/ *total-non-write-delay* +;; ;; *number-non-write-queries*)) +;; ;; " ms") +;; +;; (db:print-current-query-stats) +;; #;(common:save-pkt `((action . exit) +;; (T . server) +;; (pid . ,(current-process-id))) +;; *configdat* #t) +;; +;; ;; remove .servinfo file(s) here +;; +;; (debug:print-info 0 *default-log-port* "Server shutdown complete. Exiting") +;; (exit))) +;; +;; ;; all routes though here end in exit ... +;; ;; +;; ;; start_server? +;; ;; +;; (define (http-transport:launch) +;; ;; check the .servinfo directory, are there other servers running on this +;; ;; or another host? 
+;; (let* ((server-start-is-ok (server:minimal-check *toppath*))) +;; (if (not server-start-is-ok) +;; (begin +;; (debug:print 0 *default-log-port* "ERROR: server start not ok, exiting now.") +;; (exit 1)))) +;; +;; ;; check that a server start is in progress, pause or exit if so +;; (let* ((th2 (make-thread (lambda () +;; (debug:print-info 0 *default-log-port* "Server run thread started") +;; (http-transport:run +;; (if (args:get-arg "-server") +;; (args:get-arg "-server") +;; "-") +;; )) "Server run")) +;; (th3 (make-thread (lambda () +;; (debug:print-info 0 *default-log-port* "Server monitor thread started") +;; (http-transport:keep-running) +;; "Keep running")))) +;; (thread-start! th2) +;; (thread-sleep! 0.25) ;; give the server time to settle before starting the keep-running monitor. +;; (thread-start! th3) +;; (set! *didsomething* #t) +;; (thread-join! th2) +;; (exit))) +;; +;; ;; (define (http-transport:server-signal-handler signum) +;; ;; (signal-mask! signum) +;; ;; (handle-exceptions +;; ;; exn +;; ;; (debug:print 0 *default-log-port* " ... exiting ...") +;; ;; (let ((th1 (make-thread (lambda () +;; ;; (thread-sleep! 1)) +;; ;; "eat response")) +;; ;; (th2 (make-thread (lambda () +;; ;; (debug:print-error 0 *default-log-port* "Received ^C, attempting clean exit. Please be patient and wait a few seconds before hitting ^C again.") +;; ;; (thread-sleep! 3) ;; give the flush three seconds to do it's stuff +;; ;; (debug:print 0 *default-log-port* " Done.") +;; ;; (exit 4)) +;; ;; "exit on ^C timer"))) +;; ;; (thread-start! th2) +;; ;; (thread-start! th1) +;; ;; (thread-join! 
th2)))) +;; +;; ;;=============================================== +;; ;; Java script +;; ;;=============================================== +;; (define (http-transport:show-jquery) +;; (let* ((data (tests:readlines *java-script-lib*))) +;; (string-join data "\n"))) +;; +;; +;; +;; ;;====================================================================== +;; ;; web pages +;; ;;====================================================================== +;; +;; (define (http-transport:html-test-log $) +;; (let* ((run-id ($ 'runid)) +;; (test-item ($ 'testname)) +;; (parts (string-split test-item ":")) +;; (test-name (car parts)) +;; +;; (item-name (if (equal? (length parts) 1) +;; "" +;; (cadr parts)))) +;; ;(print $) +;; (tests:get-test-log run-id test-name item-name))) +;; +;; +;; (define (http-transport:html-dboard $) +;; (let* ((page ($ 'page)) +;; (oup (open-output-string)) +;; (bdy "--------------------------") +;; +;; (ret (tests:dynamic-dboard page))) +;; (s:output-new oup ret) +;; (close-output-port oup) +;; +;; (set! bdy (get-output-string oup)) +;; (conc "

Dashboard

" bdy "

" ))) +;; +;; (define (http-transport:main-page) +;; (let ((linkpath (root-path))) +;; (conc "

" (pathname-strip-directory *toppath*) "

" +;; "" +;; "Run area: " *toppath* +;; "

Server Stats

" +;; (http-transport:stats-table) +;; "
" +;; (http-transport:runs linkpath) +;; "
" +;; ;; (http-transport:run-stats) +;; "" +;; ))) +;; +;; (define (http-transport:stats-table) +;; (mutex-lock! *heartbeat-mutex*) +;; (let ((res +;; (conc "" +;; ;; "" +;; "" +;; "" +;; "" +;; ;; "" +;; "" +;; "
Max cached queries " *max-cache-size* "
Number of cached writes " *number-of-writes* "
Average cached write time " (if (eq? *number-of-writes* 0) +;; "n/a (no writes)" +;; (/ *writes-total-delay* +;; *number-of-writes*)) +;; " ms
Number non-cached queries " *number-non-write-queries* "
Average non-cached time " (if (eq? *number-non-write-queries* 0) +;; ;; "n/a (no queries)" +;; ;; (/ *total-non-write-delay* +;; ;; *number-non-write-queries*)) +;; " ms
Last access" (seconds->time-string *db-last-access*) "
"))) +;; (mutex-unlock! *heartbeat-mutex*) +;; res)) +;; +;; (define (http-transport:runs linkpath) +;; (conc "

Runs

" +;; (string-intersperse +;; (let ((files (map pathname-strip-directory (glob (conc linkpath "/*"))))) +;; (map (lambda (p) +;; (conc "" p "
")) +;; files)) +;; " "))) +;; +;; #;(define (http-transport:run-stats) +;; (let ((stats (open-run-close db:get-running-stats #f))) +;; (conc "" +;; (string-intersperse +;; (map (lambda (stat) +;; (conc "")) +;; stats) +;; " ") +;; "
" (car stat) "" (cadr stat) "
"))) +;; +;; ;; http-server send-response +;; ;; api:process-request +;; ;; db:* +;; ;; +;; ;; NB// Runs on the server as part of the server loop +;; ;; +;; (define (api:process-request dbstruct $) ;; the $ is the request vars proc +;; (debug:print 4 *default-log-port* "server-id:" *server-id*) +;; (let* ((cmd ($ 'cmd)) +;; (paramsj ($ 'params)) +;; (key ($ 'key)) +;; (params (db:string->obj paramsj transport: 'http))) ;; incoming data from the POST (or is it a GET?) +;; (debug:print 4 *default-log-port* "cmd:" cmd " with params " params "key " key) +;; (if (equal? key *server-id*) +;; (begin +;; (set! *api-process-request-count* (+ *api-process-request-count* 1)) +;; (let* ((resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) +;; (success (vector-ref resdat 0)) +;; (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?) +;; (debug:print 4 *default-log-port* "res:" res) +;; (if (not success) +;; (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params)) +;; (if (> *api-process-request-count* *max-api-process-requests*) +;; (set! *max-api-process-requests* *api-process-request-count*)) +;; (set! *api-process-request-count* (- *api-process-request-count* 1)) +;; ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds +;; ;; (rmt:dat->json-str +;; ;; (if (or (string? res) +;; ;; (list? res) +;; ;; (number? res) +;; ;; (boolean? res)) +;; ;; res +;; ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res))))) +;; (db:obj->string res transport: 'http))) +;; (begin +;; (debug:print 0 *default-log-port* "Server refused to process request. Server id mismatch. 
recived " key " expected: " *server-id* ".\nOther arguments recived: cmd=" cmd " params = " params) +;; (db:obj->string (conc "Server refused to process request server-id mismatch: " key ", " *server-id*) transport: 'http))))) +;; +;; Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -25,12 +25,10 @@ (declare (uses common)) ;; (declare (uses megatest-version)) (declare (uses margs)) (declare (uses runs)) (declare (uses launch)) -(declare (uses server)) -(declare (uses client)) (declare (uses tests)) (declare (uses genexample)) ;; (declare (uses daemon)) (declare (uses db)) @@ -43,10 +41,13 @@ (declare (uses env)) (declare (uses diff-report)) (declare (uses db)) (declare (uses dbmod)) (declare (uses dbmod.import)) +(declare (uses rmtmod)) +(declare (uses clientmod)) +(declare (uses servermod)) (declare (uses commonmod)) (declare (uses commonmod.import)) (declare (uses dbfile)) (declare (uses dbfile.import)) ;; (declare (uses debugprint)) @@ -57,11 +58,12 @@ ;; (declare (uses ftail)) ;; (import ftail) (import dbmod commonmod - dbfile) + dbfile + servermod) (define *db* #f) ;; this is only for the repl, do not use in general!!!! (include "common_records.scm") (include "key_records.scm") @@ -921,13 +923,29 @@ ;;====================================================================== ;; Server? Start up here. ;; (if (args:get-arg "-server") - (let ((tl (launch:setup))) + (let* ((tl (launch:setup)) + (srvdat (server:setup tl)) + (handler (lambda (dbstruct cmd params) + (api:execute-requests dbstruct (if (string? cmd) + (string->symbol cmd) + cmd) + (db:string->obj params))))) + (server:set-handler srvdat handler) + (srv-obj-to-str-set! srvdat db:obj->string) + (srv-str-to-obj-set! srvdat db:string->obj) + (srv-dbstruct-set! srvdat (db:setup #t)) + (thread-join! + (thread-start! 
(make-thread + (lambda () + (server:run srvdat))))) + ;; (server:launch 0 'http) - (http-transport:launch) + ;; (http-transport:launch) ;; NOTE: Need to replace this call + (set! *didsomething* #t))) ;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to ;; a specific Megatest area. Detail are being hashed out and this may change. ;; Index: mt.scm ================================================================== --- mt.scm +++ mt.scm @@ -24,13 +24,13 @@ (declare (uses db)) (declare (uses common)) (declare (uses items)) (declare (uses runconfig)) (declare (uses tests)) -(declare (uses server)) +(declare (uses servermod)) (declare (uses runs)) -(declare (uses rmt)) +(declare (uses rmtmod)) ;; (declare (uses filedb)) (include "common_records.scm") (include "key_records.scm") (include "db_records.scm") ADDED mtserv.scm Index: mtserv.scm ================================================================== --- /dev/null +++ mtserv.scm @@ -0,0 +1,117 @@ +; Copyright 2006-2017, Matthew Welland. +;; +;; This file is part of Megatest. +;; +;; Megatest is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. +;; +;; Megatest is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with Megatest. If not, see . +;; + +;; (include "common.scm") +;; (include "megatest-version.scm") + +;; fake out readline usage of toplevel-command +(define (toplevel-command . 
a) #f) + +(use srfi-1 posix srfi-69 readline ;; regex regex-case srfi-69 apropos json http-client directory-utils rpc typed-records;; (srfi 18) extras) + srfi-19 srfi-18 extras format regex regex-case + (prefix dbi dbi:) + matchable + ) + +;; (declare (uses common)) +(declare (uses margs)) +(declare (uses configfmod)) +(declare (uses servermod)) + +(include "megatest-version.scm") +(include "megatest-fossil-hash.scm") + +(define help (conc " +mtserv, part of the Megatest tool suite, documentation at http://www.kiatoa.com/fossils/megatest + version " megatest-version " + license GPL, Copyright Matt Welland 2006-2017 + +Usage: mtserv action [options] + -h : this help + -manual : show the Megatest user manual + -version : print megatest version (currently " megatest-version ") + -start-dir path : switch to dir at start + +actions: + + server : start server + repl : start repl + +Examples: + +Called as " (string-intersperse (argv) " ") " +Version " megatest-version ", built from " megatest-fossil-hash )) + ;; first token is our action, but only if no leading dash + +(define *action* (if (and (> (length (argv)) 1) + (not (string-match "^\\-.*" (cadr (argv))))) + (cadr (argv)) + #f)) + +(define *remargs* + (args:get-args + (if *action* (cdr (argv)) (argv)) + '("-log") + '("-h" + ) + args:arg-hash + 0)) + +(if (args:get-arg "-h") + (begin + (print help) + (exit))) + +(if (args:get-arg "-start-dir") + (let* ((start-dir (args:get-arg "-start-dir"))) + (if (and (file-exists? start-dir) + (directory? start-dir)) + (change-directory start-dir) + (begin + (print "FATAL: cannot find or access "start-dir) + (exit 1))))) + +(if *action* + (case (string->symbol *action*) + ((server) (server:run)) + ((repl) + (import extras) ;; might not be needed + ;; (import csi) + (import readline) + (import apropos) + ;; (import (prefix sqlite3 sqlite3:)) ;; doesn't work ... 
+ + (install-history-file (get-environment-variable "HOME") ".mtserv_history") ;; [homedir] [filename] [nlines]) + (current-input-port (make-readline-port "mtserv> ")) + (print "Starting repl...") + (repl)) + ;; (if (args:get-arg "-load") + ;; (load (args:get-arg "-load")) + ;; (repl))) + (else + (print "Action \""*action*"\" not recognised.") + (print help))) + (begin + (print "No action provided.") + (print help))) + +#| +(define mtconf (car (simple-setup #f))) +(define dat (common:with-queue-db mtconf (lambda (conn)(get-pkts conn '())))) +(pp (pkts#flatten-all dat '((cmd . ((parent . P)(url . M)))(runtype . ((parent . P)))) 'id 'group-id 'uuid 'parent 'pkt-type 'pkt 'processed)) +|# Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -22,1056 +22,1060 @@ (declare (unit rmt)) (declare (uses api)) (declare (uses http-transport)) (declare (uses dbfile)) + (include "common_records.scm") ;; (declare (uses rmtmod)) (import dbfile) ;; rmtmod) -;; -;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!! -;; - -;; generate entries for ~/.megatestrc with the following -;; -;; grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u - -;;====================================================================== -;; S U P P O R T F U N C T I O N S -;;====================================================================== - -;; if a server is either running or in the process of starting call client:setup -;; else return #f to let the calling proc know that there is no server available -;; -(define (rmt:get-connection-info areapath runremote) ;; TODO: push areapath down. - (let* ((cinfo (if (remote? runremote) - (remote-conndat runremote) - #f))) - (if cinfo - cinfo - (if (server:check-if-running areapath) - (client:setup areapath runremote) - #f)))) - -(define (rmt:on-homehost? runremote) - (let* ((hh-dat (remote-hh-dat runremote))) - (if (pair? 
hh-dat) - (cdr hh-dat) - (begin - (debug:print-info 0 *default-log-port* "hh-dat="hh-dat) - #f)))) - - -;;====================================================================== - -(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id - -;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) -;; -(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected - - #;(common:telemetry-log (conc "rmt:"(->string cmd)) - payload: `((rid . ,rid) - (params . ,params))) - - (if (> attemptnum 2) - (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum)) - - (cond - ((> attemptnum 2) (thread-sleep! 0.05)) - ((> attemptnum 10) (thread-sleep! 0.5)) - ((> attemptnum 20) (thread-sleep! 1))) - (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15))) - (begin (server:run *toppath*) (thread-sleep! 3))) - - - ;;DOT digraph megatest_state_status { - ;;DOT ranksep=0; - ;;DOT // rankdir=LR; - ;;DOT node [shape="box"]; - ;;DOT "rmt:send-receive" -> MUTEXLOCK; - ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; } - ;; do all the prep locked under the rmt-mutex - (mutex-lock! *rmt-mutex*) - - ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote - ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. - ;; 3. 
do the query, if on homehost use local access - ;; - (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value - (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas - (runremote (or area-dat - *runremote*)) - (attemptnum (+ 1 attemptnum)) - (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*))) - - ;; DOT INIT_RUNREMOTE; // leaving off - doesn't really add to the clarity - ;; DOT MUTEXLOCK -> INIT_RUNREMOTE [label="no remote?"]; - ;; DOT INIT_RUNREMOTE -> MUTEXLOCK; - ;; ensure we have a record for our connection for given area - (if (not runremote) ;; can remove this one. should never get here. - (begin - (set! *runremote* (make-remote)) - (let* ((server-info (remote-server-info *runremote*))) - (if server-info - (begin - (remote-server-url-set! *runremote* (server:record->url server-info)) - (remote-server-id-set! *runremote* (server:record->id server-info))))) - (set! runremote *runremote*))) ;; new runremote will come from this on next iteration - - ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity - ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"]; - ;; DOT SET_HOMEHOST -> MUTEXLOCK; - ;; ensure we have a homehost record - (if (not (pair? (remote-hh-dat runremote))) ;; not on homehost - (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little - (let ((hh-data (server:choose-server areapath 'homehost))) - (remote-hh-dat-set! runremote (or hh-data (cons #f #f))))) - - ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) - (cond - #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds - (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.") - (set! *runremote* #f) - ;; BUG: close-connections should go here? - (mutex-unlock! 
*rmt-mutex*) - (rmt:send-receive cmd rid params attemptnum: 1 area-dat: area-dat)) - - ;;DOT EXIT; - ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" } - ;; give up if more than 150 attempts - ((> attemptnum 150) - (debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. Giving up.") - (exit 1)) - - ;;DOT CASE2 [label="local\nreadonly\nquery"]; - ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2} - ;;DOT CASE2 -> "rmt:open-qry-close-locally"; - ;; readonly mode, read request- handle it - case 2 - ((and readonly-mode - (member cmd api:read-only-queries)) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2") - (rmt:open-qry-close-locally cmd 0 params) - ) - - ;;DOT CASE3 [label="write in\nread-only mode"]; - ;;DOT MUTEXLOCK -> CASE3 [label="readonly\nmode?"]; {rank=same "case 3" CASE3} - ;;DOT CASE3 -> "#f"; - ;; readonly mode, write request. Do nothing, return #f - (readonly-mode (extras-readonly-mode *rmt-mutex* *default-log-port* cmd params)) - - ;; This block was for pre-emptively resetting the connection if there had been no communication for some time. - ;; I don't think it adds any value. If the server is not there, just fail and start a new connection. - ;; also, the expire-time calculation might not be correct. 
We want, time-since-last-server-access > (server:get-timeout) - ;; - ;;DOT CASE4 [label="reset\nconnection"]; - ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} - ;;DOT CASE4 -> "rmt:send-receive"; - ;; reset the connection if it has been unused too long - ((and runremote - (remote-conndat runremote) - (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on - (+ (remote-last-access runremote) - (remote-server-timeout runremote)))) - (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses in " (remote-server-timeout runremote) " seconds, forcing new connection.") - (http-transport:close-connections runremote) - ;; moving this setting of runremote conndat to #f to inside the http-transport:close-connections - ;; (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. - (mutex-unlock! *rmt-mutex*) - (rmt:send-receive cmd rid params attemptnum: attemptnum)) - - ;;DOT CASE5 [label="local\nread"]; - ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; - ;;DOT CASE5 -> "rmt:open-qry-close-locally"; - - ;; on homehost and this is a read - ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required - (rmt:on-homehost? runremote) - (member cmd api:read-only-queries)) ;; this is a read - (mutex-unlock! 
*rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") - (rmt:open-qry-close-locally cmd 0 params)) - - ;;DOT CASE6 [label="init\nremote"]; - ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6}; - ;;DOT CASE6 -> "rmt:send-receive"; - ;; on homehost and this is a write, we already have a server, but server has died - - ;; reinstate this keep-alive section but inject a time condition into the (add ... - ;; - ;; ((and (cdr (remote-hh-dat runremote)) ;; on homehost - ;; (not (member cmd api:read-only-queries)) ;; this is a write - ;; (remote-server-url runremote) ;; have a server - ;; (not (server:ping (remote-server-url runremote) (remote-server-id runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. - ;; (debug:print 0 *default-log-port* "WARNING: server appears to have died, trying to reconnect, case 6") - ;; (http-transport:close-connections area-dat: runremote) ;; make sure to clean up - ;; (set! *runremote* (make-remote)) - ;; (let* ((server-info (remote-server-info *runremote*))) - ;; (if server-info - ;; (begin - ;; (remote-server-url-set! *runremote* (server:record->url server-info)) - ;; (remote-server-id-set! *runremote* (server:record->id server-info))))) - ;; (remote-force-server-set! runremote (common:force-server?)) - ;; (mutex-unlock! *rmt-mutex*) - ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6") - ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) - - ;;DOT CASE7 [label="homehost\nwrite"]; - ;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7}; - ;;DOT CASE7 -> "rmt:open-qry-close-locally"; - ;; on homehost and this is a write, we already have a server - ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. 
server NOT required - (cdr (remote-hh-dat runremote)) ;; on homehost - (not (member cmd api:read-only-queries)) ;; this is a write - (remote-server-url runremote)) ;; have a server - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") - (rmt:open-qry-close-locally cmd 0 params)) - - ;;DOT CASE8 [label="force\nserver"]; - ;;DOT MUTEXLOCK -> CASE8 [label="server not required,\nhave homehost info,\nno connection yet,\nnot a read-only query"]; {rank=same "case 8" CASE8}; - ;;DOT CASE8 -> "rmt:open-qry-close-locally"; - ;; on homehost, no server contact made and this is a write, passively start a server - ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required - (cdr (remote-hh-dat runremote)) ;; have homehost - (not (remote-server-url runremote)) ;; no connection yet - (not (member cmd api:read-only-queries))) ;; not a read-only query - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") - (let ((server-info (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call - (if server-info - (begin - (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed - (remote-server-id-set! runremote (server:record->id server-info))) - (if (common:force-server?) - (server:start-and-wait *toppath*) - (server:kind-run *toppath*))) - (remote-force-server-set! runremote (common:force-server?)) - (mutex-unlock! 
*rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8.1") - (rmt:open-qry-close-locally cmd 0 params))) - - ;;DOT CASE9 [label="force server\nnot on homehost"]; - ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9}; - ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive"; - ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one - (not (remote-conndat runremote))) - (and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost - (not (remote-conndat runremote)))) ;; and no connection - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) - (mutex-unlock! *rmt-mutex*) - (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up? - (server:start-and-wait *toppath*)) - (remote-conndat-set! runremote (rmt:get-connection-info *toppath* runremote)) ;; calls client:setup which calls client:setup-http - (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as - - ;;DOT CASE10 [label="on homehost"]; - ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10}; - ;;DOT CASE10 -> "rmt:open-qry-close-locally"; - ;; all set up if get this far, dispatch the query - ((and (not (remote-force-server runremote)) - (cdr (remote-hh-dat runremote))) ;; we are on homehost - (mutex-unlock! 
*rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 10") - (rmt:open-qry-close-locally cmd (if rid rid 0) params)) - - ;;DOT CASE11 [label="send_receive"]; - ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11}; - ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"]; - ;;DOT CASE11 -> "RESULT" [label="call succeeded"]; - ;; not on homehost, do server query - (else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid))))) - ;;DOT } - -;; bunch of small functions factored out of send-receive to make debug easier -;; - -(define (extras-case-11 *default-log-port* runremote cmd params attemptnum rid) - ;; (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9") - ;; (mutex-lock! *rmt-mutex*) - (let* ((conninfo (remote-conndat runremote)) - (dat-in (condition-case ;; handling here has - ;; caused a lot of - ;; problems. However it - ;; is needed to deal with - ;; attemtped - ;; communication to - ;; servers that have gone - ;; away - (http-transport:client-api-send-receive 0 runremote cmd params) - ;; (http-transport:client-api-send-receive 0 conninfo cmd params runremote) - ((servermismatch) (vector #f "Server id mismatch" )) - ((commfail)(vector #f "communications fail")) - ((exn)(vector #f "other fail" (print-call-chain))))) - (dat (if (and (vector? dat-in) ;; ... check it is a correct size - (> (vector-length dat-in) 1)) - dat-in - (vector #f (conc "communications fail (type 2), dat-in=" dat-in)))) - (success (if (vector? dat) (vector-ref dat 0) #f)) - (res (if (vector? dat) (vector-ref dat 1) #f))) - (if (and (vector? conninfo) (< 5 (vector-length conninfo))) - (remote-last-access-set! runremote (current-seconds)) ;; refresh access time - (begin - (debug:print 0 *default-log-port* "INFO: Should not get here! conninfo=" conninfo) - (set! 
conninfo #f) - (http-transport:close-connections runremote))) - (debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote) - (mutex-unlock! *rmt-mutex*) - (if success ;; success only tells us that the transport was - ;; successful, have to examine the data to see if - ;; there was a detected issue at the other end - (extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd) - (begin - (debug:print-error 0 *default-log-port* " dat=" dat) - (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params)) - ))) - -(define (rmt:print-db-stats) - (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f" - (debug:print 18 *default-log-port* "DB Stats\n========") - (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg")) - (for-each (lambda (cmd) - (let ((cmd-dat (hash-table-ref *db-stats* cmd))) - (debug:print 18 *default-log-port* (format #f fmtstr cmd (vector-ref cmd-dat 0) (vector-ref cmd-dat 1) (/ (vector-ref cmd-dat 1)(vector-ref cmd-dat 0)))))) - (sort (hash-table-keys *db-stats*) - (lambda (a b) - (> (vector-ref (hash-table-ref *db-stats* a) 0) - (vector-ref (hash-table-ref *db-stats* b) 0))))))) - -(define (rmt:get-max-query-average run-id) - (mutex-lock! *db-stats-mutex*) - (let* ((runkey (conc "run-id=" run-id " ")) - (cmds (filter (lambda (x) - (substring-index runkey x)) - (hash-table-keys *db-stats*))) - (res (if (null? cmds) - (cons 'none 0) - (let loop ((cmd (car cmds)) - (tal (cdr cmds)) - (max-cmd (car cmds)) - (res 0)) - (let* ((cmd-dat (hash-table-ref *db-stats* cmd)) - (tot (vector-ref cmd-dat 0)) - (curravg (/ (vector-ref cmd-dat 1) (vector-ref cmd-dat 0))) ;; count is never zero by construction - (currmax (max res curravg)) - (newmax-cmd (if (> curravg res) cmd max-cmd))) - (if (null? 
tal) - (if (> tot 10) - (cons newmax-cmd currmax) - (cons 'none 0)) - (loop (car tal)(cdr tal) newmax-cmd currmax))))))) - (mutex-unlock! *db-stats-mutex*) - res)) - -(define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5)) - (let* ((qry-is-write (not (member cmd api:read-only-queries))) - (db-file-path (db:dbfile-path)) ;; 0)) - (dbstructs-local (db:setup #t)) ;; make-dbr:dbstruct path: dbdir local: #t))) - (read-only (not (file-write-access? db-file-path))) - (start (current-milliseconds)) - (resdat (if (not (and read-only qry-is-write)) - (let ((v (api:execute-requests dbstructs-local (vector (symbol->string cmd) params)))) - ;; (handle-exceptions ;; there has been a long history of receiving strange errors from values returned by the client when things go wrong.. - ;; exn ;; This is an attempt to detect that situation and recover gracefully - ;; (begin - ;; (debug:print 0 *default-log-port* "ERROR: bad data from server " v " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) - ;; (vector #t '())) ;; should always get a vector but if something goes wrong return a dummy - (if (and (vector? v) - (> (vector-length v) 1)) - (let ((newvec (vector (vector-ref v 0)(vector-ref v 1)))) - newvec) ;; by copying the vector while inside the error handler we should force the detection of a corrupted record - (vector #t '()))) ;; ) ;; we could also check that the returned types are valid - (vector #t '()))) - (success (vector-ref resdat 0)) - (res (vector-ref resdat 1)) - (duration (- (current-milliseconds) start))) - (if (and read-only qry-is-write) - (debug:print 0 *default-log-port* "ERROR: attempt to write to read-only database ignored. cmd=" cmd)) - (if (not success) - (if (> remretries 0) - (begin - (debug:print-error 0 *default-log-port* "local query failed. Trying again.") - (thread-sleep! 
(/ (random 5000) 1000)) ;; some random delay - (rmt:open-qry-close-locally cmd run-id params remretries: (- remretries 1))) - (begin - (debug:print-error 0 *default-log-port* "too many retries in rmt:open-qry-close-locally, giving up") - #f)) - (begin - ;; (rmt:update-db-stats run-id cmd params duration) - ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it - (if qry-is-write - (let ((start-time (current-seconds))) - (mutex-lock! *db-multi-sync-mutex*) -/ (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client) - (mutex-unlock! *db-multi-sync-mutex*))))) - res)) - -(define (rmt:send-receive-no-auto-client-setup runremote cmd run-id params) - (let* ((run-id (if run-id run-id 0)) - (res (http-transport:client-api-send-receive run-id runremote cmd params))) - (if (and res (vector-ref res 0)) - (vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!! - #f))) - -;;====================================================================== -;; -;; A C T U A L A P I C A L L S -;; -;;====================================================================== - -;;====================================================================== -;; S E R V E R -;;====================================================================== - -(define (rmt:kill-server run-id) - (rmt:send-receive 'kill-server run-id (list run-id))) - -(define (rmt:start-server run-id) - (rmt:send-receive 'start-server 0 (list run-id))) - -;;====================================================================== -;; M I S C -;;====================================================================== - -(define (rmt:login run-id) - (rmt:send-receive 'login run-id (list *toppath* megatest-version (client:get-signature)))) - -;; This login does no retries under the hood - it acts a bit like a ping. -;; Deprecated for nmsg-transport. 
-;; -(define (rmt:login-no-auto-client-setup runremote) - (rmt:send-receive-no-auto-client-setup runremote 'login 0 (list *toppath* megatest-version (client:get-signature)))) - -;; hand off a call to one of the db:queries statements -;; added run-id to make looking up the correct db possible -;; -(define (rmt:general-call stmtname run-id . params) - (rmt:send-receive 'general-call run-id (append (list stmtname run-id) params))) - - -;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host -(define (rmt:get-latest-host-load hostname) - (rmt:send-receive 'get-latest-host-load 0 (list hostname))) - -(define (rmt:sdb-qry qry val run-id) - ;; add caching if qry is 'getid or 'getstr - (rmt:send-receive 'sdb-qry run-id (list qry val))) - -;; NOT COMPLETED -(define (rmt:runtests user run-id testpatt params) - (rmt:send-receive 'runtests run-id testpatt)) - -(define (rmt:get-run-record-ids target run keynames test-patt) - (rmt:send-receive 'get-run-record-ids #f (list target run keynames test-patt))) - -(define (rmt:get-changed-record-ids since-time) - (rmt:send-receive 'get-changed-record-ids #f (list since-time)) ) - -(define (rmt:drop-all-triggers) - (rmt:send-receive 'drop-all-triggers #f '())) - -(define (rmt:create-all-triggers) - (rmt:send-receive 'create-all-triggers #f '())) - -;;====================================================================== -;; T E S T M E T A -;;====================================================================== - -(define (rmt:get-tests-tags) - (rmt:send-receive 'get-tests-tags #f '())) - -;;====================================================================== -;; K E Y S -;;====================================================================== - -;; These require run-id because the values come from the run! 
-;; -(define (rmt:get-key-val-pairs run-id) - (rmt:send-receive 'get-key-val-pairs run-id (list run-id))) - -(define (rmt:get-keys) - (if *db-keys* *db-keys* - (let ((res (rmt:send-receive 'get-keys #f '()))) - (set! *db-keys* res) - res))) - -(define (rmt:get-keys-write) ;; dummy query to force server start - (let ((res (rmt:send-receive 'get-keys-write #f '()))) - (set! *db-keys* res) - res)) - -;; we don't reuse run-id's (except possibly *after* a db cleanup) so it is safe -;; to cache the resuls in a hash -;; -(define (rmt:get-key-vals run-id) - (or (hash-table-ref/default *keyvals* run-id #f) - (let ((res (rmt:send-receive 'get-key-vals #f (list run-id)))) - (hash-table-set! *keyvals* run-id res) - res))) - -(define (rmt:get-targets) - (rmt:send-receive 'get-targets #f '())) - -(define (rmt:get-target run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-target run-id (list run-id))) - -(define (rmt:get-run-times runpatt targetpatt) - (rmt:send-receive 'get-run-times #f (list runpatt targetpatt ))) - - -;;====================================================================== -;; T E S T S -;;====================================================================== - -;; Just some syntatic sugar -(define (rmt:register-test run-id test-name item-path) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:general-call 'register-test run-id run-id test-name item-path)) - -(define (rmt:get-test-id run-id testname item-path) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) - -;; run-id is NOT used -;; -(define (rmt:get-test-info-by-id run-id test-id) - (if (number? 
test-id) - (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)) - (begin - (debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) - (print-call-chain (current-error-port)) - #f))) - -(define (rmt:test-get-rundir-from-test-id run-id test-id) - (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id))) - -(define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f)) - (assert (number? run-id) "FATAL: Run id required.") - (let* ((test-path (if (string? work-area) - work-area - (rmt:test-get-rundir-from-test-id run-id test-id)))) - (debug:print 3 *default-log-port* "TEST PATH: " test-path) - (open-test-db test-path))) - -;; WARNING: This currently bypasses the transaction wrapped writes system -(define (rmt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-set-state-status-by-id run-id (list run-id test-id newstate newstatus newcomment))) - -(define (rmt:set-tests-state-status run-id testnames currstate currstatus newstate newstatus) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'set-tests-state-status run-id (list run-id testnames currstate currstatus newstate newstatus))) - -(define (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode) - (assert (number? run-id) "FATAL: Run id required.") - ;; (if (number? run-id) - (rmt:send-receive 'get-tests-for-run run-id (list run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode))) - ;; (begin - ;; (debug:print-error 0 *default-log-port* "rmt:get-tests-for-run called with bad run-id=" run-id) - ;; (print-call-chain (current-error-port)) - ;; '()))) - -(define (rmt:get-tests-for-run-state-status run-id testpatt last-update) - (assert (number? 
run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-tests-for-run-state-status run-id (list run-id testpatt last-update))) - -;; get stuff via synchash -(define (rmt:synchash-get run-id proc synckey keynum params) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'synchash-get run-id (list run-id proc synckey keynum params))) - -(define (rmt:get-tests-for-run-mindata run-id testpatt states status not-in) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-tests-for-run-mindata run-id (list run-id testpatt states status not-in))) - -;; IDEA: Threadify these - they spend a lot of time waiting ... -;; -(define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in) - (let ((multi-run-mutex (make-mutex)) - (run-id-list (if run-ids - run-ids - (rmt:get-all-run-ids))) - (result '())) - (if (null? run-id-list) - '() - (let loop ((hed (car run-id-list)) - (tal (cdr run-id-list)) - (threads '())) - (if (> (length threads) 5) - (loop hed tal (filter (lambda (th)(not (member (thread-state th) '(terminated dead)))) threads)) - (let* ((newthread (make-thread - (lambda () - (let ((res (rmt:send-receive 'get-tests-for-run-mindata hed (list hed testpatt states status not-in)))) - (if (list? res) - (begin - (mutex-lock! multi-run-mutex) - (set! result (append result res)) - (mutex-unlock! multi-run-mutex)) - (debug:print-error 0 *default-log-port* "get-tests-for-run-mindata failed for run-id " hed ", testpatt " testpatt ", states " states ", status " status ", not-in " not-in)))) - (conc "multi-run-thread for run-id " hed))) - (newthreads (cons newthread threads))) - (thread-start! newthread) - (thread-sleep! 0.05) ;; give that thread some time to start - (if (null? tal) - newthreads - (loop (car tal)(cdr tal) newthreads)))))) - result)) - +;; ;; +;; ;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!! 
+;; ;; +;; +;; ;; generate entries for ~/.megatestrc with the following +;; ;; +;; ;; grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u +;; +;; ;;====================================================================== +;; ;; S U P P O R T F U N C T I O N S +;; ;;====================================================================== +;; +;; ;; if a server is either running or in the process of starting call client:setup +;; ;; else return #f to let the calling proc know that there is no server available +;; ;; +;; (define (rmt:get-connection-info areapath runremote) ;; TODO: push areapath down. +;; (let* ((cinfo (if (remote? runremote) +;; (remote-conndat runremote) +;; #f))) +;; (if cinfo +;; cinfo +;; (if (server:check-if-running areapath) +;; (client:setup areapath runremote) +;; #f)))) +;; +;; (define (rmt:on-homehost? runremote) +;; (let* ((hh-dat (remote-hh-dat runremote))) +;; (if (pair? hh-dat) +;; (cdr hh-dat) +;; (begin +;; (debug:print-info 0 *default-log-port* "hh-dat="hh-dat) +;; #f)))) +;; +;; +;; ;;====================================================================== +;; +;; (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id +;; +;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected +;; +;; ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) +;; ;; +;; ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected +;; ;; #;(common:telemetry-log (conc "rmt:"(->string cmd)) +;; ;; payload: `((rid . ,rid) +;; ;; (params . ,params))) +;; ;; +;; ;; (if (> attemptnum 2) +;; ;; (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum)) +;; ;; +;; ;; (cond +;; ;; ((> attemptnum 2) (thread-sleep! 0.05)) +;; ;; ((> attemptnum 10) (thread-sleep! 
0.5)) +;; ;; ((> attemptnum 20) (thread-sleep! 1))) +;; ;; (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15))) +;; ;; (begin (server:run *toppath*) (thread-sleep! 3))) +;; ;; +;; ;; +;; ;; ;;DOT digraph megatest_state_status { +;; ;; ;;DOT ranksep=0; +;; ;; ;;DOT // rankdir=LR; +;; ;; ;;DOT node [shape="box"]; +;; ;; ;;DOT "rmt:send-receive" -> MUTEXLOCK; +;; ;; ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; } +;; ;; ;; do all the prep locked under the rmt-mutex +;; ;; (mutex-lock! *rmt-mutex*) +;; ;; +;; ;; ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote +;; ;; ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. +;; ;; ;; 3. do the query, if on homehost use local access +;; ;; ;; +;; ;; (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value +;; ;; (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas +;; ;; (runremote (or area-dat +;; ;; *runremote*)) +;; ;; (attemptnum (+ 1 attemptnum)) +;; ;; (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*))) +;; ;; +;; ;; ;; DOT INIT_RUNREMOTE; // leaving off - doesn't really add to the clarity +;; ;; ;; DOT MUTEXLOCK -> INIT_RUNREMOTE [label="no remote?"]; +;; ;; ;; DOT INIT_RUNREMOTE -> MUTEXLOCK; +;; ;; ;; ensure we have a record for our connection for given area +;; ;; (if (not runremote) ;; can remove this one. should never get here. +;; ;; (begin +;; ;; (set! *runremote* (make-remote)) +;; ;; (let* ((server-info (remote-server-info *runremote*))) +;; ;; (if server-info +;; ;; (begin +;; ;; (remote-server-url-set! *runremote* (server:record->url server-info)) +;; ;; (remote-server-id-set! *runremote* (server:record->id server-info))))) +;; ;; (set! 
runremote *runremote*))) ;; new runremote will come from this on next iteration +;; ;; +;; ;; ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity +;; ;; ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"]; +;; ;; ;; DOT SET_HOMEHOST -> MUTEXLOCK; +;; ;; ;; ensure we have a homehost record +;; ;; (if (not (pair? (remote-hh-dat runremote))) ;; not on homehost +;; ;; (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little +;; ;; (let ((hh-data (server:choose-server areapath 'homehost))) +;; ;; (remote-hh-dat-set! runremote (or hh-data (cons #f #f))))) +;; ;; +;; ;; ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) +;; ;; (cond +;; ;; #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds +;; ;; (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.") +;; ;; (set! *runremote* #f) +;; ;; ;; BUG: close-connections should go here? +;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; (rmt:send-receive cmd rid params attemptnum: 1 area-dat: area-dat)) +;; ;; +;; ;; ;;DOT EXIT; +;; ;; ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" } +;; ;; ;; give up if more than 150 attempts +;; ;; ((> attemptnum 150) +;; ;; (debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. Giving up.") +;; ;; (exit 1)) +;; ;; +;; ;; ;;DOT CASE2 [label="local\nreadonly\nquery"]; +;; ;; ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2} +;; ;; ;;DOT CASE2 -> "rmt:open-qry-close-locally"; +;; ;; ;; readonly mode, read request- handle it - case 2 +;; ;; ((and readonly-mode +;; ;; (member cmd api:read-only-queries)) +;; ;; (mutex-unlock! 
*rmt-mutex*) +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2") +;; ;; (rmt:open-qry-close-locally cmd 0 params) +;; ;; ) +;; ;; +;; ;; ;;DOT CASE3 [label="write in\nread-only mode"]; +;; ;; ;;DOT MUTEXLOCK -> CASE3 [label="readonly\nmode?"]; {rank=same "case 3" CASE3} +;; ;; ;;DOT CASE3 -> "#f"; +;; ;; ;; readonly mode, write request. Do nothing, return #f +;; ;; (readonly-mode (extras-readonly-mode *rmt-mutex* *default-log-port* cmd params)) +;; ;; +;; ;; ;; This block was for pre-emptively resetting the connection if there had been no communication for some time. +;; ;; ;; I don't think it adds any value. If the server is not there, just fail and start a new connection. +;; ;; ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout) +;; ;; ;; +;; ;; ;;DOT CASE4 [label="reset\nconnection"]; +;; ;; ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} +;; ;; ;;DOT CASE4 -> "rmt:send-receive"; +;; ;; ;; reset the connection if it has been unused too long +;; ;; ((and runremote +;; ;; ;; (remote-conndat runremote) +;; ;; (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on +;; ;; (+ (remote-last-access runremote) +;; ;; (remote-server-timeout runremote)))) +;; ;; (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") +;; ;; (http-transport:close-connections runremote) +;; ;; ;; moving this setting of runremote conndat to #f to inside the http-transport:close-connections +;; ;; ;; (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. +;; ;; (mutex-unlock! 
*rmt-mutex*) +;; ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) +;; ;; +;; ;; ;;DOT CASE5 [label="local\nread"]; +;; ;; ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; +;; ;; ;;DOT CASE5 -> "rmt:open-qry-close-locally"; +;; ;; +;; ;; ;; on homehost and this is a read +;; ;; ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required +;; ;; (rmt:on-homehost? runremote) +;; ;; (member cmd api:read-only-queries)) ;; this is a read +;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") +;; ;; (rmt:open-qry-close-locally cmd 0 params)) +;; ;; +;; ;; ;;DOT CASE6 [label="init\nremote"]; +;; ;; ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6}; +;; ;; ;;DOT CASE6 -> "rmt:send-receive"; +;; ;; ;; on homehost and this is a write, we already have a server, but server has died +;; ;; +;; ;; ;; reinstate this keep-alive section but inject a time condition into the (add ... +;; ;; ;; +;; ;; ;; ((and (cdr (remote-hh-dat runremote)) ;; on homehost +;; ;; ;; (not (member cmd api:read-only-queries)) ;; this is a write +;; ;; ;; (remote-server-url runremote) ;; have a server +;; ;; ;; (not (server:ping (remote-server-url runremote) (remote-server-id runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. +;; ;; ;; (debug:print 0 *default-log-port* "WARNING: server appears to have died, trying to reconnect, case 6") +;; ;; ;; (http-transport:close-connections area-dat: runremote) ;; make sure to clean up +;; ;; ;; (set! *runremote* (make-remote)) +;; ;; ;; (let* ((server-info (remote-server-info *runremote*))) +;; ;; ;; (if server-info +;; ;; ;; (begin +;; ;; ;; (remote-server-url-set! *runremote* (server:record->url server-info)) +;; ;; ;; (remote-server-id-set! 
*runremote* (server:record->id server-info))))) +;; ;; ;; (remote-force-server-set! runremote (common:force-server?)) +;; ;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6") +;; ;; ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) +;; ;; +;; ;; ;;DOT CASE7 [label="homehost\nwrite"]; +;; ;; ;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7}; +;; ;; ;;DOT CASE7 -> "rmt:open-qry-close-locally"; +;; ;; ;; on homehost and this is a write, we already have a server +;; ;; ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required +;; ;; (cdr (remote-hh-dat runremote)) ;; on homehost +;; ;; (not (member cmd api:read-only-queries)) ;; this is a write +;; ;; (remote-server-url runremote)) ;; have a server +;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") +;; ;; (rmt:open-qry-close-locally cmd 0 params)) +;; ;; +;; ;; ;;DOT CASE8 [label="force\nserver"]; +;; ;; ;;DOT MUTEXLOCK -> CASE8 [label="server not required,\nhave homehost info,\nno connection yet,\nnot a read-only query"]; {rank=same "case 8" CASE8}; +;; ;; ;;DOT CASE8 -> "rmt:open-qry-close-locally"; +;; ;; ;; on homehost, no server contact made and this is a write, passively start a server +;; ;; ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. 
server NOT required +;; ;; (cdr (remote-hh-dat runremote)) ;; have homehost +;; ;; (not (remote-server-url runremote)) ;; no connection yet +;; ;; (not (member cmd api:read-only-queries))) ;; not a read-only query +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") +;; ;; (let ((server-info (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call +;; ;; (if server-info +;; ;; (begin +;; ;; (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed +;; ;; (remote-server-id-set! runremote (server:record->id server-info))) +;; ;; (if (common:force-server?) +;; ;; (server:start-and-wait *toppath*) +;; ;; (server:kind-run *toppath*))) +;; ;; (remote-force-server-set! runremote (common:force-server?)) +;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8.1") +;; ;; (rmt:open-qry-close-locally cmd 0 params))) +;; ;; +;; ;; ;;DOT CASE9 [label="force server\nnot on homehost"]; +;; ;; ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9}; +;; ;; ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive"; +;; ;; ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one +;; ;; (not (remote-conndat runremote))) +;; ;; (and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost +;; ;; (not (remote-conndat runremote)))) ;; and no connection +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) +;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up? 
+;; ;; (server:start-and-wait *toppath*)) +;; ;; (remote-conndat-set! runremote (rmt:get-connection-info *toppath* runremote)) ;; calls client:setup which calls client:setup-http +;; ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as +;; ;; +;; ;; ;;DOT CASE10 [label="on homehost"]; +;; ;; ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10}; +;; ;; ;;DOT CASE10 -> "rmt:open-qry-close-locally"; +;; ;; ;; all set up if get this far, dispatch the query +;; ;; ((and (not (remote-force-server runremote)) +;; ;; (cdr (remote-hh-dat runremote))) ;; we are on homehost +;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 10") +;; ;; (rmt:open-qry-close-locally cmd (if rid rid 0) params)) +;; ;; +;; ;; ;;DOT CASE11 [label="send_receive"]; +;; ;; ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11}; +;; ;; ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"]; +;; ;; ;;DOT CASE11 -> "RESULT" [label="call succeeded"]; +;; ;; ;; not on homehost, do server query +;; ;; (else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid))))) +;; ;; ;;DOT } +;; ;; +;; ;; ;; bunch of small functions factored out of send-receive to make debug easier +;; ;; ;; +;; ;; +;; ;; (define (extras-case-11 *default-log-port* runremote cmd params attemptnum rid) +;; ;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9") +;; ;; ;; (mutex-lock! *rmt-mutex*) +;; ;; (let* ((conninfo (remote-conndat runremote)) +;; ;; (dat-in (condition-case ;; handling here has +;; ;; ;; caused a lot of +;; ;; ;; problems. 
However it +;; ;; ;; is needed to deal with +;; ;; ;; attemtped +;; ;; ;; communication to +;; ;; ;; servers that have gone +;; ;; ;; away +;; ;; (http-transport:client-api-send-receive 0 runremote cmd params) +;; ;; ;; (http-transport:client-api-send-receive 0 conninfo cmd params runremote) +;; ;; ((servermismatch) (vector #f "Server id mismatch" )) +;; ;; ((commfail)(vector #f "communications fail")) +;; ;; ((exn)(vector #f "other fail" (print-call-chain))))) +;; ;; (dat (if (and (vector? dat-in) ;; ... check it is a correct size +;; ;; (> (vector-length dat-in) 1)) +;; ;; dat-in +;; ;; (vector #f (conc "communications fail (type 2), dat-in=" dat-in)))) +;; ;; (success (if (vector? dat) (vector-ref dat 0) #f)) +;; ;; (res (if (vector? dat) (vector-ref dat 1) #f))) +;; ;; (if (and (vector? conninfo) (< 5 (vector-length conninfo))) +;; ;; (remote-last-access-set! runremote (current-seconds)) ;; refresh access time +;; ;; (begin +;; ;; (debug:print 0 *default-log-port* "INFO: Should not get here! conninfo=" conninfo) +;; ;; (set! conninfo #f) +;; ;; (http-transport:close-connections runremote))) +;; ;; (debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote) +;; ;; (mutex-unlock! 
*rmt-mutex*) +;; ;; (if success ;; success only tells us that the transport was +;; ;; ;; successful, have to examine the data to see if +;; ;; ;; there was a detected issue at the other end +;; ;; (extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd) +;; ;; (begin +;; ;; (debug:print-error 0 *default-log-port* " dat=" dat) +;; ;; (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params)) +;; ;; ))) +;; +;; (define (rmt:print-db-stats) +;; (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f" +;; (debug:print 18 *default-log-port* "DB Stats\n========") +;; (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg")) +;; (for-each (lambda (cmd) +;; (let ((cmd-dat (hash-table-ref *db-stats* cmd))) +;; (debug:print 18 *default-log-port* (format #f fmtstr cmd (vector-ref cmd-dat 0) (vector-ref cmd-dat 1) (/ (vector-ref cmd-dat 1)(vector-ref cmd-dat 0)))))) +;; (sort (hash-table-keys *db-stats*) +;; (lambda (a b) +;; (> (vector-ref (hash-table-ref *db-stats* a) 0) +;; (vector-ref (hash-table-ref *db-stats* b) 0))))))) +;; +;; (define (rmt:get-max-query-average run-id) +;; (mutex-lock! *db-stats-mutex*) +;; (let* ((runkey (conc "run-id=" run-id " ")) +;; (cmds (filter (lambda (x) +;; (substring-index runkey x)) +;; (hash-table-keys *db-stats*))) +;; (res (if (null? cmds) +;; (cons 'none 0) +;; (let loop ((cmd (car cmds)) +;; (tal (cdr cmds)) +;; (max-cmd (car cmds)) +;; (res 0)) +;; (let* ((cmd-dat (hash-table-ref *db-stats* cmd)) +;; (tot (vector-ref cmd-dat 0)) +;; (curravg (/ (vector-ref cmd-dat 1) (vector-ref cmd-dat 0))) ;; count is never zero by construction +;; (currmax (max res curravg)) +;; (newmax-cmd (if (> curravg res) cmd max-cmd))) +;; (if (null? tal) +;; (if (> tot 10) +;; (cons newmax-cmd currmax) +;; (cons 'none 0)) +;; (loop (car tal)(cdr tal) newmax-cmd currmax))))))) +;; (mutex-unlock! 
*db-stats-mutex*) +;; res)) +;; +;; (define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5)) +;; (let* ((qry-is-write (not (member cmd api:read-only-queries))) +;; (db-file-path (db:dbfile-path)) ;; 0)) +;; (dbstructs-local (db:setup #t)) ;; make-dbr:dbstruct path: dbdir local: #t))) +;; (read-only (not (file-write-access? db-file-path))) +;; (start (current-milliseconds)) +;; (resdat (if (not (and read-only qry-is-write)) +;; (let ((v (api:execute-requests dbstructs-local (vector (symbol->string cmd) params)))) +;; ;; (handle-exceptions ;; there has been a long history of receiving strange errors from values returned by the client when things go wrong.. +;; ;; exn ;; This is an attempt to detect that situation and recover gracefully +;; ;; (begin +;; ;; (debug:print 0 *default-log-port* "ERROR: bad data from server " v " message: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) +;; ;; (vector #t '())) ;; should always get a vector but if something goes wrong return a dummy +;; (if (and (vector? v) +;; (> (vector-length v) 1)) +;; (let ((newvec (vector (vector-ref v 0)(vector-ref v 1)))) +;; newvec) ;; by copying the vector while inside the error handler we should force the detection of a corrupted record +;; (vector #t '()))) ;; ) ;; we could also check that the returned types are valid +;; (vector #t '()))) +;; (success (vector-ref resdat 0)) +;; (res (vector-ref resdat 1)) +;; (duration (- (current-milliseconds) start))) +;; (if (and read-only qry-is-write) +;; (debug:print 0 *default-log-port* "ERROR: attempt to write to read-only database ignored. cmd=" cmd)) +;; (if (not success) +;; (if (> remretries 0) +;; (begin +;; (debug:print-error 0 *default-log-port* "local query failed. Trying again.") +;; (thread-sleep! 
(/ (random 5000) 1000)) ;; some random delay +;; (rmt:open-qry-close-locally cmd run-id params remretries: (- remretries 1))) +;; (begin +;; (debug:print-error 0 *default-log-port* "too many retries in rmt:open-qry-close-locally, giving up") +;; #f)) +;; (begin +;; ;; (rmt:update-db-stats run-id cmd params duration) +;; ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it +;; (if qry-is-write +;; (let ((start-time (current-seconds))) +;; (mutex-lock! *db-multi-sync-mutex*) +;; / (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client) +;; (mutex-unlock! *db-multi-sync-mutex*))))) +;; res)) +;; +;; (define (rmt:send-receive-no-auto-client-setup runremote cmd run-id params) +;; (let* ((run-id (if run-id run-id 0)) +;; (res (http-transport:client-api-send-receive run-id runremote cmd params))) +;; (if (and res (vector-ref res 0)) +;; (vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!! +;; #f))) +;; +;; ;;====================================================================== +;; ;; +;; ;; A C T U A L A P I C A L L S +;; ;; +;; ;;====================================================================== +;; +;; ;;====================================================================== +;; ;; S E R V E R +;; ;;====================================================================== +;; +;; (define (rmt:kill-server run-id) +;; (rmt:send-receive 'kill-server run-id (list run-id))) +;; +;; (define (rmt:start-server run-id) +;; (rmt:send-receive 'start-server 0 (list run-id))) +;; +;; ;;====================================================================== +;; ;; M I S C +;; ;;====================================================================== +;; +;; (define (rmt:login run-id) +;; (rmt:send-receive 'login run-id (list *toppath* megatest-version (client:get-signature)))) +;; +;; ;; This login does no retries under the hood - it acts a bit like a ping. 
+;; ;; Deprecated for nmsg-transport. +;; ;; +;; (define (rmt:login-no-auto-client-setup runremote) +;; (rmt:send-receive-no-auto-client-setup runremote 'login 0 (list *toppath* megatest-version (client:get-signature)))) +;; +;; ;; hand off a call to one of the db:queries statements +;; ;; added run-id to make looking up the correct db possible +;; ;; +;; (define (rmt:general-call stmtname run-id . params) +;; (rmt:send-receive 'general-call run-id (append (list stmtname run-id) params))) +;; +;; +;; ;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host +;; (define (rmt:get-latest-host-load hostname) +;; (rmt:send-receive 'get-latest-host-load 0 (list hostname))) +;; +;; (define (rmt:sdb-qry qry val run-id) +;; ;; add caching if qry is 'getid or 'getstr +;; (rmt:send-receive 'sdb-qry run-id (list qry val))) +;; +;; ;; NOT COMPLETED +;; (define (rmt:runtests user run-id testpatt params) +;; (rmt:send-receive 'runtests run-id testpatt)) +;; +;; (define (rmt:get-run-record-ids target run keynames test-patt) +;; (rmt:send-receive 'get-run-record-ids #f (list target run keynames test-patt))) +;; +;; (define (rmt:get-changed-record-ids since-time) +;; (rmt:send-receive 'get-changed-record-ids #f (list since-time)) ) +;; +;; (define (rmt:drop-all-triggers) +;; (rmt:send-receive 'drop-all-triggers #f '())) +;; +;; (define (rmt:create-all-triggers) +;; (rmt:send-receive 'create-all-triggers #f '())) +;; +;; ;;====================================================================== +;; ;; T E S T M E T A +;; ;;====================================================================== +;; +;; (define (rmt:get-tests-tags) +;; (rmt:send-receive 'get-tests-tags #f '())) +;; +;; ;;====================================================================== +;; ;; K E Y S +;; ;;====================================================================== +;; +;; ;; These require run-id because the values come from the run! 
+;; ;; +;; (define (rmt:get-key-val-pairs run-id) +;; (rmt:send-receive 'get-key-val-pairs run-id (list run-id))) +;; +;; (define (rmt:get-keys) +;; (if *db-keys* *db-keys* +;; (let ((res (rmt:send-receive 'get-keys #f '()))) +;; (set! *db-keys* res) +;; res))) +;; +;; (define (rmt:get-keys-write) ;; dummy query to force server start +;; (let ((res (rmt:send-receive 'get-keys-write #f '()))) +;; (set! *db-keys* res) +;; res)) +;; +;; ;; we don't reuse run-id's (except possibly *after* a db cleanup) so it is safe +;; ;; to cache the resuls in a hash +;; ;; +;; (define (rmt:get-key-vals run-id) +;; (or (hash-table-ref/default *keyvals* run-id #f) +;; (let ((res (rmt:send-receive 'get-key-vals #f (list run-id)))) +;; (hash-table-set! *keyvals* run-id res) +;; res))) +;; +;; (define (rmt:get-targets) +;; (rmt:send-receive 'get-targets #f '())) +;; +;; (define (rmt:get-target run-id) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-target run-id (list run-id))) +;; +;; (define (rmt:get-run-times runpatt targetpatt) +;; (rmt:send-receive 'get-run-times #f (list runpatt targetpatt ))) +;; +;; +;; ;;====================================================================== +;; ;; T E S T S +;; ;;====================================================================== +;; +;; ;; Just some syntatic sugar +;; (define (rmt:register-test run-id test-name item-path) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:general-call 'register-test run-id run-id test-name item-path)) +;; +;; (define (rmt:get-test-id run-id testname item-path) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) +;; +;; ;; run-id is NOT used +;; ;; +;; (define (rmt:get-test-info-by-id run-id test-id) +;; (if (number? 
test-id) +;; (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)) +;; (begin +;; (debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) +;; (print-call-chain (current-error-port)) +;; #f))) +;; +;; (define (rmt:test-get-rundir-from-test-id run-id test-id) +;; (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id))) +;; +;; (define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f)) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (let* ((test-path (if (string? work-area) +;; work-area +;; (rmt:test-get-rundir-from-test-id run-id test-id)))) +;; (debug:print 3 *default-log-port* "TEST PATH: " test-path) +;; (open-test-db test-path))) +;; +;; ;; WARNING: This currently bypasses the transaction wrapped writes system +;; (define (rmt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'test-set-state-status-by-id run-id (list run-id test-id newstate newstatus newcomment))) +;; +;; (define (rmt:set-tests-state-status run-id testnames currstate currstatus newstate newstatus) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'set-tests-state-status run-id (list run-id testnames currstate currstatus newstate newstatus))) +;; +;; (define (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode) +;; (assert (number? run-id) "FATAL: Run id required.") +;; ;; (if (number? 
run-id) +;; (rmt:send-receive 'get-tests-for-run run-id (list run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode))) +;; ;; (begin +;; ;; (debug:print-error 0 *default-log-port* "rmt:get-tests-for-run called with bad run-id=" run-id) +;; ;; (print-call-chain (current-error-port)) +;; ;; '()))) +;; +;; (define (rmt:get-tests-for-run-state-status run-id testpatt last-update) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-tests-for-run-state-status run-id (list run-id testpatt last-update))) +;; +;; ;; get stuff via synchash +;; (define (rmt:synchash-get run-id proc synckey keynum params) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'synchash-get run-id (list run-id proc synckey keynum params))) +;; +;; (define (rmt:get-tests-for-run-mindata run-id testpatt states status not-in) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-tests-for-run-mindata run-id (list run-id testpatt states status not-in))) +;; ;; ;; IDEA: Threadify these - they spend a lot of time waiting ... ;; ;; ;; (define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in) -;; (let ((run-id-list (if run-ids +;; (let ((multi-run-mutex (make-mutex)) +;; (run-id-list (if run-ids ;; run-ids -;; (rmt:get-all-run-ids)))) -;; (apply append (map (lambda (run-id) -;; (rmt:send-receive 'get-tests-for-run-mindata run-id (list run-ids testpatt states status not-in))) -;; run-id-list)))) - -(define (rmt:delete-test-records run-id test-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'delete-test-records run-id (list run-id test-id))) - -(define (rmt:test-set-state-status run-id test-id state status msg) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-set-state-status run-id (list run-id test-id state status msg))) - -(define (rmt:test-toplevel-num-items run-id test-name) - (assert (number? 
run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-toplevel-num-items run-id (list run-id test-name))) - -;; (define (rmt:get-previous-test-run-record run-id test-name item-path) -;; (rmt:send-receive 'get-previous-test-run-record run-id (list run-id test-name item-path))) - -(define (rmt:get-matching-previous-test-run-records run-id test-name item-path) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-matching-previous-test-run-records run-id (list run-id test-name item-path))) - -(define (rmt:test-get-logfile-info run-id test-name) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-get-logfile-info run-id (list run-id test-name))) - -(define (rmt:test-get-records-for-index-file run-id test-name) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-get-records-for-index-file run-id (list run-id test-name))) - -(define (rmt:get-testinfo-state-status run-id test-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-testinfo-state-status run-id (list run-id test-id))) - -(define (rmt:test-set-log! run-id test-id logf) - (assert (number? run-id) "FATAL: Run id required.") - (if (string? logf)(rmt:general-call 'test-set-log run-id logf test-id))) - -(define (rmt:test-set-top-process-pid run-id test-id pid) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-set-top-process-pid run-id (list run-id test-id pid))) - -(define (rmt:test-get-top-process-pid run-id test-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-get-top-process-pid run-id (list run-id test-id))) - -(define (rmt:get-run-ids-matching-target keynames target res runname testpatt statepatt statuspatt) - (rmt:send-receive 'get-run-ids-matching-target #f (list keynames target res runname testpatt statepatt statuspatt))) - -;; NOTE: This will open and access ALL run databases. 
-;; -(define (rmt:test-get-paths-matching-keynames-target-new keynames target res testpatt statepatt statuspatt runname) - (let ((run-ids (rmt:get-run-ids-matching-target keynames target res runname testpatt statepatt statuspatt))) - (apply append - (map (lambda (run-id) - (rmt:send-receive 'test-get-paths-matching-keynames-target-new run-id (list run-id keynames target res testpatt statepatt statuspatt runname))) - run-ids)))) - -(define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path #!key (mode '(normal))(itemmaps #f)) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps))) - -(define (rmt:get-count-tests-running-for-run-id run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id))) - -(define (rmt:get-not-completed-cnt run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-not-completed-cnt run-id (list run-id))) - - -;; Statistical queries - -(define (rmt:get-count-tests-running run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-count-tests-running run-id (list run-id))) - -(define (rmt:get-count-tests-running-for-testname run-id testname) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-count-tests-running-for-testname run-id (list run-id testname))) - -(define (rmt:get-count-tests-running-in-jobgroup run-id jobgroup) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-count-tests-running-in-jobgroup run-id (list run-id jobgroup))) - -;; state and status are extra hints not usually used in the calculation -;; -(define (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status comment) - (assert (number? 
run-id) "FATAL: Run id required.") - (rmt:send-receive 'set-state-status-and-roll-up-items run-id (list run-id test-name item-path state status comment))) - -(define (rmt:set-state-status-and-roll-up-run run-id state status) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'set-state-status-and-roll-up-run run-id (list run-id state status))) - - -(define (rmt:update-pass-fail-counts run-id test-name) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:general-call 'update-pass-fail-counts run-id test-name test-name test-name)) - -(define (rmt:top-test-set-per-pf-counts run-id test-name) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'top-test-set-per-pf-counts run-id (list run-id test-name))) - -(define (rmt:get-raw-run-stats run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-raw-run-stats run-id (list run-id))) - -(define (rmt:get-test-times runname target) - (rmt:send-receive 'get-test-times #f (list runname target ))) - -;;====================================================================== -;; R U N S -;;====================================================================== - -;; BUG - LOOK AT HOW THIS WORKS!!! -;; -(define (rmt:get-run-info run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-run-info #f (list run-id))) - -(define (rmt:get-num-runs runpatt) - (rmt:send-receive 'get-num-runs #f (list runpatt))) - -(define (rmt:get-runs-cnt-by-patt runpatt targetpatt keys) - (rmt:send-receive 'get-runs-cnt-by-patt #f (list runpatt targetpatt keys))) - -;; Use the special run-id == #f scenario here since there is no run yet -(define (rmt:register-run keyvals runname state status user contour) - (rmt:send-receive 'register-run #f (list keyvals runname state status user contour))) - -(define (rmt:get-run-name-from-id run-id) - (assert (number? 
run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-run-name-from-id #f (list run-id))) - -(define (rmt:delete-run run-id) - (rmt:send-receive 'delete-run #f (list run-id))) - -(define (rmt:update-run-stats run-id stats) - (rmt:send-receive 'update-run-stats #f (list run-id stats))) - -(define (rmt:delete-old-deleted-test-records) - (rmt:send-receive 'delete-old-deleted-test-records #f '())) - -(define (rmt:get-runs runpatt count offset keypatts) - (rmt:send-receive 'get-runs #f (list runpatt count offset keypatts))) - -(define (rmt:simple-get-runs runpatt count offset target last-update) - (rmt:send-receive 'simple-get-runs #f (list runpatt count offset target last-update))) - -(define (rmt:get-all-run-ids) - (rmt:send-receive 'get-all-run-ids #f '())) - -(define (rmt:get-prev-run-ids run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-prev-run-ids #f (list run-id))) - -(define (rmt:lock/unlock-run run-id lock unlock user) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'lock/unlock-run #f (list run-id lock unlock user))) - -;; set/get status -(define (rmt:get-run-status run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-run-status #f (list run-id))) - -(define (rmt:get-run-state run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-run-state #f (list run-id))) - - -(define (rmt:set-run-status run-id run-status #!key (msg #f)) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'set-run-status #f (list run-id run-status msg))) - -(define (rmt:set-run-state-status run-id state status ) - (assert (number? 
run-id) "FATAL: Run id required.") - (rmt:send-receive 'set-run-state-status #f (list run-id state status))) - -(define (rmt:update-tesdata-on-repilcate-db old-lt new-lt) -(rmt:send-receive 'update-tesdata-on-repilcate-db #f (list old-lt new-lt))) - -(define (rmt:update-run-event_time run-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'update-run-event_time #f (list run-id))) - -(define (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-runs-update #!key (sort-order "asc")) ;; fields of #f uses default - (rmt:send-receive 'get-runs-by-patt #f (list keys runnamepatt targpatt offset limit fields last-runs-update sort-order))) - -(define (rmt:find-and-mark-incomplete run-id ovr-deadtime) - (assert (number? run-id) "FATAL: Run id required.") - ;; (if (rmt:send-receive 'have-incompletes? run-id (list run-id ovr-deadtime)) - (rmt:send-receive 'mark-incomplete run-id (list run-id ovr-deadtime))) ;; ) - -(define (rmt:get-main-run-stats run-id) - (assert (number? 
run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-main-run-stats #f (list run-id))) - -(define (rmt:get-var varname) - (rmt:send-receive 'get-var #f (list varname))) - -(define (rmt:del-var varname) - (rmt:send-receive 'del-var #f (list varname))) - -(define (rmt:set-var varname value) - (rmt:send-receive 'set-var #f (list varname value))) - -(define (rmt:inc-var varname) - (rmt:send-receive 'inc-var #f (list varname))) - -(define (rmt:dec-var varname) - (rmt:send-receive 'dec-var #f (list varname))) - -(define (rmt:add-var varname value) - (rmt:send-receive 'add-var #f (list varname value))) - -;;====================================================================== -;; M U L T I R U N Q U E R I E S -;;====================================================================== - -;; Need to move this to multi-run section and make associated changes -(define (rmt:find-and-mark-incomplete-all-runs #!key (ovr-deadtime #f)) - (let ((run-ids (rmt:get-all-run-ids))) - (for-each (lambda (run-id) - (rmt:find-and-mark-incomplete run-id ovr-deadtime)) - run-ids))) - -;; get the previous record for when this test was run where all keys match but runname -;; returns #f if no such test found, returns a single test record if found -;; -;; Run this at the client end since we have to connect to multiple run-id dbs -;; -(define (rmt:get-previous-test-run-record run-id test-name item-path) - (let* ((keyvals (rmt:get-key-val-pairs run-id)) - (keys (rmt:get-keys)) - (selstr (string-intersperse keys ",")) - (qrystr (string-intersperse (map (lambda (x)(conc x "=?")) keys) " AND "))) - (if (not keyvals) - #f - (let ((prev-run-ids (rmt:get-prev-run-ids run-id))) - ;; for each run starting with the most recent look to see if there is a matching test - ;; if found then return that matching test record - (debug:print 4 *default-log-port* "selstr: " selstr ", qrystr: " qrystr ", keyvals: " keyvals ", previous run ids found: " prev-run-ids) - (if (null? 
prev-run-ids) #f - (let loop ((hed (car prev-run-ids)) - (tal (cdr prev-run-ids))) - (let ((results (rmt:get-tests-for-run hed (conc test-name "/" item-path) '() '() ;; run-id testpatt states statuses - #f #f #f ;; offset limit not-in hide/not-hide - #f #f #f #f 'normal))) ;; sort-by sort-order qryvals last-update mode - (debug:print 4 *default-log-port* "Got tests for run-id " run-id ", test-name " test-name ", item-path " item-path ": " results) - (if (and (null? results) - (not (null? tal))) - (loop (car tal)(cdr tal)) - (if (null? results) #f - (car results)))))))))) - -(define (rmt:get-run-stats) - (rmt:send-receive 'get-run-stats #f '())) - -;;====================================================================== -;; S T E P S -;;====================================================================== - -;; Getting steps is more complicated. -;; -;; If given work area -;; 1. Find the testdat.db file -;; 2. Open the testdat.db file and do the query -;; If not given the work area -;; 1. Do a remote call to get the test path -;; 2. Continue as above -;; -;;(define (rmt:get-steps-for-test run-id test-id) -;; (rmt:send-receive 'get-steps-data run-id (list test-id))) - -(define (rmt:teststep-set-status! run-id test-id teststep-name state-in status-in comment logfile) - (assert (number? run-id) "FATAL: Run id required.") - (let* ((state (items:check-valid-items "state" state-in)) - (status (items:check-valid-items "status" status-in))) - (if (or (not state)(not status)) - (debug:print 3 *default-log-port* "WARNING: Invalid " (if status "status" "state") - " value \"" (if status state-in status-in) "\", update your validvalues section in megatest.config")) - (rmt:send-receive 'teststep-set-status! run-id (list run-id test-id teststep-name state-in status-in comment logfile)))) - - -(define (rmt:delete-steps-for-test! run-id test-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'delete-steps-for-test! 
run-id (list run-id test-id))) - -(define (rmt:get-steps-for-test run-id test-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-steps-for-test run-id (list run-id test-id))) - -(define (rmt:get-steps-info-by-id run-id test-step-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-steps-info-by-id #f (list run-id test-step-id))) - -;;====================================================================== -;; T E S T D A T A -;;====================================================================== - -(define (rmt:read-test-data run-id test-id categorypatt #!key (work-area #f)) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'read-test-data run-id (list run-id test-id categorypatt))) - -(define (rmt:read-test-data-varpatt run-id test-id categorypatt varpatt #!key (work-area #f)) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'read-test-data-varpatt run-id (list run-id test-id categorypatt varpatt))) - -(define (rmt:get-data-info-by-id run-id test-data-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'get-data-info-by-id #f (list run-id test-data-id))) - -(define (rmt:testmeta-add-record testname) - (rmt:send-receive 'testmeta-add-record #f (list testname))) - -(define (rmt:testmeta-get-record testname) - (rmt:send-receive 'testmeta-get-record #f (list testname))) - -(define (rmt:testmeta-update-field test-name fld val) - (rmt:send-receive 'testmeta-update-field #f (list test-name fld val))) - -(define (rmt:test-data-rollup run-id test-id status) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-data-rollup run-id (list run-id test-id status))) - -(define (rmt:csv->test-data run-id test-id csvdata) - (assert (number? 
run-id) "FATAL: Run id required.") - (rmt:send-receive 'csv->test-data run-id (list run-id test-id csvdata))) - -;;====================================================================== -;; T A S K S -;;====================================================================== - -(define (rmt:tasks-find-task-queue-records target run-name test-patt state-patt action-patt) - (rmt:send-receive 'find-task-queue-records #f (list target run-name test-patt state-patt action-patt))) - -(define (rmt:tasks-add action owner target runname testpatt params) - (rmt:send-receive 'tasks-add #f (list action owner target runname testpatt params))) - -(define (rmt:tasks-set-state-given-param-key param-key new-state) - (rmt:send-receive 'tasks-set-state-given-param-key #f (list param-key new-state))) - -(define (rmt:tasks-get-last target runname) - (rmt:send-receive 'tasks-get-last #f (list target runname))) - -;;====================================================================== -;; N O S Y N C D B -;;====================================================================== - -(define (rmt:no-sync-set var val) - (rmt:send-receive 'no-sync-set #f `(,var ,val))) - -(define (rmt:no-sync-get/default var default) - (rmt:send-receive 'no-sync-get/default #f `(,var ,default))) - -(define (rmt:no-sync-del! var) - (rmt:send-receive 'no-sync-del! 
#f `(,var))) - -(define (rmt:no-sync-get-lock keyname) - (rmt:send-receive 'no-sync-get-lock #f `(,keyname))) - -;;====================================================================== -;; A R C H I V E S -;;====================================================================== - -(define (rmt:archive-get-allocations testname itempath dneeded) - (rmt:send-receive 'archive-get-allocations #f (list testname itempath dneeded))) - -(define (rmt:archive-register-block-name bdisk-id archive-path) - (rmt:send-receive 'archive-register-block-name #f (list bdisk-id archive-path))) - -(define (rmt:archive-allocate-testsuite/area-to-block block-id testsuite-name areakey) - (rmt:send-receive 'archive-allocate-test-to-block #f (list block-id testsuite-name areakey))) - -(define (rmt:archive-register-disk bdisk-name bdisk-path df) - (rmt:send-receive 'archive-register-disk #f (list bdisk-name bdisk-path df))) - -(define (rmt:test-set-archive-block-id run-id test-id archive-block-id) - (assert (number? run-id) "FATAL: Run id required.") - (rmt:send-receive 'test-set-archive-block-id run-id (list run-id test-id archive-block-id))) - -(define (rmt:test-get-archive-block-info archive-block-id) - (rmt:send-receive 'test-get-archive-block-info #f (list archive-block-id))) - - -(define (rmtmod:calc-ro-mode runremote *toppath*) - (if (and runremote - (remote-ro-mode-checked runremote)) - (remote-ro-mode runremote) - (let* ((mtcfgfile (conc *toppath* "/megatest.config")) - (ro-mode (not (file-write-access? mtcfgfile)))) ;; TODO: use dbstruct or runremote to figure this out in future - (if runremote - (begin - (remote-ro-mode-set! runremote ro-mode) - (remote-ro-mode-checked-set! runremote #t) - ro-mode) - ro-mode)))) - -(define (extras-readonly-mode rmt-mutex log-port cmd params) - (mutex-unlock! rmt-mutex) - (debug:print-info 12 log-port "rmt:send-receive, case 3") - (debug:print 0 log-port "WARNING: write transaction requested on a readonly area. 
cmd="cmd" params="params) - #f) - -(define (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params) - (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) - (mutex-lock! *rmt-mutex*) - (http-transport:close-connections runremote) - (remote-server-url-set! runremote #f) - (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") - (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) - -(define (extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd) - (if (and (vector? res) - (eq? (vector-length res) 2) - (eq? (vector-ref res 1) 'overloaded)) ;; since we are - ;; looking at the - ;; data to carry the - ;; error we'll use a - ;; fairly obtuse - ;; combo to minimise - ;; the chances of - ;; some sort of - ;; collision. this - ;; is the case where - ;; the returned data - ;; is bad or the - ;; server is - ;; overloaded and we - ;; want to ease off - ;; the queries - (let ((wait-delay (+ attemptnum (* attemptnum 10)))) - (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") - (mutex-lock! *rmt-mutex*) - (http-transport:close-connections runremote) - (set! *runremote* #f) ;; force starting over - (mutex-unlock! *rmt-mutex*) - (thread-sleep! wait-delay) - (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) - res)) ;; All good, return res - -#;(set-functions rmt:send-receive remote-server-url-set! - http-transport:close-connections remote-conndat-set! - debug:print debug:print-info - remote-ro-mode remote-ro-mode-set! - remote-ro-mode-checked-set! remote-ro-mode-checked) +;; (rmt:get-all-run-ids))) +;; (result '())) +;; (if (null? 
run-id-list) +;; '() +;; (let loop ((hed (car run-id-list)) +;; (tal (cdr run-id-list)) +;; (threads '())) +;; (if (> (length threads) 5) +;; (loop hed tal (filter (lambda (th)(not (member (thread-state th) '(terminated dead)))) threads)) +;; (let* ((newthread (make-thread +;; (lambda () +;; (let ((res (rmt:send-receive 'get-tests-for-run-mindata hed (list hed testpatt states status not-in)))) +;; (if (list? res) +;; (begin +;; (mutex-lock! multi-run-mutex) +;; (set! result (append result res)) +;; (mutex-unlock! multi-run-mutex)) +;; (debug:print-error 0 *default-log-port* "get-tests-for-run-mindata failed for run-id " hed ", testpatt " testpatt ", states " states ", status " status ", not-in " not-in)))) +;; (conc "multi-run-thread for run-id " hed))) +;; (newthreads (cons newthread threads))) +;; (thread-start! newthread) +;; (thread-sleep! 0.05) ;; give that thread some time to start +;; (if (null? tal) +;; newthreads +;; (loop (car tal)(cdr tal) newthreads)))))) +;; result)) +;; +;; ;; ;; IDEA: Threadify these - they spend a lot of time waiting ... +;; ;; ;; +;; ;; (define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in) +;; ;; (let ((run-id-list (if run-ids +;; ;; run-ids +;; ;; (rmt:get-all-run-ids)))) +;; ;; (apply append (map (lambda (run-id) +;; ;; (rmt:send-receive 'get-tests-for-run-mindata run-id (list run-ids testpatt states status not-in))) +;; ;; run-id-list)))) +;; +;; (define (rmt:delete-test-records run-id test-id) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'delete-test-records run-id (list run-id test-id))) +;; +;; (define (rmt:test-set-state-status run-id test-id state status msg) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'test-set-state-status run-id (list run-id test-id state status msg))) +;; +;; (define (rmt:test-toplevel-num-items run-id test-name) +;; (assert (number? 
run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'test-toplevel-num-items run-id (list run-id test-name))) +;; +;; ;; (define (rmt:get-previous-test-run-record run-id test-name item-path) +;; ;; (rmt:send-receive 'get-previous-test-run-record run-id (list run-id test-name item-path))) +;; +;; (define (rmt:get-matching-previous-test-run-records run-id test-name item-path) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-matching-previous-test-run-records run-id (list run-id test-name item-path))) +;; +;; (define (rmt:test-get-logfile-info run-id test-name) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'test-get-logfile-info run-id (list run-id test-name))) +;; +;; (define (rmt:test-get-records-for-index-file run-id test-name) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'test-get-records-for-index-file run-id (list run-id test-name))) +;; +;; (define (rmt:get-testinfo-state-status run-id test-id) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-testinfo-state-status run-id (list run-id test-id))) +;; +;; (define (rmt:test-set-log! run-id test-id logf) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (if (string? logf)(rmt:general-call 'test-set-log run-id logf test-id))) +;; +;; (define (rmt:test-set-top-process-pid run-id test-id pid) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'test-set-top-process-pid run-id (list run-id test-id pid))) +;; +;; (define (rmt:test-get-top-process-pid run-id test-id) +;; (assert (number? 
run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'test-get-top-process-pid run-id (list run-id test-id))) +;; +;; (define (rmt:get-run-ids-matching-target keynames target res runname testpatt statepatt statuspatt) +;; (rmt:send-receive 'get-run-ids-matching-target #f (list keynames target res runname testpatt statepatt statuspatt))) +;; +;; ;; NOTE: This will open and access ALL run databases. +;; ;; +;; (define (rmt:test-get-paths-matching-keynames-target-new keynames target res testpatt statepatt statuspatt runname) +;; (let ((run-ids (rmt:get-run-ids-matching-target keynames target res runname testpatt statepatt statuspatt))) +;; (apply append +;; (map (lambda (run-id) +;; (rmt:send-receive 'test-get-paths-matching-keynames-target-new run-id (list run-id keynames target res testpatt statepatt statuspatt runname))) +;; run-ids)))) +;; +;; (define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path #!key (mode '(normal))(itemmaps #f)) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps))) +;; +;; (define (rmt:get-count-tests-running-for-run-id run-id) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id))) +;; +;; (define (rmt:get-not-completed-cnt run-id) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-not-completed-cnt run-id (list run-id))) +;; +;; +;; ;; Statistical queries +;; +;; (define (rmt:get-count-tests-running run-id) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-count-tests-running run-id (list run-id))) +;; +;; (define (rmt:get-count-tests-running-for-testname run-id testname) +;; (assert (number? 
run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-count-tests-running-for-testname run-id (list run-id testname))) +;; +;; (define (rmt:get-count-tests-running-in-jobgroup run-id jobgroup) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-count-tests-running-in-jobgroup run-id (list run-id jobgroup))) +;; +;; ;; state and status are extra hints not usually used in the calculation +;; ;; +;; (define (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status comment) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'set-state-status-and-roll-up-items run-id (list run-id test-name item-path state status comment))) +;; +;; (define (rmt:set-state-status-and-roll-up-run run-id state status) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'set-state-status-and-roll-up-run run-id (list run-id state status))) +;; +;; +;; (define (rmt:update-pass-fail-counts run-id test-name) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:general-call 'update-pass-fail-counts run-id test-name test-name test-name)) +;; +;; (define (rmt:top-test-set-per-pf-counts run-id test-name) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'top-test-set-per-pf-counts run-id (list run-id test-name))) +;; +;; (define (rmt:get-raw-run-stats run-id) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-raw-run-stats run-id (list run-id))) +;; +;; (define (rmt:get-test-times runname target) +;; (rmt:send-receive 'get-test-times #f (list runname target ))) +;; +;; ;;====================================================================== +;; ;; R U N S +;; ;;====================================================================== +;; +;; ;; BUG - LOOK AT HOW THIS WORKS!!! +;; ;; +;; (define (rmt:get-run-info run-id) +;; (assert (number? 
run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-run-info #f (list run-id))) +;; +;; (define (rmt:get-num-runs runpatt) +;; (rmt:send-receive 'get-num-runs #f (list runpatt))) +;; +;; (define (rmt:get-runs-cnt-by-patt runpatt targetpatt keys) +;; (rmt:send-receive 'get-runs-cnt-by-patt #f (list runpatt targetpatt keys))) +;; +;; ;; Use the special run-id == #f scenario here since there is no run yet +;; (define (rmt:register-run keyvals runname state status user contour) +;; (rmt:send-receive 'register-run #f (list keyvals runname state status user contour))) +;; +;; (define (rmt:get-run-name-from-id run-id) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-run-name-from-id #f (list run-id))) +;; +;; (define (rmt:delete-run run-id) +;; (rmt:send-receive 'delete-run #f (list run-id))) +;; +;; (define (rmt:update-run-stats run-id stats) +;; (rmt:send-receive 'update-run-stats #f (list run-id stats))) +;; +;; (define (rmt:delete-old-deleted-test-records) +;; (rmt:send-receive 'delete-old-deleted-test-records #f '())) +;; +;; (define (rmt:get-runs runpatt count offset keypatts) +;; (rmt:send-receive 'get-runs #f (list runpatt count offset keypatts))) +;; +;; (define (rmt:simple-get-runs runpatt count offset target last-update) +;; (rmt:send-receive 'simple-get-runs #f (list runpatt count offset target last-update))) +;; +;; (define (rmt:get-all-run-ids) +;; (rmt:send-receive 'get-all-run-ids #f '())) +;; +;; (define (rmt:get-prev-run-ids run-id) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-prev-run-ids #f (list run-id))) +;; +;; (define (rmt:lock/unlock-run run-id lock unlock user) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'lock/unlock-run #f (list run-id lock unlock user))) +;; +;; ;; set/get status +;; (define (rmt:get-run-status run-id) +;; (assert (number? 
run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-run-status #f (list run-id))) +;; +;; (define (rmt:get-run-state run-id) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-run-state #f (list run-id))) +;; +;; +;; (define (rmt:set-run-status run-id run-status #!key (msg #f)) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'set-run-status #f (list run-id run-status msg))) +;; +;; (define (rmt:set-run-state-status run-id state status ) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'set-run-state-status #f (list run-id state status))) +;; +;; (define (rmt:update-tesdata-on-repilcate-db old-lt new-lt) +;; (rmt:send-receive 'update-tesdata-on-repilcate-db #f (list old-lt new-lt))) +;; +;; (define (rmt:update-run-event_time run-id) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'update-run-event_time #f (list run-id))) +;; +;; (define (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-runs-update #!key (sort-order "asc")) ;; fields of #f uses default +;; (rmt:send-receive 'get-runs-by-patt #f (list keys runnamepatt targpatt offset limit fields last-runs-update sort-order))) +;; +;; (define (rmt:find-and-mark-incomplete run-id ovr-deadtime) +;; (assert (number? run-id) "FATAL: Run id required.") +;; ;; (if (rmt:send-receive 'have-incompletes? run-id (list run-id ovr-deadtime)) +;; (rmt:send-receive 'mark-incomplete run-id (list run-id ovr-deadtime))) ;; ) +;; +;; (define (rmt:get-main-run-stats run-id) +;; (assert (number? 
run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-main-run-stats #f (list run-id))) +;; +;; (define (rmt:get-var varname) +;; (rmt:send-receive 'get-var #f (list varname))) +;; +;; (define (rmt:del-var varname) +;; (rmt:send-receive 'del-var #f (list varname))) +;; +;; (define (rmt:set-var varname value) +;; (rmt:send-receive 'set-var #f (list varname value))) +;; +;; (define (rmt:inc-var varname) +;; (rmt:send-receive 'inc-var #f (list varname))) +;; +;; (define (rmt:dec-var varname) +;; (rmt:send-receive 'dec-var #f (list varname))) +;; +;; (define (rmt:add-var varname value) +;; (rmt:send-receive 'add-var #f (list varname value))) +;; +;; ;;====================================================================== +;; ;; M U L T I R U N Q U E R I E S +;; ;;====================================================================== +;; +;; ;; Need to move this to multi-run section and make associated changes +;; (define (rmt:find-and-mark-incomplete-all-runs #!key (ovr-deadtime #f)) +;; (let ((run-ids (rmt:get-all-run-ids))) +;; (for-each (lambda (run-id) +;; (rmt:find-and-mark-incomplete run-id ovr-deadtime)) +;; run-ids))) +;; +;; ;; get the previous record for when this test was run where all keys match but runname +;; ;; returns #f if no such test found, returns a single test record if found +;; ;; +;; ;; Run this at the client end since we have to connect to multiple run-id dbs +;; ;; +;; (define (rmt:get-previous-test-run-record run-id test-name item-path) +;; (let* ((keyvals (rmt:get-key-val-pairs run-id)) +;; (keys (rmt:get-keys)) +;; (selstr (string-intersperse keys ",")) +;; (qrystr (string-intersperse (map (lambda (x)(conc x "=?")) keys) " AND "))) +;; (if (not keyvals) +;; #f +;; (let ((prev-run-ids (rmt:get-prev-run-ids run-id))) +;; ;; for each run starting with the most recent look to see if there is a matching test +;; ;; if found then return that matching test record +;; (debug:print 4 *default-log-port* "selstr: " selstr ", qrystr: " qrystr 
", keyvals: " keyvals ", previous run ids found: " prev-run-ids) +;; (if (null? prev-run-ids) #f +;; (let loop ((hed (car prev-run-ids)) +;; (tal (cdr prev-run-ids))) +;; (let ((results (rmt:get-tests-for-run hed (conc test-name "/" item-path) '() '() ;; run-id testpatt states statuses +;; #f #f #f ;; offset limit not-in hide/not-hide +;; #f #f #f #f 'normal))) ;; sort-by sort-order qryvals last-update mode +;; (debug:print 4 *default-log-port* "Got tests for run-id " run-id ", test-name " test-name ", item-path " item-path ": " results) +;; (if (and (null? results) +;; (not (null? tal))) +;; (loop (car tal)(cdr tal)) +;; (if (null? results) #f +;; (car results)))))))))) +;; +;; (define (rmt:get-run-stats) +;; (rmt:send-receive 'get-run-stats #f '())) +;; +;; ;;====================================================================== +;; ;; S T E P S +;; ;;====================================================================== +;; +;; ;; Getting steps is more complicated. +;; ;; +;; ;; If given work area +;; ;; 1. Find the testdat.db file +;; ;; 2. Open the testdat.db file and do the query +;; ;; If not given the work area +;; ;; 1. Do a remote call to get the test path +;; ;; 2. Continue as above +;; ;; +;; ;;(define (rmt:get-steps-for-test run-id test-id) +;; ;; (rmt:send-receive 'get-steps-data run-id (list test-id))) +;; +;; (define (rmt:teststep-set-status! run-id test-id teststep-name state-in status-in comment logfile) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (let* ((state (items:check-valid-items "state" state-in)) +;; (status (items:check-valid-items "status" status-in))) +;; (if (or (not state)(not status)) +;; (debug:print 3 *default-log-port* "WARNING: Invalid " (if status "status" "state") +;; " value \"" (if status state-in status-in) "\", update your validvalues section in megatest.config")) +;; (rmt:send-receive 'teststep-set-status! 
run-id (list run-id test-id teststep-name state-in status-in comment logfile)))) +;; +;; +;; (define (rmt:delete-steps-for-test! run-id test-id) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'delete-steps-for-test! run-id (list run-id test-id))) +;; +;; (define (rmt:get-steps-for-test run-id test-id) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-steps-for-test run-id (list run-id test-id))) +;; +;; (define (rmt:get-steps-info-by-id run-id test-step-id) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-steps-info-by-id #f (list run-id test-step-id))) +;; +;; ;;====================================================================== +;; ;; T E S T D A T A +;; ;;====================================================================== +;; +;; (define (rmt:read-test-data run-id test-id categorypatt #!key (work-area #f)) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'read-test-data run-id (list run-id test-id categorypatt))) +;; +;; (define (rmt:read-test-data-varpatt run-id test-id categorypatt varpatt #!key (work-area #f)) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'read-test-data-varpatt run-id (list run-id test-id categorypatt varpatt))) +;; +;; (define (rmt:get-data-info-by-id run-id test-data-id) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'get-data-info-by-id #f (list run-id test-data-id))) +;; +;; (define (rmt:testmeta-add-record testname) +;; (rmt:send-receive 'testmeta-add-record #f (list testname))) +;; +;; (define (rmt:testmeta-get-record testname) +;; (rmt:send-receive 'testmeta-get-record #f (list testname))) +;; +;; (define (rmt:testmeta-update-field test-name fld val) +;; (rmt:send-receive 'testmeta-update-field #f (list test-name fld val))) +;; +;; (define (rmt:test-data-rollup run-id test-id status) +;; (assert (number? 
run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'test-data-rollup run-id (list run-id test-id status))) +;; +;; (define (rmt:csv->test-data run-id test-id csvdata) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'csv->test-data run-id (list run-id test-id csvdata))) +;; +;; ;;====================================================================== +;; ;; T A S K S +;; ;;====================================================================== +;; +;; (define (rmt:tasks-find-task-queue-records target run-name test-patt state-patt action-patt) +;; (rmt:send-receive 'find-task-queue-records #f (list target run-name test-patt state-patt action-patt))) +;; +;; (define (rmt:tasks-add action owner target runname testpatt params) +;; (rmt:send-receive 'tasks-add #f (list action owner target runname testpatt params))) +;; +;; (define (rmt:tasks-set-state-given-param-key param-key new-state) +;; (rmt:send-receive 'tasks-set-state-given-param-key #f (list param-key new-state))) +;; +;; (define (rmt:tasks-get-last target runname) +;; (rmt:send-receive 'tasks-get-last #f (list target runname))) +;; +;; ;;====================================================================== +;; ;; N O S Y N C D B +;; ;;====================================================================== +;; +;; (define (rmt:no-sync-set var val) +;; (rmt:send-receive 'no-sync-set #f `(,var ,val))) +;; +;; (define (rmt:no-sync-get/default var default) +;; (rmt:send-receive 'no-sync-get/default #f `(,var ,default))) +;; +;; (define (rmt:no-sync-del! var) +;; (rmt:send-receive 'no-sync-del! 
#f `(,var))) +;; +;; (define (rmt:no-sync-get-lock keyname) +;; (rmt:send-receive 'no-sync-get-lock #f `(,keyname))) +;; +;; ;;====================================================================== +;; ;; A R C H I V E S +;; ;;====================================================================== +;; +;; (define (rmt:archive-get-allocations testname itempath dneeded) +;; (rmt:send-receive 'archive-get-allocations #f (list testname itempath dneeded))) +;; +;; (define (rmt:archive-register-block-name bdisk-id archive-path) +;; (rmt:send-receive 'archive-register-block-name #f (list bdisk-id archive-path))) +;; +;; (define (rmt:archive-allocate-testsuite/area-to-block block-id testsuite-name areakey) +;; (rmt:send-receive 'archive-allocate-test-to-block #f (list block-id testsuite-name areakey))) +;; +;; (define (rmt:archive-register-disk bdisk-name bdisk-path df) +;; (rmt:send-receive 'archive-register-disk #f (list bdisk-name bdisk-path df))) +;; +;; (define (rmt:test-set-archive-block-id run-id test-id archive-block-id) +;; (assert (number? run-id) "FATAL: Run id required.") +;; (rmt:send-receive 'test-set-archive-block-id run-id (list run-id test-id archive-block-id))) +;; +;; (define (rmt:test-get-archive-block-info archive-block-id) +;; (rmt:send-receive 'test-get-archive-block-info #f (list archive-block-id))) +;; +;; +;; (define (rmtmod:calc-ro-mode runremote *toppath*) +;; (if (and runremote +;; (remote-ro-mode-checked runremote)) +;; (remote-ro-mode runremote) +;; (let* ((mtcfgfile (conc *toppath* "/megatest.config")) +;; (ro-mode (not (file-write-access? mtcfgfile)))) ;; TODO: use dbstruct or runremote to figure this out in future +;; (if runremote +;; (begin +;; (remote-ro-mode-set! runremote ro-mode) +;; (remote-ro-mode-checked-set! runremote #t) +;; ro-mode) +;; ro-mode)))) +;; +;; (define (extras-readonly-mode rmt-mutex log-port cmd params) +;; (mutex-unlock! 
rmt-mutex) +;; (debug:print-info 12 log-port "rmt:send-receive, case 3") +;; (debug:print 0 log-port "WARNING: write transaction requested on a readonly area. cmd="cmd" params="params) +;; #f) +;; +;; (define (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params) +;; (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) +;; (mutex-lock! *rmt-mutex*) +;; (http-transport:close-connections runremote) +;; (remote-server-url-set! runremote #f) +;; (mutex-unlock! *rmt-mutex*) +;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") +;; (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) +;; +;; (define (extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd) +;; (if (and (vector? res) +;; (eq? (vector-length res) 2) +;; (eq? (vector-ref res 1) 'overloaded)) ;; since we are +;; ;; looking at the +;; ;; data to carry the +;; ;; error we'll use a +;; ;; fairly obtuse +;; ;; combo to minimise +;; ;; the chances of +;; ;; some sort of +;; ;; collision. this +;; ;; is the case where +;; ;; the returned data +;; ;; is bad or the +;; ;; server is +;; ;; overloaded and we +;; ;; want to ease off +;; ;; the queries +;; (let ((wait-delay (+ attemptnum (* attemptnum 10)))) +;; (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") +;; (mutex-lock! *rmt-mutex*) +;; (http-transport:close-connections runremote) +;; (set! *runremote* #f) ;; force starting over +;; (mutex-unlock! *rmt-mutex*) +;; (thread-sleep! wait-delay) +;; (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) +;; res)) ;; All good, return res +;; +;; #;(set-functions rmt:send-receive remote-server-url-set! +;; http-transport:close-connections remote-conndat-set! +;; debug:print debug:print-info +;; remote-ro-mode remote-ro-mode-set! +;; remote-ro-mode-checked-set! 
remote-ro-mode-checked) +;; +;; Index: rmtmod.scm ================================================================== --- rmtmod.scm +++ rmtmod.scm @@ -18,68 +18,1088 @@ ;;====================================================================== (declare (unit rmtmod)) (declare (uses commonmod)) -(declare (uses apimod)) -;; (declare (uses apimod.import)) -(declare (uses ulex)) - -;; (include "ulex/ulex.scm") +(declare (uses clientmod)) +(declare (uses dbmod)) (module rmtmod - * - -(import scheme chicken data-structures extras) -(import (prefix sqlite3 sqlite3:) posix typed-records srfi-18) -(import (prefix commonmod cmod:)) -(import apimod) -(import (prefix ulex ulex:)) - -(defstruct alldat - (areapath #f) - (ulexdat #f) - ) - -;;====================================================================== -;; return the handle struct for sending queries to a specific database -;; - initializes the connection object if this is the first access -;; - finds the "captain" and asks who to talk to for the given dbfname -;; - establishes the connection to the current dbowner -;; -#;(define (rmt:connect alldat dbfname dbtype) - (let* ((ulexdat (or (alldat-ulexdat alldat) - (rmt:setup-ulex alldat)))) - (ulex:connect ulexdat dbfname dbtype))) - -;; setup the remote calls -#;(define (rmt:setup-ulex alldat) - (let* ((udata (ulex:setup))) ;; establish connection to ulex - (alldat-ulexdat-set! alldat udata) - ;; register all needed procs - (ulex:register-handler udata 'ping cmod:get-full-version) ;; override ping with get-full-version - (ulex:register-handler udata 'login cmod:get-full-version) ;; force setup of the connection - (ulex:register-handler udata 'execute api:execute-requests) - udata)) - -;; set up a connection to the current owner of the dbfile associated with rid -;; then send the query to that dbfile owner and wait for a response. 
-;; -#;(define (rmt:send-receive alldat cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected - (let* (;; (alldat *alldat*) - (areapath (alldat-areapath alldat)) - (dbtype (if (or (not rid)(< rid 1)) ;; this is the criteria for "main.db" - "main" "runs")) - (dbfname (if (equal? dbtype "main") - "main.db" - (conc rid ".db"))) - (dbfile (conc areapath "/.db/" dbfname)) - (ulexconn (rmt:connect alldat dbfname dbtype)) ;; ulexconn is our new *runremote*, it is a dbowner struct < pdat lastrefresh > - (udata (alldat-ulexdat alldat))) - (ulex:remote-request udata ulexconn 'immediate dbfile 'execute rid params))) - ;; need to call this on the other side - ;; (api:execute-requests dbstruct-local (vector (symbol->string cmd) params)))) - - #;(with-input-from-string - (ulex:remote-request udata ulexconn 'immediate dbfile 'execute rid (with-output-to-string (lambda ()(serialize params)))) - (lambda ()(deserialize))) +* + +(import scheme + chicken + data-structures + posix + srfi-1 + srfi-18 + srfi-69 + extras + + clientmod + dbmod + ) + +;; +;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!! +;; + +;; generate entries for ~/.megatestrc with the following +;; +;; grep define ../rmt.scm | grep rmt: |perl -pi -e 's/\(define\s+\((\S+)\W.*$/\1/'|sort -u + +;;====================================================================== +;; S U P P O R T F U N C T I O N S +;;====================================================================== + +;; if a server is either running or in the process of starting call client:setup +;; else return #f to let the calling proc know that there is no server available +;; +(define (rmt:get-connection-info areapath) ;; TODO: push areapath down. + (if *runremote* + *runremote* + (begin + (set! *runremote* (client:find-server areapath)) + (con-obj-to-str-set! *runremote* db:obj->str) + (con-host-set! *runremote* (get-host-name)) + (con-pid-set! 
*runremote* (current-process-id)) + *runremote*))) + + #;(let* ((cinfo (if (remote? runremote) + (remote-conndat runremote) + #f))) + (if cinfo + cinfo + (if (server:check-if-running areapath) + (client:setup areapath runremote) + #f))) + +(define (rmt:on-homehost? runremote) ;; currently hard-wired to #t; the hh-dat lookup below is disabled with #; + #t + #;(let* ((hh-dat (remote-hh-dat runremote))) + (if (pair? hh-dat) + (cdr hh-dat) + (begin + (debug:print-info 0 *default-log-port* "hh-dat="hh-dat) + #f)))) + + +;;====================================================================== + +(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id + +(define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; send cmd with params over the connection from rmt:get-connection-info; rid and attemptnum are accepted for caller compatibility but are not used by this implementation + (let* ((con (rmt:get-connection-info *toppath*))) ;; was the unbound name areapath; the previous implementation took the area path from *toppath* -- TODO confirm *toppath* is visible inside this module + (client:send-receive con cmd params))) ;; result of client:send-receive is returned to the caller unchanged + + + +;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) +;; +;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected +;; #;(common:telemetry-log (conc "rmt:"(->string cmd)) +;; payload: `((rid . ,rid) +;; (params . ,params))) +;; +;; (if (> attemptnum 2) +;; (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum)) +;; +;; (cond +;; ((> attemptnum 2) (thread-sleep! 0.05)) +;; ((> attemptnum 10) (thread-sleep! 0.5)) +;; ((> attemptnum 20) (thread-sleep! 1))) +;; (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15))) +;; (begin (server:run *toppath*) (thread-sleep! 3))) +;; +;; +;; ;;DOT digraph megatest_state_status { +;; ;;DOT ranksep=0; +;; ;;DOT // rankdir=LR; +;; ;;DOT node [shape="box"]; +;; ;;DOT "rmt:send-receive" -> MUTEXLOCK; +;; ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; } +;; ;; do all the prep locked under the rmt-mutex +;; (mutex-lock! *rmt-mutex*) +;; +;; ;; 1. 
check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote +;; ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. +;; ;; 3. do the query, if on homehost use local access +;; ;; +;; (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value +;; (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas +;; (runremote (or area-dat +;; *runremote*)) +;; (attemptnum (+ 1 attemptnum)) +;; (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*))) +;; +;; ;; DOT INIT_RUNREMOTE; // leaving off - doesn't really add to the clarity +;; ;; DOT MUTEXLOCK -> INIT_RUNREMOTE [label="no remote?"]; +;; ;; DOT INIT_RUNREMOTE -> MUTEXLOCK; +;; ;; ensure we have a record for our connection for given area +;; (if (not runremote) ;; can remove this one. should never get here. +;; (begin +;; (set! *runremote* (make-remote)) +;; (let* ((server-info (remote-server-info *runremote*))) +;; (if server-info +;; (begin +;; (remote-server-url-set! *runremote* (server:record->url server-info)) +;; (remote-server-id-set! *runremote* (server:record->id server-info))))) +;; (set! runremote *runremote*))) ;; new runremote will come from this on next iteration +;; +;; ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity +;; ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"]; +;; ;; DOT SET_HOMEHOST -> MUTEXLOCK; +;; ;; ensure we have a homehost record +;; (if (not (pair? (remote-hh-dat runremote))) ;; not on homehost +;; (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little +;; (let ((hh-data (server:choose-server areapath 'homehost))) +;; (remote-hh-dat-set! 
runremote (or hh-data (cons #f #f))))) +;; +;; ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) +;; (cond +;; #;((> (- (current-seconds)(remote-connect-time runremote)) 180) ;; reconnect to server every 180 seconds +;; (debug:print 0 *default-log-port* "Forcing reconnect to server(s) due to 180 second timeout.") +;; (set! *runremote* #f) +;; ;; BUG: close-connections should go here? +;; (mutex-unlock! *rmt-mutex*) +;; (rmt:send-receive cmd rid params attemptnum: 1 area-dat: area-dat)) +;; +;; ;;DOT EXIT; +;; ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" } +;; ;; give up if more than 150 attempts +;; ((> attemptnum 150) +;; (debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. Giving up.") +;; (exit 1)) +;; +;; ;;DOT CASE2 [label="local\nreadonly\nquery"]; +;; ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2} +;; ;;DOT CASE2 -> "rmt:open-qry-close-locally"; +;; ;; readonly mode, read request- handle it - case 2 +;; ((and readonly-mode +;; (member cmd api:read-only-queries)) +;; (mutex-unlock! *rmt-mutex*) +;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2") +;; (rmt:open-qry-close-locally cmd 0 params) +;; ) +;; +;; ;;DOT CASE3 [label="write in\nread-only mode"]; +;; ;;DOT MUTEXLOCK -> CASE3 [label="readonly\nmode?"]; {rank=same "case 3" CASE3} +;; ;;DOT CASE3 -> "#f"; +;; ;; readonly mode, write request. Do nothing, return #f +;; (readonly-mode (extras-readonly-mode *rmt-mutex* *default-log-port* cmd params)) +;; +;; ;; This block was for pre-emptively resetting the connection if there had been no communication for some time. +;; ;; I don't think it adds any value. If the server is not there, just fail and start a new connection. +;; ;; also, the expire-time calculation might not be correct. 
We want, time-since-last-server-access > (server:get-timeout) +;; ;; +;; ;;DOT CASE4 [label="reset\nconnection"]; +;; ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} +;; ;;DOT CASE4 -> "rmt:send-receive"; +;; ;; reset the connection if it has been unused too long +;; ((and runremote +;; ;; (remote-conndat runremote) +;; (> (current-seconds) ;; if it has been more than server-timeout seconds since last contact, close this connection and start a new on +;; (+ (remote-last-access runremote) +;; (remote-server-timeout runremote)))) +;; (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") +;; (http-transport:close-connections runremote) +;; ;; moving this setting of runremote conndat to #f to inside the http-transport:close-connections +;; ;; (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. +;; (mutex-unlock! *rmt-mutex*) +;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) +;; +;; ;;DOT CASE5 [label="local\nread"]; +;; ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; +;; ;;DOT CASE5 -> "rmt:open-qry-close-locally"; +;; +;; ;; on homehost and this is a read +;; ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required +;; (rmt:on-homehost? runremote) +;; (member cmd api:read-only-queries)) ;; this is a read +;; (mutex-unlock! 
*rmt-mutex*) +;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") +;; (rmt:open-qry-close-locally cmd 0 params)) +;; +;; ;;DOT CASE6 [label="init\nremote"]; +;; ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6}; +;; ;;DOT CASE6 -> "rmt:send-receive"; +;; ;; on homehost and this is a write, we already have a server, but server has died +;; +;; ;; reinstate this keep-alive section but inject a time condition into the (add ... +;; ;; +;; ;; ((and (cdr (remote-hh-dat runremote)) ;; on homehost +;; ;; (not (member cmd api:read-only-queries)) ;; this is a write +;; ;; (remote-server-url runremote) ;; have a server +;; ;; (not (server:ping (remote-server-url runremote) (remote-server-id runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. +;; ;; (debug:print 0 *default-log-port* "WARNING: server appears to have died, trying to reconnect, case 6") +;; ;; (http-transport:close-connections area-dat: runremote) ;; make sure to clean up +;; ;; (set! *runremote* (make-remote)) +;; ;; (let* ((server-info (remote-server-info *runremote*))) +;; ;; (if server-info +;; ;; (begin +;; ;; (remote-server-url-set! *runremote* (server:record->url server-info)) +;; ;; (remote-server-id-set! *runremote* (server:record->id server-info))))) +;; ;; (remote-force-server-set! runremote (common:force-server?)) +;; ;; (mutex-unlock! *rmt-mutex*) +;; ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6") +;; ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) +;; +;; ;;DOT CASE7 [label="homehost\nwrite"]; +;; ;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7}; +;; ;;DOT CASE7 -> "rmt:open-qry-close-locally"; +;; ;; on homehost and this is a write, we already have a server +;; ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. 
server NOT required +;; (cdr (remote-hh-dat runremote)) ;; on homehost +;; (not (member cmd api:read-only-queries)) ;; this is a write +;; (remote-server-url runremote)) ;; have a server +;; (mutex-unlock! *rmt-mutex*) +;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") +;; (rmt:open-qry-close-locally cmd 0 params)) +;; +;; ;;DOT CASE8 [label="force\nserver"]; +;; ;;DOT MUTEXLOCK -> CASE8 [label="server not required,\nhave homehost info,\nno connection yet,\nnot a read-only query"]; {rank=same "case 8" CASE8}; +;; ;;DOT CASE8 -> "rmt:open-qry-close-locally"; +;; ;; on homehost, no server contact made and this is a write, passively start a server +;; ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required +;; (cdr (remote-hh-dat runremote)) ;; have homehost +;; (not (remote-server-url runremote)) ;; no connection yet +;; (not (member cmd api:read-only-queries))) ;; not a read-only query +;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") +;; (let ((server-info (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call +;; (if server-info +;; (begin +;; (remote-server-url-set! runremote (server:record->url server-info)) ;; the string can be consumed by the client setup if needed +;; (remote-server-id-set! runremote (server:record->id server-info))) +;; (if (common:force-server?) +;; (server:start-and-wait *toppath*) +;; (server:kind-run *toppath*))) +;; (remote-force-server-set! runremote (common:force-server?)) +;; (mutex-unlock! 
*rmt-mutex*) +;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8.1") +;; (rmt:open-qry-close-locally cmd 0 params))) +;; +;; ;;DOT CASE9 [label="force server\nnot on homehost"]; +;; ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9}; +;; ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive"; +;; ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one +;; (not (remote-conndat runremote))) +;; (and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost +;; (not (remote-conndat runremote)))) ;; and no connection +;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) +;; (mutex-unlock! *rmt-mutex*) +;; (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up? +;; (server:start-and-wait *toppath*)) +;; (remote-conndat-set! runremote (rmt:get-connection-info *toppath* runremote)) ;; calls client:setup which calls client:setup-http +;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as +;; +;; ;;DOT CASE10 [label="on homehost"]; +;; ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10}; +;; ;;DOT CASE10 -> "rmt:open-qry-close-locally"; +;; ;; all set up if get this far, dispatch the query +;; ((and (not (remote-force-server runremote)) +;; (cdr (remote-hh-dat runremote))) ;; we are on homehost +;; (mutex-unlock! 
*rmt-mutex*) +;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 10") +;; (rmt:open-qry-close-locally cmd (if rid rid 0) params)) +;; +;; ;;DOT CASE11 [label="send_receive"]; +;; ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11}; +;; ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"]; +;; ;;DOT CASE11 -> "RESULT" [label="call succeeded"]; +;; ;; not on homehost, do server query +;; (else (extras-case-11 *default-log-port* runremote cmd params attemptnum rid))))) +;; ;;DOT } +;; +;; ;; bunch of small functions factored out of send-receive to make debug easier +;; ;; +;; +;; (define (extras-case-11 *default-log-port* runremote cmd params attemptnum rid) +;; ;; (mutex-unlock! *rmt-mutex*) +;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9") +;; ;; (mutex-lock! *rmt-mutex*) +;; (let* ((conninfo (remote-conndat runremote)) +;; (dat-in (condition-case ;; handling here has +;; ;; caused a lot of +;; ;; problems. However it +;; ;; is needed to deal with +;; ;; attemtped +;; ;; communication to +;; ;; servers that have gone +;; ;; away +;; (http-transport:client-api-send-receive 0 runremote cmd params) +;; ;; (http-transport:client-api-send-receive 0 conninfo cmd params runremote) +;; ((servermismatch) (vector #f "Server id mismatch" )) +;; ((commfail)(vector #f "communications fail")) +;; ((exn)(vector #f "other fail" (print-call-chain))))) +;; (dat (if (and (vector? dat-in) ;; ... check it is a correct size +;; (> (vector-length dat-in) 1)) +;; dat-in +;; (vector #f (conc "communications fail (type 2), dat-in=" dat-in)))) +;; (success (if (vector? dat) (vector-ref dat 0) #f)) +;; (res (if (vector? dat) (vector-ref dat 1) #f))) +;; (if (and (vector? conninfo) (< 5 (vector-length conninfo))) +;; (remote-last-access-set! runremote (current-seconds)) ;; refresh access time +;; (begin +;; (debug:print 0 *default-log-port* "INFO: Should not get here! conninfo=" conninfo) +;; (set! 
conninfo #f)
+;;           (http-transport:close-connections runremote)))
+;;     (debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote)
+;;     (mutex-unlock! *rmt-mutex*)
+;;     (if success ;; success only tells us that the transport was
+;;         ;; successful, have to examine the data to see if
+;;         ;; there was a detected issue at the other end
+;;         (extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd)
+;;         (begin
+;;           (debug:print-error 0 *default-log-port* " dat=" dat)
+;;           (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params))
+;;         )))
+
+;; Print one row per key of *db-stats* at debug level 18, ordered by descending
+;; call count.
+;;
+;; Each *db-stats* value is read as a vector: slot 0 is printed under "Count",
+;; slot 1 under "TotTime", and slot1/slot0 under "Avg".
+;;
+(define (rmt:print-db-stats)
+  (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f"
+    (debug:print 18 *default-log-port* "DB Stats\n========")
+    (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg"))
+    (for-each (lambda (cmd)
+                (let ((cmd-dat (hash-table-ref *db-stats* cmd)))
+                  (debug:print 18 *default-log-port* (format #f fmtstr cmd (vector-ref cmd-dat 0) (vector-ref cmd-dat 1) (/ (vector-ref cmd-dat 1)(vector-ref cmd-dat 0))))))
+              (sort (hash-table-keys *db-stats*)
+                    (lambda (a b)
+                      (> (vector-ref (hash-table-ref *db-stats* a) 0)
+                         (vector-ref (hash-table-ref *db-stats* b) 0)))))))
+
+;; Find the *db-stats* key for run-id with the largest average (slot1/slot0).
+;;
+;; Only keys containing the substring "run-id=<run-id> " are considered.
+;; Returns (cmd . average), or (none . 0) when no key matches or when the last
+;; key examined has a slot-0 count of 10 or less. The whole scan runs with
+;; *db-stats-mutex* held.
+;;
+(define (rmt:get-max-query-average run-id)
+  (mutex-lock! *db-stats-mutex*)
+  (let* ((runkey (conc "run-id=" run-id " "))
+         (cmds   (filter (lambda (x)
+                           (substring-index runkey x))
+                         (hash-table-keys *db-stats*)))
+         (res    (if (null? cmds)
+                     (cons 'none 0)
+                     (let loop ((cmd     (car cmds))
+                                (tal     (cdr cmds))
+                                (max-cmd (car cmds))
+                                (res     0))
+                       (let* ((cmd-dat    (hash-table-ref *db-stats* cmd))
+                              (tot        (vector-ref cmd-dat 0))
+                              (curravg    (/ (vector-ref cmd-dat 1) (vector-ref cmd-dat 0))) ;; count is never zero by construction
+                              (currmax    (max res curravg))
+                              (newmax-cmd (if (> curravg res) cmd max-cmd)))
+                         (if (null? tal)
+                             (if (> tot 10)
+                                 (cons newmax-cmd currmax)
+                                 (cons 'none 0))
+                             (loop (car tal)(cdr tal) newmax-cmd currmax)))))))
+    (mutex-unlock! *db-stats-mutex*)
+    res))
+
+;; Run an API request directly against the local database.
+;;
+;;   cmd        - request symbol; treated as a write unless it is a member of
+;;                api:read-only-queries
+;;   run-id     - only forwarded to the retry call
+;;   params     - request parameters
+;;   remretries - keyword, default 5: how many more attempts are allowed
+;;
+;; Returns slot 1 of the reply vector on success. A write against a db file
+;; that is not writable is logged and skipped (the reply is then '()). A failed
+;; request is retried after a random delay of up to five seconds; when the
+;; retries are exhausted the result is #f.
+;;
+;; Two fixes relative to the previous version:
+;;  * the value produced by the retry is now returned; before, the retry ran
+;;    but the stale result of the first failed attempt was returned instead
+;;  * a stray "/" token in front of the (set! *db-last-access* ...) form is gone
+;;
+(define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5))
+  (let* ((qry-is-write    (not (member cmd api:read-only-queries)))
+         (db-file-path    (db:dbfile-path))
+         (dbstructs-local (db:setup #t))
+         (read-only       (not (file-write-access? db-file-path)))
+         (start           (current-milliseconds))
+         (resdat          (if (not (and read-only qry-is-write))
+                              (let ((v (api:execute-requests dbstructs-local (vector (symbol->string cmd) params))))
+                                ;; anything that is not a vector of at least two
+                                ;; slots is replaced by a dummy "success, no data"
+                                (if (and (vector? v)
+                                         (> (vector-length v) 1))
+                                    (vector (vector-ref v 0)(vector-ref v 1)) ;; fresh copy of the first two slots
+                                    (vector #t '())))
+                              (vector #t '())))
+         (success         (vector-ref resdat 0))
+         (res             (vector-ref resdat 1))
+         (duration        (- (current-milliseconds) start))) ;; only needed by the disabled rmt:update-db-stats call
+    (if (and read-only qry-is-write)
+        (debug:print 0 *default-log-port* "ERROR: attempt to write to read-only database ignored. cmd=" cmd))
+    (cond
+     (success
+      ;; (rmt:update-db-stats run-id cmd params duration)
+      ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it
+      (if qry-is-write
+          (let ((start-time (current-seconds)))
+            (mutex-lock! *db-multi-sync-mutex*)
+            (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client)
+            (mutex-unlock! *db-multi-sync-mutex*)))
+      res)
+     ((> remretries 0)
+      (debug:print-error 0 *default-log-port* "local query failed. Trying again.")
+      (thread-sleep! (/ (random 5000) 1000)) ;; some random delay
+      (rmt:open-qry-close-locally cmd run-id params remretries: (- remretries 1)))
+     (else
+      (debug:print-error 0 *default-log-port* "too many retries in rmt:open-qry-close-locally, giving up")
+      #f))))
+
+;; Send a request over an existing runremote.
+;;
+;; A run-id of #f is sent as 0. Returns slot 1 of the reply when the reply is
+;; non-#f and its slot 0 is true, otherwise #f.
+;;
+(define (rmt:send-receive-no-auto-client-setup runremote cmd run-id params)
+  (let* ((run-id (if run-id run-id 0))
+         (res    (http-transport:client-api-send-receive run-id runremote cmd params)))
+    (if (and res (vector-ref res 0))
+        (vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!!
+        #f)))
+
+;;======================================================================
+;;
+;; A C T U A L A P I C A L L S
+;;
+;;======================================================================
+
+;;======================================================================
+;; S E R V E R
+;;======================================================================
+
+;; Send the 'kill-server request for run-id.
+(define (rmt:kill-server run-id)
+  (rmt:send-receive 'kill-server run-id (list run-id)))
+
+;; Send the 'start-server request; run-id travels in the parameter list, the
+;; rid slot is always 0.
+(define (rmt:start-server run-id)
+  (rmt:send-receive 'start-server 0 (list run-id)))
+
+;;======================================================================
+;; M I S C
+;;======================================================================
+
+;; Send the 'login request carrying *toppath*, megatest-version and the client
+;; signature.
+(define (rmt:login run-id)
+  (rmt:send-receive 'login run-id (list *toppath* megatest-version (client:get-signature))))
+
+;; This login does no retries under the hood - it acts a bit like a ping.
+;; Deprecated for nmsg-transport.
+;;
+;; Sends 'login through rmt:send-receive-no-auto-client-setup with run-id 0.
+(define (rmt:login-no-auto-client-setup runremote)
+  (rmt:send-receive-no-auto-client-setup runremote 'login 0 (list *toppath* megatest-version (client:get-signature))))
+
+;; hand off a call to one of the db:queries statements
+;; added run-id to make looking up the correct db possible
+;;
+;; The parameter list sent is (stmtname run-id . params), so run-id is
+;; always the second element, ahead of the caller supplied params.
+;;
+(define (rmt:general-call stmtname run-id . params)
+  (rmt:send-receive 'general-call run-id (append (list stmtname run-id) params)))
+
+
+;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host
+(define (rmt:get-latest-host-load hostname)
+  (rmt:send-receive 'get-latest-host-load 0 (list hostname)))
+
+;; Send the 'sdb-qry request; run-id is used as the rid only and is not part of
+;; the parameter list.
+(define (rmt:sdb-qry qry val run-id)
+  ;; add caching if qry is 'getid or 'getstr
+  (rmt:send-receive 'sdb-qry run-id (list qry val)))
+
+;; NOT COMPLETED
+;;
+;; NOTE(review): `user` and `params` are ignored, and testpatt is passed as the
+;; parameter argument directly instead of wrapped in a list as every other
+;; wrapper in this section does - confirm the intended payload before
+;; relying on this.
+(define (rmt:runtests user run-id testpatt params)
+  (rmt:send-receive 'runtests run-id testpatt))
+
+;; Send the 'get-run-record-ids request (rid #f).
+(define (rmt:get-run-record-ids target run keynames test-patt)
+  (rmt:send-receive 'get-run-record-ids #f (list target run keynames test-patt)))
+
+;; Send the 'get-changed-record-ids request (rid #f).
+(define (rmt:get-changed-record-ids since-time)
+  (rmt:send-receive 'get-changed-record-ids #f (list since-time)) )
+
+;; Send the 'drop-all-triggers request (rid #f, no parameters).
+(define (rmt:drop-all-triggers)
+  (rmt:send-receive 'drop-all-triggers #f '()))
+
+;; Send the 'create-all-triggers request (rid #f, no parameters).
+(define (rmt:create-all-triggers)
+  (rmt:send-receive 'create-all-triggers #f '()))
+
+;;======================================================================
+;; T E S T M E T A
+;;======================================================================
+
+;; Send the 'get-tests-tags request (rid #f, no parameters).
+(define (rmt:get-tests-tags)
+  (rmt:send-receive 'get-tests-tags #f '()))
+
+;;======================================================================
+;; K E Y S
+;;======================================================================
+
+;; These require run-id because the values come from the run!
+;;
+;; Send the 'get-key-val-pairs request for run-id.
+(define (rmt:get-key-val-pairs run-id)
+  (rmt:send-receive 'get-key-val-pairs run-id (list run-id)))
+
+;; Return the keys, using the *db-keys* cache when it is already set; otherwise
+;; send 'get-keys and store the reply in *db-keys*.
+(define (rmt:get-keys)
+  (if *db-keys* *db-keys*
+      (let ((res (rmt:send-receive 'get-keys #f '())))
+        (set! *db-keys* res)
+        res)))
+
+;; Same reply handling as rmt:get-keys but always sends the request, using the
+;; 'get-keys-write command.
+(define (rmt:get-keys-write) ;; dummy query to force server start
+  (let ((res (rmt:send-receive 'get-keys-write #f '())))
+    (set! *db-keys* res)
+    res))
+
+;; we don't reuse run-id's (except possibly *after* a db cleanup) so it is safe
+;; to cache the results in a hash
+;;
+;; A cached #f is indistinguishable from "not cached", so a #f reply is asked
+;; for again on the next call.
+;;
+(define (rmt:get-key-vals run-id)
+  (or (hash-table-ref/default *keyvals* run-id #f)
+      (let ((res (rmt:send-receive 'get-key-vals #f (list run-id))))
+        (hash-table-set! *keyvals* run-id res)
+        res)))
+
+;; Send the 'get-targets request (rid #f, no parameters).
+(define (rmt:get-targets)
+  (rmt:send-receive 'get-targets #f '()))
+
+;; Send the 'get-target request; run-id must be a number.
+(define (rmt:get-target run-id)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-target run-id (list run-id)))
+
+;; Send the 'get-run-times request (rid #f).
+(define (rmt:get-run-times runpatt targetpatt)
+  (rmt:send-receive 'get-run-times #f (list runpatt targetpatt )))
+
+
+;;======================================================================
+;; T E S T S
+;;======================================================================
+
+;; Just some syntactic sugar
+(define (rmt:register-test run-id test-name item-path)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:general-call 'register-test run-id run-id test-name item-path))
+
+;; Send the 'get-test-id request; run-id must be a number.
+(define (rmt:get-test-id run-id testname item-path)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))
+
+;; run-id is NOT used
+;;
+;; A non-numeric test-id is logged together with a call chain and #f is
+;; returned without sending anything.
+;;
+(define (rmt:get-test-info-by-id run-id test-id)
+  (if (number? test-id)
+      (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
+      (begin
+        (debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
+        (print-call-chain (current-error-port))
+        #f)))
+
+;; Send the 'test-get-rundir-from-test-id request.
+(define (rmt:test-get-rundir-from-test-id run-id test-id)
+  (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id)))
+
+;; Open the test db for test-id. work-area is used as the path when it is a
+;; string; otherwise the path is fetched with rmt:test-get-rundir-from-test-id.
+(define (rmt:open-test-db-by-test-id run-id test-id #!key (work-area #f))
+  (assert (number? run-id) "FATAL: Run id required.")
+  (let* ((test-path (if (string? work-area)
+                        work-area
+                        (rmt:test-get-rundir-from-test-id run-id test-id))))
+    (debug:print 3 *default-log-port* "TEST PATH: " test-path)
+    (open-test-db test-path)))
+
+;; WARNING: This currently bypasses the transaction wrapped writes system
+(define (rmt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'test-set-state-status-by-id run-id (list run-id test-id newstate newstatus newcomment)))
+
+;; Send the 'set-tests-state-status request; run-id must be a number.
+(define (rmt:set-tests-state-status run-id testnames currstate currstatus newstate newstatus)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'set-tests-state-status run-id (list run-id testnames currstate currstatus newstate newstatus)))
+
+;; Send the 'get-tests-for-run request with all twelve arguments forwarded in
+;; order; run-id must be a number.
+(define (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-tests-for-run run-id (list run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)))
+
+;; Send the 'get-tests-for-run-state-status request; run-id must be a number.
+(define (rmt:get-tests-for-run-state-status run-id testpatt last-update)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-tests-for-run-state-status run-id (list run-id testpatt last-update)))
+
+;; get stuff via synchash
+(define (rmt:synchash-get run-id proc synckey keynum params)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'synchash-get run-id (list run-id proc synckey keynum params)))
+
+;; Send the 'get-tests-for-run-mindata request; run-id must be a number.
+(define (rmt:get-tests-for-run-mindata run-id testpatt states status not-in)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-tests-for-run-mindata run-id (list run-id testpatt states status not-in)))
+
+;; Collect 'get-tests-for-run-mindata results for several runs, one thread per
+;; run.
+;;
+;;   run-ids - list of run ids, or #f to query every id from rmt:get-all-run-ids
+;;   testpatt, states, status, not-in - forwarded unchanged to each request
+;;
+;; Returns the concatenation of all list replies; the order of the runs in the
+;; result depends on which thread finishes first. A non-list reply is logged
+;; and skipped.
+;;
+;; While more than five threads are tracked, no new thread is started until
+;; terminated ones have been dropped from the tracking list.
+;;
+;; Fix: every thread still tracked when the last one has been started is now
+;; joined before the result is returned. Previously the result was returned
+;; straight away, so rows from threads that were still running could be
+;; missing. The wait-for-a-free-slot branch also sleeps briefly now instead of
+;; spinning.
+;;
+(define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in)
+  (let ((multi-run-mutex (make-mutex))
+        (run-id-list     (if run-ids
+                             run-ids
+                             (rmt:get-all-run-ids)))
+        (result          '()))
+    (if (not (null? run-id-list))
+        (let ((pending
+               (let loop ((hed     (car run-id-list))
+                          (tal     (cdr run-id-list))
+                          (threads '()))
+                 (if (> (length threads) 5)
+                     (begin
+                       (thread-sleep! 0.01) ;; let the workers run instead of busy-waiting
+                       (loop hed tal (filter (lambda (th)(not (member (thread-state th) '(terminated dead)))) threads)))
+                     (let* ((newthread (make-thread
+                                        (lambda ()
+                                          (let ((res (rmt:send-receive 'get-tests-for-run-mindata hed (list hed testpatt states status not-in))))
+                                            (if (list? res)
+                                                (begin
+                                                  (mutex-lock! multi-run-mutex)
+                                                  (set! result (append result res))
+                                                  (mutex-unlock! multi-run-mutex))
+                                                (debug:print-error 0 *default-log-port* "get-tests-for-run-mindata failed for run-id " hed ", testpatt " testpatt ", states " states ", status " status ", not-in " not-in))))
+                                        (conc "multi-run-thread for run-id " hed)))
+                            (newthreads (cons newthread threads)))
+                       (thread-start! newthread)
+                       (thread-sleep! 0.05) ;; give that thread some time to start
+                       (if (null? tal)
+                           newthreads
+                           (loop (car tal)(cdr tal) newthreads)))))))
+          ;; threads dropped from the list above had already terminated, so only
+          ;; these can still be running. thread-join! re-raises an exception
+          ;; that killed the worker; log it so one bad run does not abort the rest.
+          (for-each (lambda (th)
+                      (handle-exceptions
+                       exn
+                       (debug:print-error 0 *default-log-port* "thread \"" (thread-name th) "\" terminated with an exception")
+                       (thread-join! th)))
+                    pending)))
+    result))
+
+;; ;; IDEA: Threadify these - they spend a lot of time waiting ...
+;; ;;
+;; (define (rmt:get-tests-for-runs-mindata run-ids testpatt states status not-in)
+;;   (let ((run-id-list (if run-ids
+;;                          run-ids
+;;                          (rmt:get-all-run-ids))))
+;;     (apply append (map (lambda (run-id)
+;;                          (rmt:send-receive 'get-tests-for-run-mindata run-id (list run-ids testpatt states status not-in)))
+;;                        run-id-list))))
+
+;; Unless noted otherwise, each procedure below asserts that run-id is a
+;; number and then forwards its arguments, run-id first, through
+;; rmt:send-receive under the command symbol matching its own name.
+
+(define (rmt:delete-test-records run-id test-id)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'delete-test-records run-id (list run-id test-id)))
+
+(define (rmt:test-set-state-status run-id test-id state status msg)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'test-set-state-status run-id (list run-id test-id state status msg)))
+
+(define (rmt:test-toplevel-num-items run-id test-name)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'test-toplevel-num-items run-id (list run-id test-name)))
+
+;; (define (rmt:get-previous-test-run-record run-id test-name item-path)
+;;   (rmt:send-receive 'get-previous-test-run-record run-id (list run-id test-name item-path)))
+
+(define (rmt:get-matching-previous-test-run-records run-id test-name item-path)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-matching-previous-test-run-records run-id (list run-id test-name item-path)))
+
+(define (rmt:test-get-logfile-info run-id test-name)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'test-get-logfile-info run-id (list run-id test-name)))
+
+(define (rmt:test-get-records-for-index-file run-id test-name)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'test-get-records-for-index-file run-id (list run-id test-name)))
+
+(define (rmt:get-testinfo-state-status run-id test-id)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-testinfo-state-status run-id (list run-id test-id)))
+
+;; Goes through rmt:general-call with the 'test-set-log statement, and only when
+;; logf is a string; for any other logf nothing is sent. Note that the
+;; statement parameters are passed as logf first, test-id second.
+(define (rmt:test-set-log! run-id test-id logf)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (if (string? logf)(rmt:general-call 'test-set-log run-id logf test-id)))
+
+(define (rmt:test-set-top-process-pid run-id test-id pid)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'test-set-top-process-pid run-id (list run-id test-id pid)))
+
+(define (rmt:test-get-top-process-pid run-id test-id)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'test-get-top-process-pid run-id (list run-id test-id)))
+
+;; No run-id argument and no assert; sent with rid #f.
+(define (rmt:get-run-ids-matching-target keynames target res runname testpatt statepatt statuspatt)
+  (rmt:send-receive 'get-run-ids-matching-target #f (list keynames target res runname testpatt statepatt statuspatt)))
+
+;; NOTE: This will open and access ALL run databases.
+;;
+;; Sends one request per run id returned by rmt:get-run-ids-matching-target and
+;; appends the replies into a single list.
+;;
+(define (rmt:test-get-paths-matching-keynames-target-new keynames target res testpatt statepatt statuspatt runname)
+  (let ((run-ids (rmt:get-run-ids-matching-target keynames target res runname testpatt statepatt statuspatt)))
+    (apply append
+           (map (lambda (run-id)
+                  (rmt:send-receive 'test-get-paths-matching-keynames-target-new run-id (list run-id keynames target res testpatt statepatt statuspatt runname)))
+                run-ids))))
+
+;; mode defaults to '(normal) and itemmaps to #f; both are appended to the
+;; parameter list after ref-item-path.
+(define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path #!key (mode '(normal))(itemmaps #f))
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps)))
+
+(define (rmt:get-count-tests-running-for-run-id run-id)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id)))
+
+(define (rmt:get-not-completed-cnt run-id)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-not-completed-cnt run-id (list run-id)))
+
+
+;; Statistical queries
+
+(define (rmt:get-count-tests-running run-id)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-count-tests-running run-id (list run-id)))
+
+(define (rmt:get-count-tests-running-for-testname run-id testname)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-count-tests-running-for-testname run-id (list run-id testname)))
+
+(define (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-count-tests-running-in-jobgroup run-id (list run-id jobgroup)))
+
+;; state and status are extra hints not usually used in the calculation
+;;
+(define (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status comment)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'set-state-status-and-roll-up-items run-id (list run-id test-name item-path state status comment)))
+
+(define (rmt:set-state-status-and-roll-up-run run-id state status)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'set-state-status-and-roll-up-run run-id (list run-id state status)))
+
+
+;; Goes through rmt:general-call; test-name is passed three times as the
+;; statement parameters (presumably one per placeholder in the statement -
+;; verify against the 'update-pass-fail-counts query).
+(define (rmt:update-pass-fail-counts run-id test-name)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:general-call 'update-pass-fail-counts run-id test-name test-name test-name))
+
+(define (rmt:top-test-set-per-pf-counts run-id test-name)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'top-test-set-per-pf-counts run-id (list run-id test-name)))
+
+(define (rmt:get-raw-run-stats run-id)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-raw-run-stats run-id (list run-id)))
+
+;; No run-id argument and no assert; sent with rid #f.
+(define (rmt:get-test-times runname target)
+  (rmt:send-receive 'get-test-times #f (list runname target )))
+
+;;======================================================================
+;; R U N S
+;;======================================================================
+
+;; BUG - LOOK AT HOW THIS WORKS!!!
+;; +(define (rmt:get-run-info run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-run-info #f (list run-id))) + +(define (rmt:get-num-runs runpatt) + (rmt:send-receive 'get-num-runs #f (list runpatt))) + +(define (rmt:get-runs-cnt-by-patt runpatt targetpatt keys) + (rmt:send-receive 'get-runs-cnt-by-patt #f (list runpatt targetpatt keys))) + +;; Use the special run-id == #f scenario here since there is no run yet +(define (rmt:register-run keyvals runname state status user contour) + (rmt:send-receive 'register-run #f (list keyvals runname state status user contour))) + +(define (rmt:get-run-name-from-id run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-run-name-from-id #f (list run-id))) + +(define (rmt:delete-run run-id) + (rmt:send-receive 'delete-run #f (list run-id))) + +(define (rmt:update-run-stats run-id stats) + (rmt:send-receive 'update-run-stats #f (list run-id stats))) + +(define (rmt:delete-old-deleted-test-records) + (rmt:send-receive 'delete-old-deleted-test-records #f '())) + +(define (rmt:get-runs runpatt count offset keypatts) + (rmt:send-receive 'get-runs #f (list runpatt count offset keypatts))) + +(define (rmt:simple-get-runs runpatt count offset target last-update) + (rmt:send-receive 'simple-get-runs #f (list runpatt count offset target last-update))) + +(define (rmt:get-all-run-ids) + (rmt:send-receive 'get-all-run-ids #f '())) + +(define (rmt:get-prev-run-ids run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-prev-run-ids #f (list run-id))) + +(define (rmt:lock/unlock-run run-id lock unlock user) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'lock/unlock-run #f (list run-id lock unlock user))) + +;; set/get status +(define (rmt:get-run-status run-id) + (assert (number? 
run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-run-status #f (list run-id))) + +(define (rmt:get-run-state run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'get-run-state #f (list run-id))) + + +(define (rmt:set-run-status run-id run-status #!key (msg #f)) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'set-run-status #f (list run-id run-status msg))) + +(define (rmt:set-run-state-status run-id state status ) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'set-run-state-status #f (list run-id state status))) + +(define (rmt:update-tesdata-on-repilcate-db old-lt new-lt) +(rmt:send-receive 'update-tesdata-on-repilcate-db #f (list old-lt new-lt))) + +(define (rmt:update-run-event_time run-id) + (assert (number? run-id) "FATAL: Run id required.") + (rmt:send-receive 'update-run-event_time #f (list run-id))) + +(define (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-runs-update #!key (sort-order "asc")) ;; fields of #f uses default + (rmt:send-receive 'get-runs-by-patt #f (list keys runnamepatt targpatt offset limit fields last-runs-update sort-order))) + +(define (rmt:find-and-mark-incomplete run-id ovr-deadtime) + (assert (number? run-id) "FATAL: Run id required.") + ;; (if (rmt:send-receive 'have-incompletes? run-id (list run-id ovr-deadtime)) + (rmt:send-receive 'mark-incomplete run-id (list run-id ovr-deadtime))) ;; ) + +(define (rmt:get-main-run-stats run-id) + (assert (number? 
run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-main-run-stats #f (list run-id)))
+
+(define (rmt:get-var varname) ;; remote 'get-var call
+  (rmt:send-receive 'get-var #f (list varname)))
+
+(define (rmt:del-var varname) ;; remote 'del-var call
+  (rmt:send-receive 'del-var #f (list varname)))
+
+(define (rmt:set-var varname value) ;; remote 'set-var call
+  (rmt:send-receive 'set-var #f (list varname value)))
+
+(define (rmt:inc-var varname) ;; remote 'inc-var call
+  (rmt:send-receive 'inc-var #f (list varname)))
+
+(define (rmt:dec-var varname) ;; remote 'dec-var call
+  (rmt:send-receive 'dec-var #f (list varname)))
+
+(define (rmt:add-var varname value) ;; remote 'add-var call
+  (rmt:send-receive 'add-var #f (list varname value)))
+
+;;======================================================================
+;; M U L T I R U N Q U E R I E S
+;;======================================================================
+
+;; Need to move this to multi-run section and make associated changes
+(define (rmt:find-and-mark-incomplete-all-runs #!key (ovr-deadtime #f)) ;; calls rmt:find-and-mark-incomplete once per id from rmt:get-all-run-ids
+  (let ((run-ids (rmt:get-all-run-ids)))
+    (for-each (lambda (run-id)
+	       (rmt:find-and-mark-incomplete run-id ovr-deadtime))
+	     run-ids)))
+
+;; get the previous record for when this test was run where all keys match but runname
+;; returns #f if no such test found, returns a single test record if found
+;;
+;; Run this at the client end since we have to connect to multiple run-id dbs
+;;
+(define (rmt:get-previous-test-run-record run-id test-name item-path)
+  (let* ((keyvals (rmt:get-key-val-pairs run-id))
+	 (keys (rmt:get-keys))
+	 (selstr (string-intersperse keys ",")) ;; selstr and qrystr feed only the debug:print a few lines down
+	 (qrystr (string-intersperse (map (lambda (x)(conc x "=?")) keys) " AND ")))
+    (if (not keyvals)
+	#f
+	(let ((prev-run-ids (rmt:get-prev-run-ids run-id)))
+	  ;; for each run starting with the most recent look to see if there is a matching test
+	  ;; if found then return that matching test record
+	  (debug:print 4 *default-log-port* "selstr: " selstr ", qrystr: " qrystr ", keyvals: " keyvals ", previous run ids found: " prev-run-ids)
+	  (if (null? 
prev-run-ids) #f
+	      (let loop ((hed (car prev-run-ids))
+			 (tal (cdr prev-run-ids)))
+		(let ((results (rmt:get-tests-for-run hed (conc test-name "/" item-path) '() '() ;; run-id testpatt states statuses
+						      #f #f #f ;; offset limit not-in hide/not-hide
+						      #f #f #f #f 'normal))) ;; sort-by sort-order qryvals last-update mode
+		  (debug:print 4 *default-log-port* "Got tests for run-id " run-id ", test-name " test-name ", item-path " item-path ": " results)
+		  (if (and (null? results)
+			   (not (null? tal)))
+		      (loop (car tal)(cdr tal))
+		      (if (null? results) #f
+			  (car results))))))))))
+
+(define (rmt:get-run-stats) ;; remote 'get-run-stats call with an empty params list
+  (rmt:send-receive 'get-run-stats #f '()))
+
+;;======================================================================
+;; S T E P S
+;;======================================================================
+
+;; Getting steps is more complicated.
+;;
+;; If given work area
+;; 1. Find the testdat.db file
+;; 2. Open the testdat.db file and do the query
+;; If not given the work area
+;; 1. Do a remote call to get the test path
+;; 2. Continue as above
+;;
+;;(define (rmt:get-steps-for-test run-id test-id)
+;; (rmt:send-receive 'get-steps-data run-id (list test-id)))
+
+(define (rmt:teststep-set-status! run-id test-id teststep-name state-in status-in comment logfile) ;; validate state/status (warn only), then send 'teststep-set-status!
+  (assert (number? run-id) "FATAL: Run id required.")
+  (let* ((state (items:check-valid-items "state" state-in)) ;; false when state-in is rejected by items:check-valid-items
+	 (status (items:check-valid-items "status" status-in))) ;; false when status-in is rejected
+    (if (or (not state)(not status))
+	(debug:print 3 *default-log-port* "WARNING: Invalid " (if status "state" "status") ;; status ok => state is the bad one; label now matches the value printed next
+		     " value \"" (if status state-in status-in) "\", update your validvalues section in megatest.config"))
+    (rmt:send-receive 'teststep-set-status! run-id (list run-id test-id teststep-name state-in status-in comment logfile)))) ;; the raw state-in/status-in are sent even after a warning
+
+
+(define (rmt:delete-steps-for-test! run-id test-id)
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'delete-steps-for-test! 
run-id (list run-id test-id)))
+
+(define (rmt:get-steps-for-test run-id test-id) ;; remote 'get-steps-for-test call; run-id is both the second send-receive arg and the first param
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-steps-for-test run-id (list run-id test-id)))
+
+(define (rmt:get-steps-info-by-id run-id test-step-id) ;; NOTE(review): second send-receive arg is #f here although the sibling step calls pass run-id - confirm intended
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-steps-info-by-id #f (list run-id test-step-id)))
+
+;;======================================================================
+;; T E S T D A T A
+;;======================================================================
+
+(define (rmt:read-test-data run-id test-id categorypatt #!key (work-area #f)) ;; work-area is accepted but not referenced in the body
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'read-test-data run-id (list run-id test-id categorypatt)))
+
+(define (rmt:read-test-data-varpatt run-id test-id categorypatt varpatt #!key (work-area #f)) ;; work-area is accepted but not referenced in the body
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'read-test-data-varpatt run-id (list run-id test-id categorypatt varpatt)))
+
+(define (rmt:get-data-info-by-id run-id test-data-id) ;; second send-receive arg is #f; run-id still travels in params
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'get-data-info-by-id #f (list run-id test-data-id)))
+
+(define (rmt:testmeta-add-record testname) ;; remote 'testmeta-add-record call
+  (rmt:send-receive 'testmeta-add-record #f (list testname)))
+
+(define (rmt:testmeta-get-record testname) ;; remote 'testmeta-get-record call
+  (rmt:send-receive 'testmeta-get-record #f (list testname)))
+
+(define (rmt:testmeta-update-field test-name fld val) ;; remote 'testmeta-update-field call
+  (rmt:send-receive 'testmeta-update-field #f (list test-name fld val)))
+
+(define (rmt:test-data-rollup run-id test-id status) ;; remote 'test-data-rollup call, second send-receive arg is run-id
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'test-data-rollup run-id (list run-id test-id status)))
+
+(define (rmt:csv->test-data run-id test-id csvdata) ;; remote 'csv->test-data call, second send-receive arg is run-id
+  (assert (number? 
run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'csv->test-data run-id (list run-id test-id csvdata)))
+
+;;======================================================================
+;; T A S K S
+;;======================================================================
+
+(define (rmt:tasks-find-task-queue-records target run-name test-patt state-patt action-patt) ;; command symbol is 'find-task-queue-records (no tasks- prefix)
+  (rmt:send-receive 'find-task-queue-records #f (list target run-name test-patt state-patt action-patt)))
+
+(define (rmt:tasks-add action owner target runname testpatt params) ;; remote 'tasks-add call; the params argument is sent as the last list element
+  (rmt:send-receive 'tasks-add #f (list action owner target runname testpatt params)))
+
+(define (rmt:tasks-set-state-given-param-key param-key new-state) ;; remote 'tasks-set-state-given-param-key call
+  (rmt:send-receive 'tasks-set-state-given-param-key #f (list param-key new-state)))
+
+(define (rmt:tasks-get-last target runname) ;; remote 'tasks-get-last call
+  (rmt:send-receive 'tasks-get-last #f (list target runname)))
+
+;;======================================================================
+;; N O S Y N C D B
+;;======================================================================
+
+(define (rmt:no-sync-set var val) ;; remote 'no-sync-set call; params built with quasiquote instead of list
+  (rmt:send-receive 'no-sync-set #f `(,var ,val)))
+
+(define (rmt:no-sync-get/default var default) ;; remote 'no-sync-get/default call, sending var and the fallback value
+  (rmt:send-receive 'no-sync-get/default #f `(,var ,default)))
+
+(define (rmt:no-sync-del! var) ;; remote 'no-sync-del! call
+  (rmt:send-receive 'no-sync-del! 
#f `(,var)))
+
+(define (rmt:no-sync-get-lock keyname) ;; remote 'no-sync-get-lock call
+  (rmt:send-receive 'no-sync-get-lock #f `(,keyname)))
+
+;;======================================================================
+;; A R C H I V E S
+;;======================================================================
+
+(define (rmt:archive-get-allocations testname itempath dneeded) ;; remote 'archive-get-allocations call
+  (rmt:send-receive 'archive-get-allocations #f (list testname itempath dneeded)))
+
+(define (rmt:archive-register-block-name bdisk-id archive-path) ;; remote 'archive-register-block-name call
+  (rmt:send-receive 'archive-register-block-name #f (list bdisk-id archive-path)))
+
+(define (rmt:archive-allocate-testsuite/area-to-block block-id testsuite-name areakey) ;; note: command symbol sent is 'archive-allocate-test-to-block, not the function's own name
+  (rmt:send-receive 'archive-allocate-test-to-block #f (list block-id testsuite-name areakey)))
+
+(define (rmt:archive-register-disk bdisk-name bdisk-path df) ;; remote 'archive-register-disk call
+  (rmt:send-receive 'archive-register-disk #f (list bdisk-name bdisk-path df)))
+
+(define (rmt:test-set-archive-block-id run-id test-id archive-block-id) ;; remote 'test-set-archive-block-id call, second send-receive arg is run-id
+  (assert (number? run-id) "FATAL: Run id required.")
+  (rmt:send-receive 'test-set-archive-block-id run-id (list run-id test-id archive-block-id)))
+
+(define (rmt:test-get-archive-block-info archive-block-id) ;; remote 'test-get-archive-block-info call
+  (rmt:send-receive 'test-get-archive-block-info #f (list archive-block-id)))
+
+
+(define (rmtmod:calc-ro-mode runremote *toppath*) ;; returns the read-only flag; *toppath* here is a parameter, not the global
+  (if (and runremote
+	   (remote-ro-mode-checked runremote)) ;; already computed once: reuse the value stored on runremote
+      (remote-ro-mode runremote)
+      (let* ((mtcfgfile (conc *toppath* "/megatest.config"))
+	     (ro-mode (not (file-write-access? mtcfgfile)))) ;; TODO: use dbstruct or runremote to figure this out in future
+	(if runremote
+	    (begin
+	      (remote-ro-mode-set! runremote ro-mode) ;; store the result and mark it checked so the file test is skipped next time
+	      (remote-ro-mode-checked-set! runremote #t)
+	      ro-mode)
+	    ro-mode)))) ;; no runremote record: nothing to cache, just return the computed flag
+
+(define (extras-readonly-mode rmt-mutex log-port cmd params) ;; unlocks rmt-mutex, logs the refused write, returns #f
+  (mutex-unlock! rmt-mutex)
+  (debug:print-info 12 log-port "rmt:send-receive, case 3")
+  (debug:print 0 log-port "WARNING: write transaction requested on a readonly area. 
cmd="cmd" params="params)
+  #f)
+
+(define (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params) ;; log, drop the connection, then retry with attemptnum + 1
+  (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
+  (mutex-lock! *rmt-mutex*) ;; connection teardown and url reset happen while holding the mutex
+  (http-transport:close-connections runremote)
+  (remote-server-url-set! runremote #f)
+  (mutex-unlock! *rmt-mutex*)
+  (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1")
+  (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) ;; the retry's result is this function's result
+
+(define (extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd) ;; returns res, or backs off and retries when res is #(_ overloaded)
+  (if (and (vector? res)
+	   (eq? (vector-length res) 2)
+	   (eq? (vector-ref res 1) 'overloaded)) ;; since we are
+      ;; looking at the
+      ;; data to carry the
+      ;; error we'll use a
+      ;; fairly obtuse
+      ;; combo to minimise
+      ;; the chances of
+      ;; some sort of
+      ;; collision. this
+      ;; is the case where
+      ;; the returned data
+      ;; is bad or the
+      ;; server is
+      ;; overloaded and we
+      ;; want to ease off
+      ;; the queries
+      (let ((wait-delay (+ attemptnum (* attemptnum 10)))) ;; i.e. 11 * attemptnum seconds
+	(debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.")
+	(mutex-lock! *rmt-mutex*)
+	(http-transport:close-connections runremote)
+	(set! *runremote* #f) ;; force starting over  NOTE(review): assigns the *runremote* variable, not the runremote parameter - confirm it is bound in this scope
+	(mutex-unlock! *rmt-mutex*)
+	(thread-sleep! wait-delay) ;; the sleep happens after the mutex is released
+	(rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
+      res)) ;; All good, return res
+
+#;(set-functions rmt:send-receive remote-server-url-set!
+		 http-transport:close-connections remote-conndat-set!
+		 debug:print debug:print-info
+		 remote-ro-mode remote-ro-mode-set!
+		 remote-ro-mode-checked-set! 
remote-ro-mode-checked) + + ) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -25,11 +25,11 @@ (declare (uses db)) (declare (uses common)) (declare (uses items)) (declare (uses runconfig)) (declare (uses tests)) -(declare (uses server)) +(declare (uses servermod)) (declare (uses mt)) (declare (uses archive)) ;; (declare (uses filedb)) (include "common_records.scm") Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -14,857 +14,858 @@ ;; ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see . ;; -(require-extension (srfi 18) extras tcp s11n) - -(use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest - directory-utils posix-extras matchable utils) - -(use spiffy uri-common intarweb http-client spiffy-request-vars) - -(declare (unit server)) - -(declare (uses commonmod)) - -(declare (uses common)) -(declare (uses db)) -(declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. -;; (declare (uses synchash)) -(declare (uses http-transport)) -;;(declare (uses rpc-transport)) -(declare (uses launch)) -;; (declare (uses daemon)) - -(import commonmod) - -(include "common_records.scm") -(include "db_records.scm") - -(define (server:make-server-url hostport) - (if (not hostport) - #f - (conc "http://" (car hostport) ":" (cadr hostport)))) - -(define *server-loop-heart-beat* (current-seconds)) - -;;====================================================================== -;; P K T S S T U F F -;;====================================================================== - -;; ??? - -;;====================================================================== -;; P K T S S T U F F -;;====================================================================== - -;; ??? 
- -;;====================================================================== -;; S E R V E R -;;====================================================================== - -;; Call this to start the actual server -;; - -;;====================================================================== -;; S E R V E R U T I L I T I E S -;;====================================================================== - -;; Get the transport -(define (server:get-transport) - (if *transport-type* - *transport-type* - (let ((ttype (string->symbol - (or (args:get-arg "-transport") - (configf:lookup *configdat* "server" "transport") - "rpc")))) - (set! *transport-type* ttype) - ttype))) - -;; Generate a unique signature for this server -(define (server:mk-signature) - (message-digest-string (md5-primitive) - (with-output-to-string - (lambda () - (write (list (current-directory) - (current-process-id) - (argv))))))) - -(define (server:get-client-signature) - (if *my-client-signature* *my-client-signature* - (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic - (set! *my-client-signature* sig) - *my-client-signature*))) - -(define (server:get-server-id) - (if *server-id* *server-id* - (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic - (set! 
*server-id* sig) - *server-id*))) - -;; When using zmq this would send the message back (two step process) -;; with spiffy or rpc this simply returns the return data to be returned -;; -(define (server:reply return-addr query-sig success/fail result) - (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result) - ;; (send-message pubsock target send-more: #t) - ;; (send-message pubsock - (case (server:get-transport) - ((rpc) (db:obj->string (vector success/fail query-sig result))) - ((http) (db:obj->string (vector success/fail query-sig result))) - ((fs) result) - (else - (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) - result))) - -;; Given an area path, start a server process ### NOTE ### > file 2>&1 -;; if the target-host is set -;; try running on that host -;; incidental: rotate logs in logs/ dir. -;; -(define (server:run areapath) ;; areapath is *toppath* for a given testsuite area - (let* ((testsuite (common:get-testsuite-name)) - (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) - (profile-mode (or (configf:lookup *configdat* "misc" "profilesw") - "")) - (cmdln (conc (common:get-megatest-exe) - " -server - ";; (or target-host "-") - (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") - " -daemonize " - "") - ;; " -log " logfile - " -m testsuite:" testsuite - " " profile-mode - )) ;; (conc " >> " logfile " 2>&1 &"))))) - (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!? - (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0))) - ;; we want the remote server to start in *toppath* so push there - (push-directory areapath) - (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") - (thread-start! log-rotate) - - ;; host.domain.tld match host? 
- ;; (if (and target-host - ;; ;; look at target host, is it host.domain.tld or ip address and does it - ;; ;; match current ip or hostname - ;; (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) - ;; (not (equal? curr-ip target-host))) - ;; (begin - ;; (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) - ;; (setenv "TARGETHOST" target-host))) - ;; - (setenv "TARGETHOST_LOGF" logfile) - (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time - (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) - (system (conc "nbfake " cmdln)) - (unsetenv "TARGETHOST_LOGF") - ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) - (thread-join! log-rotate) - (pop-directory))) - -;; given a path to a server log return: host port startseconds server-id -;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let -;; example of what it's looking for in the log file: -;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 - -(define (server:logf-get-start-info logf) - (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id - (dbprep-rx (regexp "^SERVER: dbprep")) - (dbprep-found 0) - (bad-dat (list #f #f #f #f #f))) - (handle-exceptions - exn - (begin - ;; WARNING: this is potentially dangerous to blanket ignore the errors - (if (file-exists? logf) - (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn)) - bad-dat) ;; no idea what went wrong, call it a bad server - (with-input-from-file - logf - (lambda () - (let loop ((inl (read-line)) - (lnum 0)) - (if (not (eof-object? 
inl)) - (let ((mlst (string-match server-rx inl)) - (dbprep (string-match dbprep-rx inl))) - (if dbprep (set! dbprep-found 1)) - (if (not mlst) - (if (< lnum 500) ;; give up if more than 500 lines of server log read - (loop (read-line)(+ lnum 1)) - (begin - (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) - bad-dat)) - (match mlst - ((_ host port start server-id pid) - (list host - (string->number port) - (string->number start) - server-id - (string->number pid))) - (else - (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) - bad-dat)))) - (begin - (if dbprep-found - (begin - (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) - (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? - (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) - bad-dat)))))))) - -;; ;; get a list of servers from the log files, with all relevant data -;; ;; ( mod-time host port start-time pid ) -;; ;; -;; (define (server:get-list areapath #!key (limit #f)) -;; (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) -;; (day-seconds (* 24 60 60))) -;; ;; if the directory exists continue to get the list -;; ;; otherwise attempt to create the logs dir and then -;; ;; continue -;; (if (if (directory-exists? (conc areapath "/logs")) -;; '() -;; (if (file-write-access? areapath) -;; (begin -;; (condition-case -;; (create-directory (conc areapath "/logs") #t) -;; (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) -;; (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) -;; (directory-exists? (conc areapath "/logs"))) -;; '())) -;; -;; ;; Get the list of server logs. 
-;; (let* ( -;; ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. -;; ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) -;; (server-logs (glob (conc areapath "/logs/server-*-*.log"))) -;; (num-serv-logs (length server-logs))) -;; (if (or (null? server-logs) (= num-serv-logs 0)) -;; (let () -;; (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time)) -;; '() -;; ) -;; (let loop ((hed (string-chomp (car server-logs))) -;; (tal (cdr server-logs)) -;; (res '())) -;; (let* ((mod-time (handle-exceptions -;; exn -;; (begin -;; (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn) -;; (current-seconds)) ;; 0 -;; (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted -;; (down-time (- (current-seconds) mod-time)) -;; (serv-dat (if (or (< num-serv-logs 10) -;; (< down-time 900)) ;; day-seconds)) -;; (server:logf-get-start-info hed) -;; '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at -;; (serv-rec (cons mod-time serv-dat)) -;; (fmatch (string-match fname-rx hed)) -;; (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) -;; (new-res (if (null? serv-dat) -;; res -;; (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let -;; (if (null? 
tal) -;; (if (and limit -;; (> (length new-res) limit)) -;; new-res ;; (take new-res limit) <= need intelligent sorting before this will work -;; new-res) -;; (loop (string-chomp (car tal)) (cdr tal) new-res))))))))) - -#;(define (server:get-num-alive srvlst) - (let ((num-alive 0)) - (for-each - (lambda (server) - (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to get server start-time and/or mod-time from " server ", exn=" exn)) - (match-let (((mod-time host port start-time server-id pid) - server)) - (let* ((uptime (- (current-seconds) mod-time)) - (runtime (if start-time - (- mod-time start-time) - 0))) - (if (< uptime 5)(set! num-alive (+ num-alive 1))))))) - srvlst) - num-alive)) - -;; ;; given a list of servers get a list of valid servers, i.e. at least -;; ;; 10 seconds old, has started and is less than 1 hour old and is -;; ;; active (i.e. mod-time < 10 seconds -;; ;; -;; ;; mod-time host port start-time pid -;; ;; -;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off -;; ;; and servers should stick around for about two hours or so. -;; ;; -;; (define (server:get-best srvlst) -;; (let* ((nums (server:get-num-servers)) -;; (now (current-seconds)) -;; (slst (sort -;; (filter (lambda (rec) -;; (if (and (list? 
rec) -;; (> (length rec) 2)) -;; (let ((start-time (list-ref rec 3)) -;; (mod-time (list-ref rec 0))) -;; ;; (print "start-time: " start-time " mod-time: " mod-time) -;; (and start-time mod-time -;; (> (- now start-time) 0) ;; been running at least 0 seconds -;; (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds -;; (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set -;; (< (- now start-time) -;; (+ (- (string->number (configf:lookup *configdat* "server" "runtime")) -;; 180) -;; (random 360)))) ;; under one hour running time +/- 180 -;; )) -;; #f)) -;; srvlst) -;; (lambda (a b) -;; (< (list-ref a 3) -;; (list-ref b 3)))))) -;; (if (> (length slst) nums) -;; (take slst nums) -;; slst))) - -;; ;; switch from server:get-list to server:get-servers-info -;; ;; -;; (define (server:get-first-best areapath) -;; (let ((srvrs (server:get-best (server:get-list areapath)))) -;; (if (and srvrs -;; (not (null? srvrs))) -;; (car srvrs) -;; #f))) -;; -;; (define (server:get-rand-best areapath) -;; (let ((srvrs (server:get-best (server:get-list areapath)))) -;; (if (and (list? srvrs) -;; (not (null? srvrs))) -;; (let* ((len (length srvrs)) -;; (idx (random len))) -;; (list-ref srvrs idx)) -;; #f))) - -(define (server:record->id servr) - (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to get server id from " servr ", exn=" exn) - #f) - (match-let (((host port start-time server-id pid) - servr)) - (if server-id - server-id - #f)))) - -(define (server:record->url servr) - (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to get server url from " servr ", exn=" exn) - #f) - (match-let (((host port start-time server-id pid) - servr)) - (if (and host port) - (conc host ":" port) - #f)))) - - -;; if server-start-last exists, and wasn't old enough, wait + 1, then call this function recursively until it is old enough. -;; if it is old enough, overwrite it and wait 0.25 seconds. 
-;; if it then has the wrong server key, wait + 1 and call this function recursively. -;; -#;(define (server:wait-for-server-start-last-flag areapath) - (let* ((start-flag (conc areapath "/logs/server-start-last")) - ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) - (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4)) - (server-key (conc (get-host-name) "-" (current-process-id)))) - (if (file-exists? start-flag) - (let* ((fmodtime (file-modification-time start-flag)) - (delta (- (current-seconds) fmodtime)) - (old-enough (> delta idletime)) - (new-server-key "")) - ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than seconds, and the new file still has the same server key as you just wrote, return #t. - ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process. - (if (and old-enough - (begin - (debug:print-info 2 *default-log-port* "Writing " start-flag) - (with-output-to-file start-flag (lambda () (print server-key))) - (thread-sleep! 0.25) - (set! new-server-key (with-input-from-file start-flag (lambda () (read-line)))) - (equal? server-key new-server-key))) - #t - ;; If either of the above conditions is not true, print a "Gating server start" message, wait + 1, then call this function recursively. - (begin - (debug:print-info 0 *default-log-port* "Gating server start, last start: " - (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server")) - - (thread-sleep! 
( + 1 idletime)) - (server:wait-for-server-start-last-flag areapath))))))) - -;; oldest server alive determines host then choose random of youngest -;; five servers on that host -;; -(define (server:get-servers-info areapath) - ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.") - (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo"))) - (if (not (file-exists? servinfodir)) - (create-directory servinfodir)) - (let* ((allfiles (glob (conc servinfodir"/*"))) - (res (make-hash-table))) - (for-each - (lambda (f) - (let* ((hostport (pathname-strip-directory f)) - (serverdat (server:logf-get-start-info f))) - (match serverdat - ((host port start server-id pid) - (if (and host port start server-id pid) - (hash-table-set! res hostport serverdat) - (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))) - (else - (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))))) - allfiles) - res))) - -;; check the .servinfo directory, are there other servers running on this -;; or another host? 
-;; -;; returns #t => ok to start another server -;; #f => not ok to start another server -;; -(define (server:minimal-check areapath) - (server:clean-up-old areapath) - (let* ((srvdir (server:get-servinfo-dir areapath)) ;; (conc areapath"/.servinfo")) - (servrs (glob (conc srvdir"/*"))) - (thishostip (server:get-best-guess-address (get-host-name))) - (thisservrs (glob (conc srvdir"/"thishostip":*"))) - (homehostinf (server:choose-server areapath 'homehost)) - (havehome (car homehostinf)) - (wearehome (cdr homehostinf))) - (debug:print-info 0 *default-log-port* thishostip", have homehost: "havehome", we are homehost: "wearehome - ", numservers: "(length thisservrs)) - (cond - ((not havehome) #t) ;; no homehost yet, go for it - ((and havehome wearehome (< (length thisservrs) 20)) #t) ;; we are home and less than 20 servers, ok to start another - ((and havehome (not wearehome)) #f) ;; we are not the home host - ((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running - (else - (debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs) - #t)))) - - -(define server-last-start 0) - - -;; oldest server alive determines host then choose random of youngest -;; five servers on that host -;; -;; mode: -;; best - get best server (random of newest five) -;; home - get home host based on oldest server -;; info - print info -(define (server:choose-server areapath #!optional (mode 'best)) - ;; age is current-starttime - ;; find oldest alive - ;; 1. sort by age ascending and ping until good - ;; find alive rand from youngest - ;; 1. sort by age descending - ;; 2. take five - ;; 3. 
check alive, discard if not and repeat - ;; first we clean up old server files - (server:clean-up-old areapath) - (let* ((since-last (- (current-seconds) server-last-start)) - (server-start-delay 10)) - (if ( < (- (current-seconds) server-last-start) 10 ) - (begin - (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) - (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds") - (thread-sleep! server-start-delay) - ) - (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) - ) - ) - (let* ((serversdat (server:get-servers-info areapath)) - (servkeys (hash-table-keys serversdat)) - (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last - (sort servkeys ;; list of "host:port" - (lambda (a b) - (>= (list-ref (hash-table-ref serversdat a) 2) - (list-ref (hash-table-ref serversdat b) 2)))) - '()))) - (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat) - (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys) - (if (not (null? by-time-asc)) - (let* ((oldest (last by-time-asc)) - (oldest-dat (hash-table-ref serversdat oldest)) - (host (list-ref oldest-dat 0)) - (all-valid (filter (lambda (x) - (equal? host (list-ref (hash-table-ref serversdat x) 0))) - by-time-asc)) - (best-ten (lambda () - (if (> (length all-valid) 11) - (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out - (if (> (length all-valid) 8) - (drop-right all-valid 1) - all-valid)))) - (names->dats (lambda (names) - (map (lambda (x) - (hash-table-ref serversdat x)) - names))) - (am-home? (lambda () - (let* ((currhost (get-host-name)) - (bestadrs (server:get-best-guess-address currhost))) - (or (equal? host currhost) - (equal? 
host bestadrs)))))) - (case mode - ((info) - (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid) - (print "youngest: "(hash-table-ref serversdat (car all-valid)))) - ((home) host) - ((homehost) (cons host (am-home?))) ;; shut up old code - ((home?) (am-home?)) - ((best-ten)(names->dats (best-ten))) - ((all-valid)(names->dats all-valid)) - ((best) (let* ((best-ten (best-ten)) - (len (length best-ten))) - (hash-table-ref serversdat (list-ref best-ten (random len))))) - ((count)(length all-valid)) - (else - (debug:print 0 *default-log-port* "ERROR: invalid command "mode) - #f))) - (begin - (server:run areapath) - (set! server-last-start (current-seconds)) - ;; (thread-sleep! 3) - (case mode - ((homehost) (cons #f #f)) - (else #f)))))) - -(define (server:get-servinfo-dir areapath) - (let* ((spath (conc areapath"/.servinfo"))) - (if (not (file-exists? spath)) - (create-directory spath #t)) - spath)) - -(define (server:clean-up-old areapath) - ;; any server file that has not been touched in ten minutes is effectively dead - (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*")))) - (for-each - (lambda (sfile) - (let* ((modtime (handle-exceptions - exn - (begin - (debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile) - (current-seconds)) - (file-modification-time sfile)))) - (if (and (number? modtime) - (> (- (current-seconds) modtime) - 600)) - (begin - (debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.") - (handle-exceptions - exn - (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile) - (delete-file sfile)))))) - sfiles))) - -;; would like to eventually get rid of this -;; -(define (common:on-homehost?) 
- (server:choose-server *toppath* 'home?)) - -;; kind start up of server, wait before allowing another server for a given -;; area to be launched -;; -(define (server:kind-run areapath) - ;; look for $MT_RUN_AREA_HOME/logs/server-start-last - ;; and wait for it to be at least seconds old - ;; (server:wait-for-server-start-last-flag areapath) - (let loop () - (if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2) - (begin - (if (common:low-noise-print 30 "our-host-load") - (debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server.")) - (loop)))) - (if (< (server:choose-server areapath 'count) 20) - (server:run areapath)) - #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running? - (let* ((lock-file (conc areapath "/logs/server-start.lock"))) - (let* ((start-flag (conc areapath "/logs/server-start-last"))) - (common:simple-file-lock-and-wait lock-file expire-time: 25) - (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag) - (system (conc "touch " start-flag)) ;; lazy but safe - (server:run areapath) - (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED". - (common:simple-file-release-lock lock-file))) - (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another."))) - -;; this one seems to be the general entry point -;; -(define (server:start-and-wait areapath #!key (timeout 60)) - (let ((give-up-time (+ (current-seconds) timeout))) - (let loop ((server-info (server:check-if-running areapath)) - (try-num 0)) - (if (or server-info - (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. 
- (server:record->url server-info) - (let* ( (servers (server:choose-server areapath 'all-valid)) - (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0))) - (if (and (> try-num 0) ;; first time through simply wait a little while then try again - (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one - (server:run areapath)) - (thread-sleep! 5) - (loop (server:check-if-running areapath) - (+ try-num 1))))))) - -(define (server:get-num-servers #!key (numservers 2)) - (let ((ns (string->number - (or (configf:lookup *configdat* "server" "numservers") "notanumber")))) - (or ns numservers))) - -;; no longer care if multiple servers are started by accident. older servers will drop off in time. -;; -(define (server:check-if-running areapath) ;; #!key (numservers "2")) - (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed - (servers (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath)))) - (if (or (and servers - (null? servers)) - (not servers)) - ;; (and (list? servers) - ;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers - #f - (let loop ((hed (car servers)) - (tal (cdr servers))) - (let ((res (server:check-server hed))) - (if res - hed - (if (null? 
tal) - #f - (loop (car tal)(cdr tal))))))))) - -;; ping the given server -;; -(define (server:check-server server-record) - (let* ((server-url (server:record->url server-record)) - (server-id (server:record->id server-record)) - (res (server:ping server-url server-id))) - (if res - server-url - #f))) - -(define (server:kill servr) - (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to get host and/or port from " servr ", exn=" exn) - #f) - (match-let (((mod-time hostname port start-time server-id pid) - servr)) - (tasks:kill-server hostname pid)))) - -;; called in megatest.scm, host-port is string hostname:port -;; -;; NOTE: This is NOT called directly from clients as not all transports support a client running -;; in the same process as the server. -;; -(define (server:ping host:port server-id #!key (do-exit #f)) - (let* ((host-port (cond - ((string? host:port) - (let ((slst (string-split host:port ":"))) - (if (eq? (length slst) 2) - (list (car slst)(string->number (cadr slst))) - #f))) - (else - #f)))) - (cond - ((and (list? host-port) - (eq? (length host-port) 2)) - (let* ((myrunremote (make-remote)) - (iface (car host-port)) - (port (cadr host-port)) - (server-dat (client:connect iface port server-id myrunremote)) - (login-res (rmt:login-no-auto-client-setup myrunremote))) - (if (and (list? login-res) - (car login-res)) - (begin - ;; (print "LOGIN_OK") - (if do-exit (exit 0)) - #t) - (begin - ;; (print "LOGIN_FAILED") - (if do-exit (exit 1)) - #f)))) - (else - (if host:port - (debug:print 0 *default-log-port* "ERROR: bad host:port "host:port)) - (if do-exit - (exit 1) - #f))))) - -;; run ping in separate process, safest way in some cases -;; -(define (server:ping-server ifaceport) - (with-input-from-pipe - (conc (common:get-megatest-exe) " -ping " ifaceport) - (lambda () - (let loop ((inl (read-line)) - (res "NOREPLY")) - (if (eof-object? 
inl) - (case (string->symbol res) - ((NOREPLY) #f) - ((LOGIN_OK) #t) - (else #f)) - (loop (read-line) inl)))))) - -;; NOT USED (well, ok, reference in rpc-transport but otherwise not used). -;; -(define (server:login toppath) - (lambda (toppath) - (set! *db-last-access* (current-seconds)) ;; might not be needed. - (if (equal? *toppath* toppath) - #t - #f))) - -;; timeout is hms string: 1h 5m 3s, default is 1 minute -;; This is currently broken. Just use the number of hours with no unit. -;; Default is 60 seconds. -;; -(define (server:expiration-timeout) - (let ((tmo (configf:lookup *configdat* "server" "timeout"))) - (if (and (string? tmo) - (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below - (* 3600 (string->number tmo)) - 600))) - -(define (server:get-best-guess-address hostname) - (let ((res #f)) - (for-each - (lambda (adr) - (if (not (eq? (u8vector-ref adr 0) 127)) - (set! res adr))) - ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME - (vector->list (hostinfo-addresses (hostname->hostinfo hostname)))) - (string-intersperse - (map number->string - (u8vector->list - (if res res (hostname->ip hostname)))) "."))) - -;; (define server:sync-lock-token "SERVER_SYNC_LOCK") -;; (define (server:release-sync-lock) -;; (db:no-sync-del! *no-sync-db* server:sync-lock-token)) -;; (define (server:have-sync-lock?) -;; (let* ((have-lock-pair (db:no-sync-get-lock *no-sync-db* server:sync-lock-token)) -;; (have-lock? (car have-lock-pair)) -;; (lock-time (cdr have-lock-pair)) -;; (lock-age (- (current-seconds) lock-time))) -;; (cond -;; (have-lock? #t) -;; ((>lock-age -;; (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180))) -;; (server:release-sync-lock) -;; (server:have-sync-lock?)) -;; (else #f)))) - -;; moving this here as it needs access to db and cannot be in common. 
-;; - -(define (server:get-bruteforce-syncer dbstruct #!key (fork-to-background #f) (persist-until-sync #f)) - (debug:print "WARNING: bruteforce-syncer is called but has been disabled!") - (lambda () - (debug:print "WARNING: bruteforce-syncer is called but has been disabled!")) - #;(let* ((sqlite-exe (or (get-environment-variable "MT_SQLITE3_EXE"))) ;; defined in cfg.sh - (sync-log (or (args:get-arg "-sync-log") (conc *toppath* "/logs/sync-" (current-process-id) "-" (get-host-name) ".log"))) - (tmp-area (common:get-db-tmp-area)) - (tmp-db (conc tmp-area "/megatest.db")) - (staging-file (conc *toppath* "/.megatest.db")) - (mtdbfile (conc *toppath* "/megatest.db")) - (lockfile (common:get-sync-lock-filepath)) - (sync-cmd-core (conc sqlite-exe" " tmp-db " .dump | "sqlite-exe" " staging-file "&>"sync-log)) - (sync-cmd (if fork-to-background - (conc "/usr/bin/env NBFAKE_LOG="*toppath*"/logs/last-server-sync-"(current-process-id)".log nbfake \""sync-cmd-core" && /bin/mv -f " staging-file " " mtdbfile" \"") - sync-cmd-core)) - (default-min-intersync-delay 2) - (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: default-min-intersync-delay)) - (default-duty-cycle 0.1) - (duty-cycle (configf:lookup-number *configdat* "server" "sync-duty-cycle" default: default-duty-cycle)) - (last-sync-seconds 10) ;; we will adjust this to a measurement and delay last-sync-seconds * (1 - duty-cycle) - (calculate-off-time (lambda (work-duration duty-cycle) - (* (/ (- 1 duty-cycle) duty-cycle) last-sync-seconds))) - (off-time min-intersync-delay) ;; adjusted in closure below. 
- (do-a-sync - (lambda () - (BB> "Start do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync) - (let* ((finalres - (let retry-loop ((num-tries 0)) - (if (common:simple-file-lock lockfile) - (begin - (cond - ((not (or fork-to-background persist-until-sync)) - (debug:print 0 *default-log-port* "INFO: syncer thread sleeping for max of (server.minimum-intersync-delay="min-intersync-delay - " , off-time="off-time" seconds ]") - (thread-sleep! (max off-time min-intersync-delay))) - (else - (debug:print 0 *default-log-port* "INFO: syncer thread NOT sleeping ; maybe time-to-exit..."))) - - (if (not (configf:lookup *configdat* "server" "disable-db-snapshot")) - (common:snapshot-file mtdbfile subdir: ".db-snapshot")) - (delete-file* staging-file) - (let* ((start-time (current-milliseconds)) - (res (system sync-cmd)) - (dbbackupfile (conc mtdbfile ".backup")) - (res2 - (cond - ((eq? 0 res ) - (handle-exceptions - exn - #f - (if (file-exists? dbbackupfile) - (delete-file* dbbackupfile) - ) - (if (eq? 0 (file-size sync-log)) - (delete-file* sync-log)) - (system (conc "/bin/mv " staging-file " " mtdbfile)) - - (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000)) - (set! off-time (calculate-off-time - last-sync-seconds - (cond - ((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1)) - duty-cycle) - (else - (debug:print 0 *default-log-port* "WARNING: ["(common:human-time)"] server.sync-duty-cycle is invalid. Should be a number between 0 and 1, but "duty-cycle" was specified. 
Using default value: "default-duty-cycle) - default-duty-cycle)))) - - (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec") - (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec ; with duty-cycle of "duty-cycle" off time is now "off-time) - 'sync-completed)) - (else - (system (conc "/bin/cp "sync-log" "sync-log".fail")) - (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail") - (if (file-exists? (conc mtdbfile ".backup")) - (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile))) - #f)))) - (common:simple-file-release-lock lockfile) - (BB> "released lockfile: " lockfile) - (when (common:file-exists? lockfile) - (BB> "DID NOT ACTUALLY RELEASE LOCKFILE")) - res2) ;; end let - );; end begin - ;; else - (cond - (persist-until-sync - (thread-sleep! 1) - (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; we're in a fork-to-background so we need to succeed. Let's wait a jiffy and and try again. num-tries="num-tries" (waiting for lockfile="lockfile" to disappear)") - (retry-loop (add1 num-tries))) - (else - (thread-sleep! 
(max off-time (+ last-sync-seconds min-intersync-delay))) - (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; not syncing.") - 'parallel-sync-in-progress)) - ) ;; end if got lockfile - ) - )) - (BB> "End do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync" and result="finalres) - finalres) - ) ;; end lambda - )) - do-a-sync)) - +;; (require-extension (srfi 18) extras tcp s11n) +;; +;; (use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest +;; directory-utils posix-extras matchable utils) +;; +;; (use spiffy uri-common intarweb http-client spiffy-request-vars) +;; +;; (declare (unit server)) +;; +;; (declare (uses commonmod)) +;; +;; (declare (uses common)) +;; (declare (uses db)) +;; (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. +;; ;; (declare (uses synchash)) +;; (declare (uses http-transport)) +;; ;;(declare (uses rpc-transport)) +;; (declare (uses launch)) +;; ;; (declare (uses daemon)) +;; +;; (import commonmod) +;; +;; (include "common_records.scm") +;; (include "db_records.scm") +;; +;; (define (server:make-server-url hostport) +;; (if (not hostport) +;; #f +;; (conc "http://" (car hostport) ":" (cadr hostport)))) +;; +;; (define *server-loop-heart-beat* (current-seconds)) +;; +;; ;;====================================================================== +;; ;; P K T S S T U F F +;; ;;====================================================================== +;; +;; ;; ??? +;; +;; ;;====================================================================== +;; ;; P K T S S T U F F +;; ;;====================================================================== +;; +;; ;; ??? 
+;; +;; ;;====================================================================== +;; ;; S E R V E R +;; ;;====================================================================== +;; +;; ;; Call this to start the actual server +;; ;; +;; +;; ;;====================================================================== +;; ;; S E R V E R U T I L I T I E S +;; ;;====================================================================== +;; +;; ;; Get the transport +;; (define (server:get-transport) +;; (if *transport-type* +;; *transport-type* +;; (let ((ttype (string->symbol +;; (or (args:get-arg "-transport") +;; (configf:lookup *configdat* "server" "transport") +;; "rpc")))) +;; (set! *transport-type* ttype) +;; ttype))) +;; +;; ;; Generate a unique signature for this server +;; (define (server:mk-signature) +;; (message-digest-string (md5-primitive) +;; (with-output-to-string +;; (lambda () +;; (write (list (current-directory) +;; (current-process-id) +;; (argv))))))) +;; +;; (define (server:get-client-signature) +;; (if *my-client-signature* *my-client-signature* +;; (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic +;; (set! *my-client-signature* sig) +;; *my-client-signature*))) +;; +;; (define (server:get-server-id) +;; (if *server-id* *server-id* +;; (let ((sig (server:mk-signature))) ;; clients re-use the server:mk-signature logic +;; (set! 
*server-id* sig) +;; *server-id*))) +;; +;; ;; When using zmq this would send the message back (two step process) +;; ;; with spiffy or rpc this simply returns the return data to be returned +;; ;; +;; (define (server:reply return-addr query-sig success/fail result) +;; (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result) +;; ;; (send-message pubsock target send-more: #t) +;; ;; (send-message pubsock +;; (case (server:get-transport) +;; ((rpc) (db:obj->string (vector success/fail query-sig result))) +;; ((http) (db:obj->string (vector success/fail query-sig result))) +;; ((fs) result) +;; (else +;; (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) +;; result))) +;; +;; ;; Given an area path, start a server process ### NOTE ### > file 2>&1 +;; ;; if the target-host is set +;; ;; try running on that host +;; ;; incidental: rotate logs in logs/ dir. +;; ;; +;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area +;; (let* ((testsuite (common:get-testsuite-name)) +;; (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log")) +;; (profile-mode (or (configf:lookup *configdat* "misc" "profilesw") +;; "")) +;; (cmdln (conc (common:get-megatest-exe) +;; " -server - ";; (or target-host "-") +;; (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") +;; " -daemonize " +;; "") +;; ;; " -log " logfile +;; " -m testsuite:" testsuite +;; " " profile-mode +;; )) ;; (conc " >> " logfile " 2>&1 &"))))) +;; (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!? 
+;; (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0))) +;; ;; we want the remote server to start in *toppath* so push there +;; (push-directory areapath) +;; (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") +;; (thread-start! log-rotate) +;; +;; ;; host.domain.tld match host? +;; ;; (if (and target-host +;; ;; ;; look at target host, is it host.domain.tld or ip address and does it +;; ;; ;; match current ip or hostname +;; ;; (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) +;; ;; (not (equal? curr-ip target-host))) +;; ;; (begin +;; ;; (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) +;; ;; (setenv "TARGETHOST" target-host))) +;; ;; +;; (setenv "TARGETHOST_LOGF" logfile) +;; (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time +;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) +;; (system (conc "nbfake " cmdln)) +;; (unsetenv "TARGETHOST_LOGF") +;; ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) +;; (thread-join! 
log-rotate) +;; (pop-directory))) +;; +;; ;; given a path to a server log return: host port startseconds server-id +;; ;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let +;; ;; example of what it's looking for in the log file: +;; ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 +;; +;; (define (server:logf-get-start-info logf) +;; (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id +;; (dbprep-rx (regexp "^SERVER: dbprep")) +;; (dbprep-found 0) +;; (bad-dat (list #f #f #f #f #f))) +;; (handle-exceptions +;; exn +;; (begin +;; ;; WARNING: this is potentially dangerous to blanket ignore the errors +;; (if (file-exists? logf) +;; (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn)) +;; bad-dat) ;; no idea what went wrong, call it a bad server +;; (with-input-from-file +;; logf +;; (lambda () +;; (let loop ((inl (read-line)) +;; (lnum 0)) +;; (if (not (eof-object? inl)) +;; (let ((mlst (string-match server-rx inl)) +;; (dbprep (string-match dbprep-rx inl))) +;; (if dbprep (set! 
dbprep-found 1)) +;; (if (not mlst) +;; (if (< lnum 500) ;; give up if more than 500 lines of server log read +;; (loop (read-line)(+ lnum 1)) +;; (begin +;; (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) +;; bad-dat)) +;; (match mlst +;; ((_ host port start server-id pid) +;; (list host +;; (string->number port) +;; (string->number start) +;; server-id +;; (string->number pid))) +;; (else +;; (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) +;; bad-dat)))) +;; (begin +;; (if dbprep-found +;; (begin +;; (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) +;; (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? +;; (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) +;; bad-dat)))))))) +;; +;; ;; ;; get a list of servers from the log files, with all relevant data +;; ;; ;; ( mod-time host port start-time pid ) +;; ;; ;; +;; ;; (define (server:get-list areapath #!key (limit #f)) +;; ;; (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) +;; ;; (day-seconds (* 24 60 60))) +;; ;; ;; if the directory exists continue to get the list +;; ;; ;; otherwise attempt to create the logs dir and then +;; ;; ;; continue +;; ;; (if (if (directory-exists? (conc areapath "/logs")) +;; ;; '() +;; ;; (if (file-write-access? areapath) +;; ;; (begin +;; ;; (condition-case +;; ;; (create-directory (conc areapath "/logs") #t) +;; ;; (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) +;; ;; (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) +;; ;; (directory-exists? (conc areapath "/logs"))) +;; ;; '())) +;; ;; +;; ;; ;; Get the list of server logs. 
+;; ;; (let* ( +;; ;; ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. +;; ;; ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) +;; ;; (server-logs (glob (conc areapath "/logs/server-*-*.log"))) +;; ;; (num-serv-logs (length server-logs))) +;; ;; (if (or (null? server-logs) (= num-serv-logs 0)) +;; ;; (let () +;; ;; (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time)) +;; ;; '() +;; ;; ) +;; ;; (let loop ((hed (string-chomp (car server-logs))) +;; ;; (tal (cdr server-logs)) +;; ;; (res '())) +;; ;; (let* ((mod-time (handle-exceptions +;; ;; exn +;; ;; (begin +;; ;; (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn) +;; ;; (current-seconds)) ;; 0 +;; ;; (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted +;; ;; (down-time (- (current-seconds) mod-time)) +;; ;; (serv-dat (if (or (< num-serv-logs 10) +;; ;; (< down-time 900)) ;; day-seconds)) +;; ;; (server:logf-get-start-info hed) +;; ;; '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at +;; ;; (serv-rec (cons mod-time serv-dat)) +;; ;; (fmatch (string-match fname-rx hed)) +;; ;; (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) +;; ;; (new-res (if (null? serv-dat) +;; ;; res +;; ;; (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let +;; ;; (if (null? 
tal) +;; ;; (if (and limit +;; ;; (> (length new-res) limit)) +;; ;; new-res ;; (take new-res limit) <= need intelligent sorting before this will work +;; ;; new-res) +;; ;; (loop (string-chomp (car tal)) (cdr tal) new-res))))))))) +;; +;; #;(define (server:get-num-alive srvlst) +;; (let ((num-alive 0)) +;; (for-each +;; (lambda (server) +;; (handle-exceptions +;; exn +;; (begin +;; (debug:print-info 0 *default-log-port* "Unable to get server start-time and/or mod-time from " server ", exn=" exn)) +;; (match-let (((mod-time host port start-time server-id pid) +;; server)) +;; (let* ((uptime (- (current-seconds) mod-time)) +;; (runtime (if start-time +;; (- mod-time start-time) +;; 0))) +;; (if (< uptime 5)(set! num-alive (+ num-alive 1))))))) +;; srvlst) +;; num-alive)) +;; +;; ;; ;; given a list of servers get a list of valid servers, i.e. at least +;; ;; ;; 10 seconds old, has started and is less than 1 hour old and is +;; ;; ;; active (i.e. mod-time < 10 seconds +;; ;; ;; +;; ;; ;; mod-time host port start-time pid +;; ;; ;; +;; ;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off +;; ;; ;; and servers should stick around for about two hours or so. +;; ;; ;; +;; ;; (define (server:get-best srvlst) +;; ;; (let* ((nums (server:get-num-servers)) +;; ;; (now (current-seconds)) +;; ;; (slst (sort +;; ;; (filter (lambda (rec) +;; ;; (if (and (list? 
rec) +;; ;; (> (length rec) 2)) +;; ;; (let ((start-time (list-ref rec 3)) +;; ;; (mod-time (list-ref rec 0))) +;; ;; ;; (print "start-time: " start-time " mod-time: " mod-time) +;; ;; (and start-time mod-time +;; ;; (> (- now start-time) 0) ;; been running at least 0 seconds +;; ;; (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds +;; ;; (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set +;; ;; (< (- now start-time) +;; ;; (+ (- (string->number (configf:lookup *configdat* "server" "runtime")) +;; ;; 180) +;; ;; (random 360)))) ;; under one hour running time +/- 180 +;; ;; )) +;; ;; #f)) +;; ;; srvlst) +;; ;; (lambda (a b) +;; ;; (< (list-ref a 3) +;; ;; (list-ref b 3)))))) +;; ;; (if (> (length slst) nums) +;; ;; (take slst nums) +;; ;; slst))) +;; +;; ;; ;; switch from server:get-list to server:get-servers-info +;; ;; ;; +;; ;; (define (server:get-first-best areapath) +;; ;; (let ((srvrs (server:get-best (server:get-list areapath)))) +;; ;; (if (and srvrs +;; ;; (not (null? srvrs))) +;; ;; (car srvrs) +;; ;; #f))) +;; ;; +;; ;; (define (server:get-rand-best areapath) +;; ;; (let ((srvrs (server:get-best (server:get-list areapath)))) +;; ;; (if (and (list? srvrs) +;; ;; (not (null? 
srvrs))) +;; ;; (let* ((len (length srvrs)) +;; ;; (idx (random len))) +;; ;; (list-ref srvrs idx)) +;; ;; #f))) +;; +;; (define (server:record->id servr) +;; (handle-exceptions +;; exn +;; (begin +;; (debug:print-info 0 *default-log-port* "Unable to get server id from " servr ", exn=" exn) +;; #f) +;; (match-let (((host port start-time server-id pid) +;; servr)) +;; (if server-id +;; server-id +;; #f)))) +;; +;; (define (server:record->url servr) +;; (handle-exceptions +;; exn +;; (begin +;; (debug:print-info 0 *default-log-port* "Unable to get server url from " servr ", exn=" exn) +;; #f) +;; (match-let (((host port start-time server-id pid) +;; servr)) +;; (if (and host port) +;; (conc host ":" port) +;; #f)))) +;; +;; +;; ;; if server-start-last exists, and wasn't old enough, wait + 1, then call this function recursively until it is old enough. +;; ;; if it is old enough, overwrite it and wait 0.25 seconds. +;; ;; if it then has the wrong server key, wait + 1 and call this function recursively. +;; ;; +;; #;(define (server:wait-for-server-start-last-flag areapath) +;; (let* ((start-flag (conc areapath "/logs/server-start-last")) +;; ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) +;; (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4)) +;; (server-key (conc (get-host-name) "-" (current-process-id)))) +;; (if (file-exists? start-flag) +;; (let* ((fmodtime (file-modification-time start-flag)) +;; (delta (- (current-seconds) fmodtime)) +;; (old-enough (> delta idletime)) +;; (new-server-key "")) +;; ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than seconds, and the new file still has the same server key as you just wrote, return #t. +;; ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process. 
+;; (if (and old-enough +;; (begin +;; (debug:print-info 2 *default-log-port* "Writing " start-flag) +;; (with-output-to-file start-flag (lambda () (print server-key))) +;; (thread-sleep! 0.25) +;; (set! new-server-key (with-input-from-file start-flag (lambda () (read-line)))) +;; (equal? server-key new-server-key))) +;; #t +;; ;; If either of the above conditions is not true, print a "Gating server start" message, wait + 1, then call this function recursively. +;; (begin +;; (debug:print-info 0 *default-log-port* "Gating server start, last start: " +;; (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server")) +;; +;; (thread-sleep! ( + 1 idletime)) +;; (server:wait-for-server-start-last-flag areapath))))))) +;; +;; ;; oldest server alive determines host then choose random of youngest +;; ;; five servers on that host +;; ;; +;; (define (server:get-servers-info areapath) +;; ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.") +;; (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo"))) +;; (if (not (file-exists? servinfodir)) +;; (create-directory servinfodir)) +;; (let* ((allfiles (glob (conc servinfodir"/*"))) +;; (res (make-hash-table))) +;; (for-each +;; (lambda (f) +;; (let* ((hostport (pathname-strip-directory f)) +;; (serverdat (server:logf-get-start-info f))) +;; (match serverdat +;; ((host port start server-id pid) +;; (if (and host port start server-id pid) +;; (hash-table-set! res hostport serverdat) +;; (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))) +;; (else +;; (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))))) +;; allfiles) +;; res))) +;; +;; ;; check the .servinfo directory, are there other servers running on this +;; ;; or another host? 
+;; ;; +;; ;; returns #t => ok to start another server +;; ;; #f => not ok to start another server +;; ;; +;; (define (server:minimal-check areapath) +;; (server:clean-up-old areapath) +;; (let* ((srvdir (server:get-servinfo-dir areapath)) ;; (conc areapath"/.servinfo")) +;; (servrs (glob (conc srvdir"/*"))) +;; (thishostip (server:get-best-guess-address (get-host-name))) +;; (thisservrs (glob (conc srvdir"/"thishostip":*"))) +;; (homehostinf (server:choose-server areapath 'homehost)) +;; (havehome (car homehostinf)) +;; (wearehome (cdr homehostinf))) +;; (debug:print-info 0 *default-log-port* thishostip", have homehost: "havehome", we are homehost: "wearehome +;; ", numservers: "(length thisservrs)) +;; (cond +;; ((not havehome) #t) ;; no homehost yet, go for it +;; ((and havehome wearehome (< (length thisservrs) 20)) #t) ;; we are home and less than 20 servers, ok to start another +;; ((and havehome (not wearehome)) #f) ;; we are not the home host +;; ((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running +;; (else +;; (debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs) +;; #t)))) +;; +;; +;; (define server-last-start 0) +;; +;; +;; ;; oldest server alive determines host then choose random of youngest +;; ;; five servers on that host +;; ;; +;; ;; mode: +;; ;; best - get best server (random of newest five) +;; ;; home - get home host based on oldest server +;; ;; info - print info +;; (define (server:choose-server areapath #!optional (mode 'best)) +;; ;; age is current-starttime +;; ;; find oldest alive +;; ;; 1. sort by age ascending and ping until good +;; ;; find alive rand from youngest +;; ;; 1. sort by age descending +;; ;; 2. take five +;; ;; 3. 
check alive, discard if not and repeat +;; ;; first we clean up old server files +;; (server:clean-up-old areapath) +;; (let* ((since-last (- (current-seconds) server-last-start)) +;; (server-start-delay 10)) +;; (if ( < (- (current-seconds) server-last-start) 10 ) +;; (begin +;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) +;; (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds") +;; (thread-sleep! server-start-delay) +;; ) +;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) +;; ) +;; ) +;; (let* ((serversdat (server:get-servers-info areapath)) +;; (servkeys (hash-table-keys serversdat)) +;; (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last +;; (sort servkeys ;; list of "host:port" +;; (lambda (a b) +;; (>= (list-ref (hash-table-ref serversdat a) 2) +;; (list-ref (hash-table-ref serversdat b) 2)))) +;; '()))) +;; (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat) +;; (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys) +;; (if (not (null? by-time-asc)) +;; (let* ((oldest (last by-time-asc)) +;; (oldest-dat (hash-table-ref serversdat oldest)) +;; (host (list-ref oldest-dat 0)) +;; (all-valid (filter (lambda (x) +;; (equal? host (list-ref (hash-table-ref serversdat x) 0))) +;; by-time-asc)) +;; (best-ten (lambda () +;; (if (> (length all-valid) 11) +;; (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out +;; (if (> (length all-valid) 8) +;; (drop-right all-valid 1) +;; all-valid)))) +;; (names->dats (lambda (names) +;; (map (lambda (x) +;; (hash-table-ref serversdat x)) +;; names))) +;; (am-home? 
(lambda () +;; (let* ((currhost (get-host-name)) +;; (bestadrs (server:get-best-guess-address currhost))) +;; (or (equal? host currhost) +;; (equal? host bestadrs)))))) +;; (case mode +;; ((info) +;; (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid) +;; (print "youngest: "(hash-table-ref serversdat (car all-valid)))) +;; ((home) host) +;; ((homehost) (cons host (am-home?))) ;; shut up old code +;; ((home?) (am-home?)) +;; ((best-ten)(names->dats (best-ten))) +;; ((all-valid)(names->dats all-valid)) +;; ((best) (let* ((best-ten (best-ten)) +;; (len (length best-ten))) +;; (hash-table-ref serversdat (list-ref best-ten (random len))))) +;; ((count)(length all-valid)) +;; (else +;; (debug:print 0 *default-log-port* "ERROR: invalid command "mode) +;; #f))) +;; (begin +;; (server:run areapath) +;; (set! server-last-start (current-seconds)) +;; ;; (thread-sleep! 3) +;; (case mode +;; ((homehost) (cons #f #f)) +;; (else #f)))))) +;; +;; (define (server:get-servinfo-dir areapath) +;; (let* ((spath (conc areapath"/.servinfo"))) +;; (if (not (file-exists? spath)) +;; (create-directory spath #t)) +;; spath)) +;; +;; (define (server:clean-up-old areapath) +;; ;; any server file that has not been touched in ten minutes is effectively dead +;; (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*")))) +;; (for-each +;; (lambda (sfile) +;; (let* ((modtime (handle-exceptions +;; exn +;; (begin +;; (debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile) +;; (current-seconds)) +;; (file-modification-time sfile)))) +;; (if (and (number? 
modtime) +;; (> (- (current-seconds) modtime) +;; 600)) +;; (begin +;; (debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.") +;; (handle-exceptions +;; exn +;; (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile) +;; (delete-file sfile)))))) +;; sfiles))) +;; +;; ;; would like to eventually get rid of this +;; ;; +;; (define (common:on-homehost?) +;; (server:choose-server *toppath* 'home?)) +;; +;; ;; kind start up of server, wait before allowing another server for a given +;; ;; area to be launched +;; ;; +;; (define (server:kind-run areapath) +;; ;; look for $MT_RUN_AREA_HOME/logs/server-start-last +;; ;; and wait for it to be at least seconds old +;; ;; (server:wait-for-server-start-last-flag areapath) +;; (let loop () +;; (if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2) +;; (begin +;; (if (common:low-noise-print 30 "our-host-load") +;; (debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server.")) +;; (loop)))) +;; (if (< (server:choose-server areapath 'count) 20) +;; (server:run areapath)) +;; #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running? +;; (let* ((lock-file (conc areapath "/logs/server-start.lock"))) +;; (let* ((start-flag (conc areapath "/logs/server-start-last"))) +;; (common:simple-file-lock-and-wait lock-file expire-time: 25) +;; (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag) +;; (system (conc "touch " start-flag)) ;; lazy but safe +;; (server:run areapath) +;; (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED". +;; (common:simple-file-release-lock lock-file))) +;; (debug:print-info 0 *default-log-port* "Found server already running. 
NOT trying to start another."))) +;; +;; ;; this one seems to be the general entry point +;; ;; +;; (define (server:start-and-wait areapath #!key (timeout 60)) +;; (let ((give-up-time (+ (current-seconds) timeout))) +;; (let loop ((server-info (server:check-if-running areapath)) +;; (try-num 0)) +;; (if (or server-info +;; (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. +;; (server:record->url server-info) +;; (let* ( (servers (server:choose-server areapath 'all-valid)) +;; (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0))) +;; (if (and (> try-num 0) ;; first time through simply wait a little while then try again +;; (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one +;; (server:run areapath)) +;; (thread-sleep! 5) +;; (loop (server:check-if-running areapath) +;; (+ try-num 1))))))) +;; +;; (define (server:get-num-servers #!key (numservers 2)) +;; (let ((ns (string->number +;; (or (configf:lookup *configdat* "server" "numservers") "notanumber")))) +;; (or ns numservers))) +;; +;; ;; no longer care if multiple servers are started by accident. older servers will drop off in time. +;; ;; +;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) +;; (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed +;; (servers (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath)))) +;; (if (or (and servers +;; (null? servers)) +;; (not servers)) +;; ;; (and (list? servers) +;; ;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers +;; #f +;; (let loop ((hed (car servers)) +;; (tal (cdr servers))) +;; (let ((res (server:check-server hed))) +;; (if res +;; hed +;; (if (null? 
tal) +;; #f +;; (loop (car tal)(cdr tal))))))))) +;; +;; ;; ping the given server +;; ;; +;; (define (server:check-server server-record) +;; (let* ((server-url (server:record->url server-record)) +;; (server-id (server:record->id server-record)) +;; (res (server:ping server-url server-id))) +;; (if res +;; server-url +;; #f))) +;; +;; (define (server:kill servr) +;; (handle-exceptions +;; exn +;; (begin +;; (debug:print-info 0 *default-log-port* "Unable to get host and/or port from " servr ", exn=" exn) +;; #f) +;; (match-let (((mod-time hostname port start-time server-id pid) +;; servr)) +;; (tasks:kill-server hostname pid)))) +;; +;; ;; called in megatest.scm, host-port is string hostname:port +;; ;; +;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running +;; ;; in the same process as the server. +;; ;; +;; (define (server:ping host:port server-id #!key (do-exit #f)) +;; (let* ((host-port (cond +;; ((string? host:port) +;; (let ((slst (string-split host:port ":"))) +;; (if (eq? (length slst) 2) +;; (list (car slst)(string->number (cadr slst))) +;; #f))) +;; (else +;; #f)))) +;; (cond +;; ((and (list? host-port) +;; (eq? (length host-port) 2)) +;; (let* ((myrunremote (make-remote)) +;; (iface (car host-port)) +;; (port (cadr host-port)) +;; (server-dat (client:connect iface port server-id myrunremote)) +;; (login-res (rmt:login-no-auto-client-setup myrunremote))) +;; (if (and (list? 
login-res) +;; (car login-res)) +;; (begin +;; ;; (print "LOGIN_OK") +;; (if do-exit (exit 0)) +;; #t) +;; (begin +;; ;; (print "LOGIN_FAILED") +;; (if do-exit (exit 1)) +;; #f)))) +;; (else +;; (if host:port +;; (debug:print 0 *default-log-port* "ERROR: bad host:port "host:port)) +;; (if do-exit +;; (exit 1) +;; #f))))) +;; +;; ;; run ping in separate process, safest way in some cases +;; ;; +;; (define (server:ping-server ifaceport) +;; (with-input-from-pipe +;; (conc (common:get-megatest-exe) " -ping " ifaceport) +;; (lambda () +;; (let loop ((inl (read-line)) +;; (res "NOREPLY")) +;; (if (eof-object? inl) +;; (case (string->symbol res) +;; ((NOREPLY) #f) +;; ((LOGIN_OK) #t) +;; (else #f)) +;; (loop (read-line) inl)))))) +;; +;; ;; NOT USED (well, ok, reference in rpc-transport but otherwise not used). +;; ;; +;; (define (server:login toppath) +;; (lambda (toppath) +;; (set! *db-last-access* (current-seconds)) ;; might not be needed. +;; (if (equal? *toppath* toppath) +;; #t +;; #f))) +;; +;; ;; timeout is hms string: 1h 5m 3s, default is 1 minute +;; ;; This is currently broken. Just use the number of hours with no unit. +;; ;; Default is 60 seconds. +;; ;; +;; (define (server:expiration-timeout) +;; (let ((tmo (configf:lookup *configdat* "server" "timeout"))) +;; (if (and (string? tmo) +;; (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below +;; (* 3600 (string->number tmo)) +;; 600))) +;; +;; (define (server:get-best-guess-address hostname) +;; (let ((res #f)) +;; (for-each +;; (lambda (adr) +;; (if (not (eq? (u8vector-ref adr 0) 127)) +;; (set! res adr))) +;; ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. 
FIXME +;; (vector->list (hostinfo-addresses (hostname->hostinfo hostname)))) +;; (string-intersperse +;; (map number->string +;; (u8vector->list +;; (if res res (hostname->ip hostname)))) "."))) +;; +;; ;; (define server:sync-lock-token "SERVER_SYNC_LOCK") +;; ;; (define (server:release-sync-lock) +;; ;; (db:no-sync-del! *no-sync-db* server:sync-lock-token)) +;; ;; (define (server:have-sync-lock?) +;; ;; (let* ((have-lock-pair (db:no-sync-get-lock *no-sync-db* server:sync-lock-token)) +;; ;; (have-lock? (car have-lock-pair)) +;; ;; (lock-time (cdr have-lock-pair)) +;; ;; (lock-age (- (current-seconds) lock-time))) +;; ;; (cond +;; ;; (have-lock? #t) +;; ;; ((>lock-age +;; ;; (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180))) +;; ;; (server:release-sync-lock) +;; ;; (server:have-sync-lock?)) +;; ;; (else #f)))) +;; +;; ;; moving this here as it needs access to db and cannot be in common. +;; ;; +;; +;; (define (server:get-bruteforce-syncer dbstruct #!key (fork-to-background #f) (persist-until-sync #f)) +;; (debug:print "WARNING: bruteforce-syncer is called but has been disabled!") +;; (lambda () +;; (debug:print "WARNING: bruteforce-syncer is called but has been disabled!")) +;; #;(let* ((sqlite-exe (or (get-environment-variable "MT_SQLITE3_EXE"))) ;; defined in cfg.sh +;; (sync-log (or (args:get-arg "-sync-log") (conc *toppath* "/logs/sync-" (current-process-id) "-" (get-host-name) ".log"))) +;; (tmp-area (common:get-db-tmp-area)) +;; (tmp-db (conc tmp-area "/megatest.db")) +;; (staging-file (conc *toppath* "/.megatest.db")) +;; (mtdbfile (conc *toppath* "/megatest.db")) +;; (lockfile (common:get-sync-lock-filepath)) +;; (sync-cmd-core (conc sqlite-exe" " tmp-db " .dump | "sqlite-exe" " staging-file "&>"sync-log)) +;; (sync-cmd (if fork-to-background +;; (conc "/usr/bin/env NBFAKE_LOG="*toppath*"/logs/last-server-sync-"(current-process-id)".log nbfake \""sync-cmd-core" && /bin/mv -f " staging-file " " mtdbfile" \"") +;; 
sync-cmd-core)) +;; (default-min-intersync-delay 2) +;; (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: default-min-intersync-delay)) +;; (default-duty-cycle 0.1) +;; (duty-cycle (configf:lookup-number *configdat* "server" "sync-duty-cycle" default: default-duty-cycle)) +;; (last-sync-seconds 10) ;; we will adjust this to a measurement and delay last-sync-seconds * (1 - duty-cycle) +;; (calculate-off-time (lambda (work-duration duty-cycle) +;; (* (/ (- 1 duty-cycle) duty-cycle) last-sync-seconds))) +;; (off-time min-intersync-delay) ;; adjusted in closure below. +;; (do-a-sync +;; (lambda () +;; (BB> "Start do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync) +;; (let* ((finalres +;; (let retry-loop ((num-tries 0)) +;; (if (common:simple-file-lock lockfile) +;; (begin +;; (cond +;; ((not (or fork-to-background persist-until-sync)) +;; (debug:print 0 *default-log-port* "INFO: syncer thread sleeping for max of (server.minimum-intersync-delay="min-intersync-delay +;; " , off-time="off-time" seconds ]") +;; (thread-sleep! (max off-time min-intersync-delay))) +;; (else +;; (debug:print 0 *default-log-port* "INFO: syncer thread NOT sleeping ; maybe time-to-exit..."))) +;; +;; (if (not (configf:lookup *configdat* "server" "disable-db-snapshot")) +;; (common:snapshot-file mtdbfile subdir: ".db-snapshot")) +;; (delete-file* staging-file) +;; (let* ((start-time (current-milliseconds)) +;; (res (system sync-cmd)) +;; (dbbackupfile (conc mtdbfile ".backup")) +;; (res2 +;; (cond +;; ((eq? 0 res ) +;; (handle-exceptions +;; exn +;; #f +;; (if (file-exists? dbbackupfile) +;; (delete-file* dbbackupfile) +;; ) +;; (if (eq? 0 (file-size sync-log)) +;; (delete-file* sync-log)) +;; (system (conc "/bin/mv " staging-file " " mtdbfile)) +;; +;; (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000)) +;; (set! 
off-time (calculate-off-time +;; last-sync-seconds +;; (cond +;; ((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1)) +;; duty-cycle) +;; (else +;; (debug:print 0 *default-log-port* "WARNING: ["(common:human-time)"] server.sync-duty-cycle is invalid. Should be a number between 0 and 1, but "duty-cycle" was specified. Using default value: "default-duty-cycle) +;; default-duty-cycle)))) +;; +;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec") +;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec ; with duty-cycle of "duty-cycle" off time is now "off-time) +;; 'sync-completed)) +;; (else +;; (system (conc "/bin/cp "sync-log" "sync-log".fail")) +;; (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail") +;; (if (file-exists? (conc mtdbfile ".backup")) +;; (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile))) +;; #f)))) +;; (common:simple-file-release-lock lockfile) +;; (BB> "released lockfile: " lockfile) +;; (when (common:file-exists? lockfile) +;; (BB> "DID NOT ACTUALLY RELEASE LOCKFILE")) +;; res2) ;; end let +;; );; end begin +;; ;; else +;; (cond +;; (persist-until-sync +;; (thread-sleep! 1) +;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; we're in a fork-to-background so we need to succeed. Let's wait a jiffy and and try again. num-tries="num-tries" (waiting for lockfile="lockfile" to disappear)") +;; (retry-loop (add1 num-tries))) +;; (else +;; (thread-sleep! 
(max off-time (+ last-sync-seconds min-intersync-delay))) +;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; not syncing.") +;; 'parallel-sync-in-progress)) +;; ) ;; end if got lockfile +;; ) +;; )) +;; (BB> "End do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync" and result="finalres) +;; finalres) +;; ) ;; end lambda +;; )) +;; do-a-sync)) +;; +;; ADDED servermod.scm Index: servermod.scm ================================================================== --- /dev/null +++ servermod.scm @@ -0,0 +1,1111 @@ +;; Copyright 2006-2023, Matthew Welland. +;; +;; This file is part of Megatest. +;; +;; Megatest is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. +;; +;; Megatest is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with Megatest. If not, see . 
+;;======================================================================
+
+(declare (unit servermod))
+(declare (uses artifacts))
+
+(use md5 message-digest posix typed-records extras)
+
+(module servermod
+*
+
+(import scheme
+        chicken
+
+        extras
+        md5
+        message-digest
+        ports
+        posix
+        srfi-18
+
+        typed-records
+        data-structures
+
+        artifacts
+        )
+
+;; Per-process server state. All fields default to #f.
+;;
+(defstruct srv
+  (areapath #f)   ;; area path handed to server:setup
+  (host #f)       ;; result of (get-host-name) at setup time
+  (pid #f)        ;; result of (current-process-id) at setup time
+  (type #f)       ;; not set by any procedure in this section
+  (sdir #f)       ;; .server directory
+  (hdir #f)       ;; .server/host.pid directory
+  (incoming #f)   ;; hdir/incoming, scanned by server:process-incoming
+  (dbstruct #f)   ;; passed as the first argument to the handler
+  (handler #f)    ;; procedure called as (handler dbstruct cmd params)
+  (obj-to-str #f) ;; procedure applied to the handler result before it is written out
+  (str-to-obj #f) ;; not used by any procedure in this section
+  )
+
+;; nearly every process in Megatest (if write access) starts a server so it
+;; can receive messages to exit on request
+;; servers have a type, mtserve, dboard, runner, execute? TOO COMPLICATED.
+
+;; one server per run db file would be ideal.
+
+;; mtrah/.servers/./incoming/*.artifact
+;; | `attic
+;; |
+;; `outgoing/./*.artifact
+;; | `attic
+;; `.host:port
+
+;; on exit processes clean up. only mtserv or dboard clean up abandoned records?
+
+;; IDEA: All requests could go into one directory instead of server specific directory - need locking
+;; don't get multiple processing of arfs
+
+;; server:setup - setup the directory
+;; server:launch - start a new mtserve process, possibly
+;;                 using a launcher
+;; server:run - run the long running thread that monitors
+;;              the .server area
+;; server:exit - shutdown the server and exit
+;; server:handle-request - take incoming request, process it, send response
+;;                         back via best or fastest available transport
+
+;; call this with handler that takes dbstruct cmd and params after doing server:setup
+;; and before starting server:run
+;;
+;; srvdat  - srv record returned by server:setup
+;; handler - procedure invoked as (handler dbstruct cmd params) for every request
+;;
+(define (server:set-handler srvdat handler)
+  (srv-handler-set! srvdat handler))
+
+;; set up the server area and return a server struct
+;; NOTE: This will need to be gated by write-access
+;;
+;; Directories created (parents included for the first one):
+;;   areapath/.server/host.pid
+;;   areapath/.server/host.pid/incoming
+;;   areapath/.server/host.pid/responses
+;;
+;; The returned record has areapath, host, pid, sdir, hdir and incoming filled
+;; in; handler, obj-to-str and dbstruct are left #f and must be set by the
+;; caller before server:run is started.
+;;
+(define (server:setup areapath)
+  (let* ((srvdat (make-srv
+                  areapath: areapath
+                  host:     (get-host-name) ;; likely need to replace with ip address
+                  pid:      (current-process-id)
+                  sdir:     (conc areapath"/.server") ;; put server artifacts here
+                  ))
+         (hdir   (conc (srv-sdir srvdat)"/"(get-host.pid srvdat))))
+    (srv-hdir-set! srvdat hdir)
+    (srv-incoming-set! srvdat (conc hdir"/incoming"))
+    (create-directory hdir #t)
+    (for-each (lambda (d)
+                (create-directory (conc hdir"/"d)))
+              '("incoming" "responses"))
+    srvdat))
+
+;; #t while the loop in server:run should keep polling
+(define *server-keep-running* #f)
+
+;; to cleanly shut the server down set *server-keep-running* to #f
+;;
+;; Runs the request loop in the calling thread:
+;;   - writes a ptype 'S artifact (h=host, i=pid, d=hdir) into sdir
+;;   - sets *server-keep-running* to #t
+;;   - repeatedly calls server:process-incoming, sleeping between passes
+;;   - stops after a pass that found no requests while *server-keep-running*
+;;     is #f (cleared externally, or by the 60 second idle timeout below)
+;;   - deletes the 'S artifact file on the way out
+;;
+(define (server:run srvdat)
+  ;; create server arf
+  ;; put arf in srvdat-dir
+  ;; forever
+  ;;   scan incoming dir
+  ;;   foreach arf
+  ;;     bundle into with-transaction, no-transaction
+  ;;   foreach bundle
+  ;;     process the request
+  ;;     create results arf and write it to clients dir
+  ;;     remove in-arf from incoming
+  (let* ((sdir      (srv-sdir srvdat))
+         (hdir      (srv-hdir srvdat))
+         (myarf     `((h . ,(srv-host srvdat))
+                      (i . ,(srv-pid srvdat))
+                      (d . ,hdir)))
+         (myuuid    (write-alist->artifact sdir myarf ptype: 'S))
+         (arf-fname (get-artifact-fname sdir myuuid)))
+    (set! *server-keep-running* #t)
+    (let loop ((last-access (current-seconds)))
+      (let* ((start     (current-milliseconds))
+             (res       (server:process-incoming srvdat))
+             (delta     (- (current-milliseconds) start))
+             ;; only an idle pass may time out; a pass that just served
+             ;; requests counts as an access and must not trigger shutdown
+             (timed-out (and (= res 0)
+                             (> (- (current-seconds) last-access)
+                                60)))) ;; accessed in last 60 seconds
+        (if timed-out
+            (begin
+              (print "INFO: server has not been accessed in 60 seconds, exiting shortly.")
+              (set! *server-keep-running* #f))
+            ;; short nap when the pass itself took more than 500 ms, longer otherwise
+            (thread-sleep! (if (> delta 500)
+                               0.1
+                               0.9)))
+        (if (or (> res 0) ;; res is the number of requests that were found and processed
+                *server-keep-running*)
+            (loop (if (> res 0)
+                      (current-seconds)
+                      last-access)))))
+    (delete-file arf-fname)))
+
+;; read arfs from incoming, process them and put result arfs in proper dirs
+;; return number requests found and processed
+;;
+;; For every request artifact the following keys are read from its alist:
+;;   Z - bound to ruuid and echoed back as P in the response
+;;   h - requesting host   } response goes to sdir/h.i/responses
+;;   i - requesting pid    }
+;;   c - command, p - params; both handed to the handler
+;; The response alist is ((r . <obj->string of handler result>) (P . ruuid)),
+;; written with ptype 'Q. The request file is deleted before the response is
+;; written.
+;;
+;; The handler and obj-to-str slots of srvdat must hold procedures; both
+;; default to #f in the srv record.
+;;
+;; NOTE(review): the design notes in this file mention "*.artifact" while the
+;; glob below matches "*.artifacts" - confirm which suffix
+;; write-alist->artifact actually produces.
+;;
+(define (server:process-incoming srvdat)
+  (let* ((sdir        (srv-sdir srvdat))
+         (indir       (srv-incoming srvdat))
+         (arfs        (glob (conc indir"/*.artifacts")))
+         (handler     (srv-handler srvdat))
+         (obj->string (srv-obj-to-str srvdat))
+         (dbstruct    (srv-dbstruct srvdat)))
+    (let loop ((rem arfs))
+      ;; test the remaining list, not the full glob result, so the walk
+      ;; stops after the last artifact instead of taking (car '())
+      (if (not (null? rem))
+          (let* ((arf    (car rem))
+                 (dat    (read-artifact->alist arf))
+                 (ruuid  (alist-ref 'Z dat))
+                 (host   (alist-ref 'h dat))
+                 (pid    (alist-ref 'i dat))
+                 (dest   (conc sdir"/"host"."pid"/responses")) ;; the calling host area
+                 (cmd    (alist-ref 'c dat))
+                 (params (alist-ref 'p dat))
+                 (res    (handler dbstruct cmd params))
+                 (narf   `((r . ,(obj->string res))
+                           (P . ,ruuid))))
+            (delete-file arf) ;; add ability to save in bundles in archive area
+            (write-alist->artifact dest narf ptype: 'Q)
+            (loop (cdr rem)))))
+    (length arfs)))
+
+;; start a server process (NOT start server in this process)
+;;
+;; maybe check load before calling this?
+;;
+;; Creates areapath/logs when missing, points NBFAKE_LOG at
+;; areapath/logs/from-<host>.<pid>.log (host/pid of the calling process) and
+;; runs "nbfake mtserve -start-dir <areapath>" through system. Returns
+;; whatever system returns.
+;;
+;; NOTE(review): areapath is pasted unquoted into the shell command, so a path
+;; containing spaces or shell metacharacters will be misparsed.
+;; NOTE(review): the executable is spelled "mtserve" here while the Makefile
+;; part of this change builds "mtserv" - confirm the intended name.
+;;
+(define (server:launch areapath)
+  (let* ((logd (conc areapath"/logs"))
+         (logf (conc logd"/from-"(get-host.pid #f)".log")))
+    (if (not (file-exists? logd))(create-directory logd #t))
+    (setenv "NBFAKE_LOG" logf)
+    (system (conc "nbfake mtserve -start-dir "areapath))))
+
+
+
+;;======================================================================
+;; OLD SERVER STUFF BELOW HERE
+;;======================================================================
+
+;; ;; servers start by setting up fs transport
+;; ;; and put a flag file for that ASAP.
+;; ;; they then set up tcp and put a flag file for +;; ;; that +;; ;; +;; (define *client-server-id* #f) +;; +;; ;; oldest server alive determines host then choose random of youngest +;; ;; five servers on that host +;; ;; +;; ;; mode: +;; ;; best - get best server (random of newest five) +;; ;; home - get home host based on oldest server +;; ;; info - print info +;; (define (server:choose-server areapath #!optional (mode 'best)) +;; ;; age is current-starttime +;; ;; find oldest alive +;; ;; 1. sort by age ascending and ping until good +;; ;; find alive rand from youngest +;; ;; 1. sort by age descending +;; ;; 2. take five +;; ;; 3. check alive, discard if not and repeat +;; ;; first we clean up old server files +;; (server:clean-up-old areapath) +;; ;; (let* ((since-last (- (current-seconds) server-last-start)) +;; ;; (server-start-delay 10)) +;; ;; (if ( < (- (current-seconds) server-last-start) 10 ) +;; ;; (begin +;; ;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) +;; ;; (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds") +;; ;; (thread-sleep! server-start-delay) +;; ;; ) +;; ;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) +;; ;; ) +;; (let* ((serversdat (server:get-servers-info areapath)) +;; (servkeys (hash-table-keys serversdat)) +;; (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last +;; (sort servkeys ;; list of "host:port" +;; (lambda (a b) +;; (>= (list-ref (hash-table-ref serversdat a) 2) +;; (list-ref (hash-table-ref serversdat b) 2)))) +;; '()))) +;; (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat) +;; (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys) +;; (if (not (null? 
by-time-asc))
+;; (let* ((oldest (last by-time-asc))
+;; (oldest-dat (hash-table-ref serversdat oldest))
+;; (host (list-ref oldest-dat 0))
+;; (all-valid (filter (lambda (x)
+;; (equal? host (list-ref (hash-table-ref serversdat x) 0)))
+;; by-time-asc))
+;; (best-ten (lambda ()
+;; (if (> (length all-valid) 11)
+;; (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out
+;; (if (> (length all-valid) 8)
+;; (drop-right all-valid 1)
+;; all-valid))))
+;; (names->dats (lambda (names)
+;; (map (lambda (x)
+;; (hash-table-ref serversdat x))
+;; names)))
+;; (am-home? (lambda ()
+;; (let* ((currhost (get-host-name))
+;; (bestadrs (server:get-best-guess-address currhost)))
+;; (or (equal? host currhost)
+;; (equal? host bestadrs))))))
+;; (case mode
+;; ((info)
+;; (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid)
+;; (print "youngest: "(hash-table-ref serversdat (car all-valid))))
+;; ((home) host)
+;; ((homehost) (cons host (am-home?))) ;; shut up old code
+;; ((home?) (am-home?))
+;; ((best-ten)(names->dats (best-ten)))
+;; ((all-valid)(names->dats all-valid))
+;; ((best) (let* ((best-ten (best-ten))
+;; (len (length best-ten)))
+;; (hash-table-ref serversdat (list-ref best-ten (random len)))))
+;; ((count)(length all-valid))
+;; (else
+;; (debug:print 0 *default-log-port* "ERROR: invalid command "mode)
+;; #f)))
+;; (begin
+;; (server:run areapath)
+;; (set! server-last-start (current-seconds))
+;; ;; (thread-sleep! 3)
+;; (case mode
+;; ((homehost) (cons #f #f))
+;; (else #f))))))
+
+;;======================================================================
+;; S E R V E R   U T I L I T I E S
+;;======================================================================
+
+;; Return the path of the ".servinfo" directory under areapath. When nothing
+;; exists at that path yet the directory is created first, along with any
+;; missing parent directories.
+;;
+(define (server:get-servinfo-dir areapath)
+  (let ((servinfo-dir (conc areapath"/.servinfo")))
+    (unless (file-exists? servinfo-dir)
+      (create-directory servinfo-dir #t))
+    servinfo-dir))
+
+;; ;; Generate a unique signature for this server
+;; (define (mk-signature)
+;; (message-digest-string (md5-primitive)
+;; (with-output-to-string
+;; (lambda ()
+;; (write (list (current-directory)
+;; (current-process-id)
+;; (argv)))))))
+;;
+;; (define (server:clean-up-old areapath)
+;; ;; any server file that has not been touched in ten minutes is effectively dead
+;; (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*"))))
+;; (for-each
+;; (lambda (sfile)
+;; (let* ((modtime (handle-exceptions
+;; exn
+;; (begin
+;; (debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile)
+;; (current-seconds))
+;; (file-modification-time sfile))))
+;; (if (and (number? modtime)
+;; (> (- (current-seconds) modtime)
+;; 600))
+;; (begin
+;; (debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.")
+;; (handle-exceptions
+;; exn
+;; (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile)
+;; (delete-file sfile))))))
+;; sfiles)))
+;;
+;; (define (get-client-server-id)
+;; (if *client-server-id* *client-server-id*
+;; (let ((sig (mk-signature))) ;; clients re-use the server:mk-signature logic
+;; (set! 
*client-server-id* sig)
+;; *client-server-id*)))
+
+;; Build the "host.pid" name used for per-process server directories and logs.
+;; With a srv record the host and pid stored in it are used; with #f the
+;; calling process's host name and process id are used instead.
+;;
+(define (get-host.pid srvdat)
+  (let ((hostname (if srvdat (srv-host srvdat) (get-host-name)))
+        (procid   (if srvdat (srv-pid srvdat) (current-process-id))))
+    (conc hostname"."procid)))
+
+;; ;; ;; When using zmq this would send the message back (two step process)
+;; ;; ;; with spiffy or rpc this simply returns the return data to be returned
+;; ;; ;;
+;; ;; (define (server:reply return-addr query-sig success/fail result)
+;; ;; (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result)
+;; ;; ;; (send-message pubsock target send-more: #t)
+;; ;; ;; (send-message pubsock
+;; ;; (case (server:get-transport)
+;; ;; ((rpc) (db:obj->string (vector success/fail query-sig result)))
+;; ;; ((http) (db:obj->string (vector success/fail query-sig result)))
+;; ;; ((fs) result)
+;; ;; (else
+;; ;; (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*)
+;; ;; result)))
+;; ;;
+;; ;; ;; Given an area path, start a server process ### NOTE ### > file 2>&1
+;; ;; ;; if the target-host is set
+;; ;; ;; try running on that host
+;; ;; ;; incidental: rotate logs in logs/ dir.
+;; ;; ;;
+;; ;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area
+;; ;; (let* ((testsuite (common:get-testsuite-name))
+;; ;; (logfile (conc areapath "/logs/server.log")) ;; -" curr-pid "-" target-host ".log"))
+;; ;; (profile-mode (or (configf:lookup *configdat* "misc" "profilesw")
+;; ;; ""))
+;; ;; (cmdln (conc (common:get-megatest-exe)
+;; ;; " -server - ";; (or target-host "-")
+;; ;; (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes")
+;; ;; " -daemonize "
+;; ;; "")
+;; ;; ;; " -log " logfile
+;; ;; " -m testsuite:" testsuite
+;; ;; " " profile-mode
+;; ;; )) ;; (conc " >> " logfile " 2>&1 &")))))
+;; ;; (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? 
This is a sensitive location with a lot going on!? +;; ;; (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0))) +;; ;; ;; we want the remote server to start in *toppath* so push there +;; ;; (push-directory areapath) +;; ;; (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") +;; ;; (thread-start! log-rotate) +;; ;; +;; ;; ;; host.domain.tld match host? +;; ;; ;; (if (and target-host +;; ;; ;; ;; look at target host, is it host.domain.tld or ip address and does it +;; ;; ;; ;; match current ip or hostname +;; ;; ;; (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) +;; ;; ;; (not (equal? curr-ip target-host))) +;; ;; ;; (begin +;; ;; ;; (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) +;; ;; ;; (setenv "TARGETHOST" target-host))) +;; ;; ;; +;; ;; (setenv "TARGETHOST_LOGF" logfile) +;; ;; (thread-sleep! (/ (random 3000) 1000)) ;; add a random initial delay. It seems pretty common that many running tests request a server at the same time +;; ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) +;; ;; (system (conc "nbfake " cmdln)) +;; ;; (unsetenv "TARGETHOST_LOGF") +;; ;; ;; (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) +;; ;; (thread-join! 
log-rotate) +;; ;; (pop-directory))) +;; ;; +;; ;; ;; given a path to a server log return: host port startseconds server-id +;; ;; ;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let +;; ;; ;; example of what it's looking for in the log file: +;; ;; ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 +;; ;; +;; ;; (define (server:logf-get-start-info logf) +;; ;; (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+)")) ;; SERVER STARTED: host:port AT timesecs server id +;; ;; (dbprep-rx (regexp "^SERVER: dbprep")) +;; ;; (dbprep-found 0) +;; ;; (bad-dat (list #f #f #f #f #f))) +;; ;; (handle-exceptions +;; ;; exn +;; ;; (begin +;; ;; ;; WARNING: this is potentially dangerous to blanket ignore the errors +;; ;; (if (file-exists? logf) +;; ;; (debug:print-info 2 *default-log-port* "Unable to get server info from "logf", exn=" exn)) +;; ;; bad-dat) ;; no idea what went wrong, call it a bad server +;; ;; (with-input-from-file +;; ;; logf +;; ;; (lambda () +;; ;; (let loop ((inl (read-line)) +;; ;; (lnum 0)) +;; ;; (if (not (eof-object? inl)) +;; ;; (let ((mlst (string-match server-rx inl)) +;; ;; (dbprep (string-match dbprep-rx inl))) +;; ;; (if dbprep (set! 
dbprep-found 1)) +;; ;; (if (not mlst) +;; ;; (if (< lnum 500) ;; give up if more than 500 lines of server log read +;; ;; (loop (read-line)(+ lnum 1)) +;; ;; (begin +;; ;; (debug:print-info 0 *default-log-port* "Unable to get server info from first 500 lines of " logf ) +;; ;; bad-dat)) +;; ;; (match mlst +;; ;; ((_ host port start server-id pid) +;; ;; (list host +;; ;; (string->number port) +;; ;; (string->number start) +;; ;; server-id +;; ;; (string->number pid))) +;; ;; (else +;; ;; (debug:print 0 *current-log-port* "ERROR: did not recognise SERVER line info "mlst) +;; ;; bad-dat)))) +;; ;; (begin +;; ;; (if dbprep-found +;; ;; (begin +;; ;; (debug:print-info 2 *default-log-port* "Server is in dbprep at " (common:human-time)) +;; ;; (thread-sleep! 0.5)) ;; was 25 sec but that blocked things from starting? +;; ;; (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds)))) +;; ;; bad-dat)))))))) +;; ;; +;; ;; ;; ;; get a list of servers from the log files, with all relevant data +;; ;; ;; ;; ( mod-time host port start-time pid ) +;; ;; ;; ;; +;; ;; ;; (define (server:get-list areapath #!key (limit #f)) +;; ;; ;; (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) +;; ;; ;; (day-seconds (* 24 60 60))) +;; ;; ;; ;; if the directory exists continue to get the list +;; ;; ;; ;; otherwise attempt to create the logs dir and then +;; ;; ;; ;; continue +;; ;; ;; (if (if (directory-exists? (conc areapath "/logs")) +;; ;; ;; '() +;; ;; ;; (if (file-write-access? areapath) +;; ;; ;; (begin +;; ;; ;; (condition-case +;; ;; ;; (create-directory (conc areapath "/logs") #t) +;; ;; ;; (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) +;; ;; ;; (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn))) +;; ;; ;; (directory-exists? 
(conc areapath "/logs"))) +;; ;; ;; '())) +;; ;; ;; +;; ;; ;; ;; Get the list of server logs. +;; ;; ;; (let* ( +;; ;; ;; ;; For some reason, when I uncomment the below line, ext-tests sometimes starts 1000's of servers. +;; ;; ;; ;; (exiting-servers (system (conc "bash -c 'rm -f `grep -il exiting " areapath "/logs/server-*-*.log 2> /dev/null`'"))) +;; ;; ;; (server-logs (glob (conc areapath "/logs/server-*-*.log"))) +;; ;; ;; (num-serv-logs (length server-logs))) +;; ;; ;; (if (or (null? server-logs) (= num-serv-logs 0)) +;; ;; ;; (let () +;; ;; ;; (debug:print 2 *default-log-port* "There are no servers running at " (common:human-time)) +;; ;; ;; '() +;; ;; ;; ) +;; ;; ;; (let loop ((hed (string-chomp (car server-logs))) +;; ;; ;; (tal (cdr server-logs)) +;; ;; ;; (res '())) +;; ;; ;; (let* ((mod-time (handle-exceptions +;; ;; ;; exn +;; ;; ;; (begin +;; ;; ;; (debug:print 0 *default-log-port* "server:get-list: failed to get modification time on " hed ", exn=" exn) +;; ;; ;; (current-seconds)) ;; 0 +;; ;; ;; (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted +;; ;; ;; (down-time (- (current-seconds) mod-time)) +;; ;; ;; (serv-dat (if (or (< num-serv-logs 10) +;; ;; ;; (< down-time 900)) ;; day-seconds)) +;; ;; ;; (server:logf-get-start-info hed) +;; ;; ;; '())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at +;; ;; ;; (serv-rec (cons mod-time serv-dat)) +;; ;; ;; (fmatch (string-match fname-rx hed)) +;; ;; ;; (pid (if fmatch (string->number (list-ref fmatch 2)) #f)) +;; ;; ;; (new-res (if (null? serv-dat) +;; ;; ;; res +;; ;; ;; (cons (append serv-rec (list pid)) res)))) ;; any changes to number of elements in new-res will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which uses match let +;; ;; ;; (if (null? 
tal) +;; ;; ;; (if (and limit +;; ;; ;; (> (length new-res) limit)) +;; ;; ;; new-res ;; (take new-res limit) <= need intelligent sorting before this will work +;; ;; ;; new-res) +;; ;; ;; (loop (string-chomp (car tal)) (cdr tal) new-res))))))))) +;; ;; +;; ;; #;(define (server:get-num-alive srvlst) +;; ;; (let ((num-alive 0)) +;; ;; (for-each +;; ;; (lambda (server) +;; ;; (handle-exceptions +;; ;; exn +;; ;; (begin +;; ;; (debug:print-info 0 *default-log-port* "Unable to get server start-time and/or mod-time from " server ", exn=" exn)) +;; ;; (match-let (((mod-time host port start-time server-id pid) +;; ;; server)) +;; ;; (let* ((uptime (- (current-seconds) mod-time)) +;; ;; (runtime (if start-time +;; ;; (- mod-time start-time) +;; ;; 0))) +;; ;; (if (< uptime 5)(set! num-alive (+ num-alive 1))))))) +;; ;; srvlst) +;; ;; num-alive)) +;; ;; +;; ;; ;; ;; given a list of servers get a list of valid servers, i.e. at least +;; ;; ;; ;; 10 seconds old, has started and is less than 1 hour old and is +;; ;; ;; ;; active (i.e. mod-time < 10 seconds +;; ;; ;; ;; +;; ;; ;; ;; mod-time host port start-time pid +;; ;; ;; ;; +;; ;; ;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off +;; ;; ;; ;; and servers should stick around for about two hours or so. +;; ;; ;; ;; +;; ;; ;; (define (server:get-best srvlst) +;; ;; ;; (let* ((nums (server:get-num-servers)) +;; ;; ;; (now (current-seconds)) +;; ;; ;; (slst (sort +;; ;; ;; (filter (lambda (rec) +;; ;; ;; (if (and (list? 
rec) +;; ;; ;; (> (length rec) 2)) +;; ;; ;; (let ((start-time (list-ref rec 3)) +;; ;; ;; (mod-time (list-ref rec 0))) +;; ;; ;; ;; (print "start-time: " start-time " mod-time: " mod-time) +;; ;; ;; (and start-time mod-time +;; ;; ;; (> (- now start-time) 0) ;; been running at least 0 seconds +;; ;; ;; (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds +;; ;; ;; (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set +;; ;; ;; (< (- now start-time) +;; ;; ;; (+ (- (string->number (configf:lookup *configdat* "server" "runtime")) +;; ;; ;; 180) +;; ;; ;; (random 360)))) ;; under one hour running time +/- 180 +;; ;; ;; )) +;; ;; ;; #f)) +;; ;; ;; srvlst) +;; ;; ;; (lambda (a b) +;; ;; ;; (< (list-ref a 3) +;; ;; ;; (list-ref b 3)))))) +;; ;; ;; (if (> (length slst) nums) +;; ;; ;; (take slst nums) +;; ;; ;; slst))) +;; ;; +;; ;; ;; ;; switch from server:get-list to server:get-servers-info +;; ;; ;; ;; +;; ;; ;; (define (server:get-first-best areapath) +;; ;; ;; (let ((srvrs (server:get-best (server:get-list areapath)))) +;; ;; ;; (if (and srvrs +;; ;; ;; (not (null? srvrs))) +;; ;; ;; (car srvrs) +;; ;; ;; #f))) +;; ;; ;; +;; ;; ;; (define (server:get-rand-best areapath) +;; ;; ;; (let ((srvrs (server:get-best (server:get-list areapath)))) +;; ;; ;; (if (and (list? srvrs) +;; ;; ;; (not (null? 
srvrs))) +;; ;; ;; (let* ((len (length srvrs)) +;; ;; ;; (idx (random len))) +;; ;; ;; (list-ref srvrs idx)) +;; ;; ;; #f))) +;; ;; +;; ;; (define (server:record->id servr) +;; ;; (handle-exceptions +;; ;; exn +;; ;; (begin +;; ;; (debug:print-info 0 *default-log-port* "Unable to get server id from " servr ", exn=" exn) +;; ;; #f) +;; ;; (match-let (((host port start-time server-id pid) +;; ;; servr)) +;; ;; (if server-id +;; ;; server-id +;; ;; #f)))) +;; ;; +;; ;; (define (server:record->url servr) +;; ;; (handle-exceptions +;; ;; exn +;; ;; (begin +;; ;; (debug:print-info 0 *default-log-port* "Unable to get server url from " servr ", exn=" exn) +;; ;; #f) +;; ;; (match-let (((host port start-time server-id pid) +;; ;; servr)) +;; ;; (if (and host port) +;; ;; (conc host ":" port) +;; ;; #f)))) +;; ;; +;; ;; +;; ;; ;; if server-start-last exists, and wasn't old enough, wait + 1, then call this function recursively until it is old enough. +;; ;; ;; if it is old enough, overwrite it and wait 0.25 seconds. +;; ;; ;; if it then has the wrong server key, wait + 1 and call this function recursively. +;; ;; ;; +;; ;; #;(define (server:wait-for-server-start-last-flag areapath) +;; ;; (let* ((start-flag (conc areapath "/logs/server-start-last")) +;; ;; ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) +;; ;; (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4)) +;; ;; (server-key (conc (get-host-name) "-" (current-process-id)))) +;; ;; (if (file-exists? start-flag) +;; ;; (let* ((fmodtime (file-modification-time start-flag)) +;; ;; (delta (- (current-seconds) fmodtime)) +;; ;; (old-enough (> delta idletime)) +;; ;; (new-server-key "")) +;; ;; ;; write start-flag file, wait 0.25s, then if previously the start-flag file was older than seconds, and the new file still has the same server key as you just wrote, return #t. 
+;; ;; ;; the intention is to make sure nfs can read the file we just wrote, and make sure it was written by us, and not another process. +;; ;; (if (and old-enough +;; ;; (begin +;; ;; (debug:print-info 2 *default-log-port* "Writing " start-flag) +;; ;; (with-output-to-file start-flag (lambda () (print server-key))) +;; ;; (thread-sleep! 0.25) +;; ;; (set! new-server-key (with-input-from-file start-flag (lambda () (read-line)))) +;; ;; (equal? server-key new-server-key))) +;; ;; #t +;; ;; ;; If either of the above conditions is not true, print a "Gating server start" message, wait + 1, then call this function recursively. +;; ;; (begin +;; ;; (debug:print-info 0 *default-log-port* "Gating server start, last start: " +;; ;; (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if old-enough "another job started a server" "too soon to start another server")) +;; ;; +;; ;; (thread-sleep! ( + 1 idletime)) +;; ;; (server:wait-for-server-start-last-flag areapath))))))) +;; ;; +;; ;; ;; oldest server alive determines host then choose random of youngest +;; ;; ;; five servers on that host +;; ;; ;; +;; ;; (define (server:get-servers-info areapath) +;; ;; ;; (assert *toppath* "FATAL: server:get-servers-info called before *toppath* has been set.") +;; ;; (let* ((servinfodir (server:get-servinfo-dir areapath))) ;; (conc *toppath*"/.servinfo"))) +;; ;; (if (not (file-exists? servinfodir)) +;; ;; (create-directory servinfodir)) +;; ;; (let* ((allfiles (glob (conc servinfodir"/*"))) +;; ;; (res (make-hash-table))) +;; ;; (for-each +;; ;; (lambda (f) +;; ;; (let* ((hostport (pathname-strip-directory f)) +;; ;; (serverdat (server:logf-get-start-info f))) +;; ;; (match serverdat +;; ;; ((host port start server-id pid) +;; ;; (if (and host port start server-id pid) +;; ;; (hash-table-set! 
res hostport serverdat) +;; ;; (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))) +;; ;; (else +;; ;; (debug:print-info 2 *default-log-port* "bad server info for "f": "serverdat))))) +;; ;; allfiles) +;; ;; res))) +;; ;; +;; ;; ;; check the .servinfo directory, are there other servers running on this +;; ;; ;; or another host? +;; ;; ;; +;; ;; ;; returns #t => ok to start another server +;; ;; ;; #f => not ok to start another server +;; ;; ;; +;; ;; (define (server:minimal-check areapath) +;; ;; (server:clean-up-old areapath) +;; ;; (let* ((srvdir (server:get-servinfo-dir areapath)) ;; (conc areapath"/.servinfo")) +;; ;; (servrs (glob (conc srvdir"/*"))) +;; ;; (thishostip (server:get-best-guess-address (get-host-name))) +;; ;; (thisservrs (glob (conc srvdir"/"thishostip":*"))) +;; ;; (homehostinf (server:choose-server areapath 'homehost)) +;; ;; (havehome (car homehostinf)) +;; ;; (wearehome (cdr homehostinf))) +;; ;; (debug:print-info 0 *default-log-port* thishostip", have homehost: "havehome", we are homehost: "wearehome +;; ;; ", numservers: "(length thisservrs)) +;; ;; (cond +;; ;; ((not havehome) #t) ;; no homehost yet, go for it +;; ;; ((and havehome wearehome (< (length thisservrs) 20)) #t) ;; we are home and less than 20 servers, ok to start another +;; ;; ((and havehome (not wearehome)) #f) ;; we are not the home host +;; ;; ((and havehome wearehome (>= (length thisservrs) 20)) #f) ;; have enough running +;; ;; (else +;; ;; (debug:print 0 *default-log-port* "WARNING: Unrecognised scenario, servrs="servrs", thishostip="thishostip", thisservrs="thisservrs) +;; ;; #t)))) +;; ;; +;; ;; +;; ;; (define server-last-start 0) +;; ;; +;; ;; +;; ;; ;; oldest server alive determines host then choose random of youngest +;; ;; ;; five servers on that host +;; ;; ;; +;; ;; ;; mode: +;; ;; ;; best - get best server (random of newest five) +;; ;; ;; home - get home host based on oldest server +;; ;; ;; info - print info +;; ;; (define 
(server:choose-server areapath #!optional (mode 'best)) +;; ;; ;; age is current-starttime +;; ;; ;; find oldest alive +;; ;; ;; 1. sort by age ascending and ping until good +;; ;; ;; find alive rand from youngest +;; ;; ;; 1. sort by age descending +;; ;; ;; 2. take five +;; ;; ;; 3. check alive, discard if not and repeat +;; ;; ;; first we clean up old server files +;; ;; (server:clean-up-old areapath) +;; ;; (let* ((since-last (- (current-seconds) server-last-start)) +;; ;; (server-start-delay 10)) +;; ;; (if ( < (- (current-seconds) server-last-start) 10 ) +;; ;; (begin +;; ;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) +;; ;; (debug:print 2 *default-log-port* "server:choose-server: last server start less than " server-start-delay " seconds ago. Sleeping " server-start-delay " seconds") +;; ;; (thread-sleep! server-start-delay) +;; ;; ) +;; ;; (debug:print 2 *default-log-port* "server:choose-server: seconds since last server start: " (- (current-seconds) server-last-start)) +;; ;; ) +;; ;; ) +;; ;; (let* ((serversdat (server:get-servers-info areapath)) +;; ;; (servkeys (hash-table-keys serversdat)) +;; ;; (by-time-asc (if (not (null? servkeys)) ;; NOTE: Oldest is last +;; ;; (sort servkeys ;; list of "host:port" +;; ;; (lambda (a b) +;; ;; (>= (list-ref (hash-table-ref serversdat a) 2) +;; ;; (list-ref (hash-table-ref serversdat b) 2)))) +;; ;; '()))) +;; ;; (debug:print 2 *default-log-port* "server:choose-server: serversdat: " serversdat) +;; ;; (debug:print 2 *default-log-port* "server:choose-server: servkeys: " servkeys) +;; ;; (if (not (null? by-time-asc)) +;; ;; (let* ((oldest (last by-time-asc)) +;; ;; (oldest-dat (hash-table-ref serversdat oldest)) +;; ;; (host (list-ref oldest-dat 0)) +;; ;; (all-valid (filter (lambda (x) +;; ;; (equal? 
host (list-ref (hash-table-ref serversdat x) 0))) +;; ;; by-time-asc)) +;; ;; (best-ten (lambda () +;; ;; (if (> (length all-valid) 11) +;; ;; (take (drop-right all-valid 1) 10) ;; remove the oldest from consideration so it can age out +;; ;; (if (> (length all-valid) 8) +;; ;; (drop-right all-valid 1) +;; ;; all-valid)))) +;; ;; (names->dats (lambda (names) +;; ;; (map (lambda (x) +;; ;; (hash-table-ref serversdat x)) +;; ;; names))) +;; ;; (am-home? (lambda () +;; ;; (let* ((currhost (get-host-name)) +;; ;; (bestadrs (server:get-best-guess-address currhost))) +;; ;; (or (equal? host currhost) +;; ;; (equal? host bestadrs)))))) +;; ;; (case mode +;; ;; ((info) +;; ;; (print "oldest: "oldest-dat", selected host: "host", all-valid: "all-valid) +;; ;; (print "youngest: "(hash-table-ref serversdat (car all-valid)))) +;; ;; ((home) host) +;; ;; ((homehost) (cons host (am-home?))) ;; shut up old code +;; ;; ((home?) (am-home?)) +;; ;; ((best-ten)(names->dats (best-ten))) +;; ;; ((all-valid)(names->dats all-valid)) +;; ;; ((best) (let* ((best-ten (best-ten)) +;; ;; (len (length best-ten))) +;; ;; (hash-table-ref serversdat (list-ref best-ten (random len))))) +;; ;; ((count)(length all-valid)) +;; ;; (else +;; ;; (debug:print 0 *default-log-port* "ERROR: invalid command "mode) +;; ;; #f))) +;; ;; (begin +;; ;; (server:run areapath) +;; ;; (set! server-last-start (current-seconds)) +;; ;; ;; (thread-sleep! 3) +;; ;; (case mode +;; ;; ((homehost) (cons #f #f)) +;; ;; (else #f)))))) +;; ;; +;; ;; (define (server:get-servinfo-dir areapath) +;; ;; (let* ((spath (conc areapath"/.servinfo"))) +;; ;; (if (not (file-exists? 
spath)) +;; ;; (create-directory spath #t)) +;; ;; spath)) +;; ;; +;; ;; (define (server:clean-up-old areapath) +;; ;; ;; any server file that has not been touched in ten minutes is effectively dead +;; ;; (let* ((sfiles (glob (conc (server:get-servinfo-dir areapath)"/*")))) +;; ;; (for-each +;; ;; (lambda (sfile) +;; ;; (let* ((modtime (handle-exceptions +;; ;; exn +;; ;; (begin +;; ;; (debug:print 0 *default-log-port* "WARNING: failed to get modification file for "sfile) +;; ;; (current-seconds)) +;; ;; (file-modification-time sfile)))) +;; ;; (if (and (number? modtime) +;; ;; (> (- (current-seconds) modtime) +;; ;; 600)) +;; ;; (begin +;; ;; (debug:print 0 *default-log-port* "WARNING: found old server info file "sfile", removing it.") +;; ;; (handle-exceptions +;; ;; exn +;; ;; (debug:print 0 *default-log-port* "WARNING: failed to delete old server info file "sfile) +;; ;; (delete-file sfile)))))) +;; ;; sfiles))) +;; ;; +;; ;; ;; would like to eventually get rid of this +;; ;; ;; +;; ;; (define (common:on-homehost?) +;; ;; (server:choose-server *toppath* 'home?)) +;; ;; +;; ;; ;; kind start up of server, wait before allowing another server for a given +;; ;; ;; area to be launched +;; ;; ;; +;; ;; (define (server:kind-run areapath) +;; ;; ;; look for $MT_RUN_AREA_HOME/logs/server-start-last +;; ;; ;; and wait for it to be at least seconds old +;; ;; ;; (server:wait-for-server-start-last-flag areapath) +;; ;; (let loop () +;; ;; (if (> (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f)) 2) +;; ;; (begin +;; ;; (if (common:low-noise-print 30 "our-host-load") +;; ;; (debug:print 0 *default-log-port* "WARNING: system load is high, waiting to start server.")) +;; ;; (loop)))) +;; ;; (if (< (server:choose-server areapath 'count) 20) +;; ;; (server:run areapath)) +;; ;; #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running? 
+;; ;; (let* ((lock-file (conc areapath "/logs/server-start.lock"))) +;; ;; (let* ((start-flag (conc areapath "/logs/server-start-last"))) +;; ;; (common:simple-file-lock-and-wait lock-file expire-time: 25) +;; ;; (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag) +;; ;; (system (conc "touch " start-flag)) ;; lazy but safe +;; ;; (server:run areapath) +;; ;; (thread-sleep! 20) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED". +;; ;; (common:simple-file-release-lock lock-file))) +;; ;; (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another."))) +;; ;; +;; ;; ;; this one seems to be the general entry point +;; ;; ;; +;; ;; (define (server:start-and-wait areapath #!key (timeout 60)) +;; ;; (let ((give-up-time (+ (current-seconds) timeout))) +;; ;; (let loop ((server-info (server:check-if-running areapath)) +;; ;; (try-num 0)) +;; ;; (if (or server-info +;; ;; (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. +;; ;; (server:record->url server-info) +;; ;; (let* ( (servers (server:choose-server areapath 'all-valid)) +;; ;; (num-ok (if servers (length (server:choose-server areapath 'all-valid)) 0))) +;; ;; (if (and (> try-num 0) ;; first time through simply wait a little while then try again +;; ;; (< num-ok 1)) ;; if there are no decent candidates for servers then try starting a new one +;; ;; (server:run areapath)) +;; ;; (thread-sleep! 5) +;; ;; (loop (server:check-if-running areapath) +;; ;; (+ try-num 1))))))) +;; ;; +;; ;; (define (server:get-num-servers #!key (numservers 2)) +;; ;; (let ((ns (string->number +;; ;; (or (configf:lookup *configdat* "server" "numservers") "notanumber")))) +;; ;; (or ns numservers))) +;; ;; +;; ;; ;; no longer care if multiple servers are started by accident. older servers will drop off in time. 
+;; ;; ;; +;; ;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) +;; ;; (let* ((ns (server:get-num-servers)) ;; get the setting the for maximum number of servers allowed +;; ;; (servers (server:choose-server areapath 'best-ten))) ;; (server:get-best (server:get-list areapath)))) +;; ;; (if (or (and servers +;; ;; (null? servers)) +;; ;; (not servers)) +;; ;; ;; (and (list? servers) +;; ;; ;; (< (length servers) (+ 1 (random ns))))) ;; somewhere between 1 and numservers +;; ;; #f +;; ;; (let loop ((hed (car servers)) +;; ;; (tal (cdr servers))) +;; ;; (let ((res (server:check-server hed))) +;; ;; (if res +;; ;; hed +;; ;; (if (null? tal) +;; ;; #f +;; ;; (loop (car tal)(cdr tal))))))))) +;; ;; +;; ;; ;; ping the given server +;; ;; ;; +;; ;; (define (server:check-server server-record) +;; ;; (let* ((server-url (server:record->url server-record)) +;; ;; (server-id (server:record->id server-record)) +;; ;; (res (server:ping server-url server-id))) +;; ;; (if res +;; ;; server-url +;; ;; #f))) +;; ;; +;; ;; (define (server:kill servr) +;; ;; (handle-exceptions +;; ;; exn +;; ;; (begin +;; ;; (debug:print-info 0 *default-log-port* "Unable to get host and/or port from " servr ", exn=" exn) +;; ;; #f) +;; ;; (match-let (((mod-time hostname port start-time server-id pid) +;; ;; servr)) +;; ;; (tasks:kill-server hostname pid)))) +;; ;; +;; ;; ;; called in megatest.scm, host-port is string hostname:port +;; ;; ;; +;; ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running +;; ;; ;; in the same process as the server. +;; ;; ;; +;; ;; (define (server:ping host:port server-id #!key (do-exit #f)) +;; ;; (let* ((host-port (cond +;; ;; ((string? host:port) +;; ;; (let ((slst (string-split host:port ":"))) +;; ;; (if (eq? (length slst) 2) +;; ;; (list (car slst)(string->number (cadr slst))) +;; ;; #f))) +;; ;; (else +;; ;; #f)))) +;; ;; (cond +;; ;; ((and (list? host-port) +;; ;; (eq? 
(length host-port) 2)) +;; ;; (let* ((myrunremote (make-remote)) +;; ;; (iface (car host-port)) +;; ;; (port (cadr host-port)) +;; ;; (server-dat (client:connect iface port server-id myrunremote)) +;; ;; (login-res (rmt:login-no-auto-client-setup myrunremote))) +;; ;; (if (and (list? login-res) +;; ;; (car login-res)) +;; ;; (begin +;; ;; ;; (print "LOGIN_OK") +;; ;; (if do-exit (exit 0)) +;; ;; #t) +;; ;; (begin +;; ;; ;; (print "LOGIN_FAILED") +;; ;; (if do-exit (exit 1)) +;; ;; #f)))) +;; ;; (else +;; ;; (if host:port +;; ;; (debug:print 0 *default-log-port* "ERROR: bad host:port "host:port)) +;; ;; (if do-exit +;; ;; (exit 1) +;; ;; #f))))) +;; ;; +;; ;; ;; run ping in separate process, safest way in some cases +;; ;; ;; +;; ;; (define (server:ping-server ifaceport) +;; ;; (with-input-from-pipe +;; ;; (conc (common:get-megatest-exe) " -ping " ifaceport) +;; ;; (lambda () +;; ;; (let loop ((inl (read-line)) +;; ;; (res "NOREPLY")) +;; ;; (if (eof-object? inl) +;; ;; (case (string->symbol res) +;; ;; ((NOREPLY) #f) +;; ;; ((LOGIN_OK) #t) +;; ;; (else #f)) +;; ;; (loop (read-line) inl)))))) +;; ;; +;; ;; ;; NOT USED (well, ok, reference in rpc-transport but otherwise not used). +;; ;; ;; +;; ;; (define (server:login toppath) +;; ;; (lambda (toppath) +;; ;; (set! *db-last-access* (current-seconds)) ;; might not be needed. +;; ;; (if (equal? *toppath* toppath) +;; ;; #t +;; ;; #f))) +;; ;; +;; ;; ;; timeout is hms string: 1h 5m 3s, default is 1 minute +;; ;; ;; This is currently broken. Just use the number of hours with no unit. +;; ;; ;; Default is 60 seconds. +;; ;; ;; +;; ;; (define (server:expiration-timeout) +;; ;; (let ((tmo (configf:lookup *configdat* "server" "timeout"))) +;; ;; (if (and (string? tmo) +;; ;; (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. 
Also, it doesn't belong in this logic unless the string->number is changed below +;; ;; (* 3600 (string->number tmo)) +;; ;; 600))) +;; ;; +;; ;; (define (server:get-best-guess-address hostname) +;; ;; (let ((res #f)) +;; ;; (for-each +;; ;; (lambda (adr) +;; ;; (if (not (eq? (u8vector-ref adr 0) 127)) +;; ;; (set! res adr))) +;; ;; ;; NOTE: This can fail when there is no mention of the host in /etc/hosts. FIXME +;; ;; (vector->list (hostinfo-addresses (hostname->hostinfo hostname)))) +;; ;; (string-intersperse +;; ;; (map number->string +;; ;; (u8vector->list +;; ;; (if res res (hostname->ip hostname)))) "."))) +;; ;; +;; ;; ;; (define server:sync-lock-token "SERVER_SYNC_LOCK") +;; ;; ;; (define (server:release-sync-lock) +;; ;; ;; (db:no-sync-del! *no-sync-db* server:sync-lock-token)) +;; ;; ;; (define (server:have-sync-lock?) +;; ;; ;; (let* ((have-lock-pair (db:no-sync-get-lock *no-sync-db* server:sync-lock-token)) +;; ;; ;; (have-lock? (car have-lock-pair)) +;; ;; ;; (lock-time (cdr have-lock-pair)) +;; ;; ;; (lock-age (- (current-seconds) lock-time))) +;; ;; ;; (cond +;; ;; ;; (have-lock? #t) +;; ;; ;; ((>lock-age +;; ;; ;; (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180))) +;; ;; ;; (server:release-sync-lock) +;; ;; ;; (server:have-sync-lock?)) +;; ;; ;; (else #f)))) +;; ;; +;; ;; ;; moving this here as it needs access to db and cannot be in common. 
+;; ;; ;; +;; ;; +;; ;; (define (server:get-bruteforce-syncer dbstruct #!key (fork-to-background #f) (persist-until-sync #f)) +;; ;; (debug:print "WARNING: bruteforce-syncer is called but has been disabled!") +;; ;; (lambda () +;; ;; (debug:print "WARNING: bruteforce-syncer is called but has been disabled!")) +;; ;; #;(let* ((sqlite-exe (or (get-environment-variable "MT_SQLITE3_EXE"))) ;; defined in cfg.sh +;; ;; (sync-log (or (args:get-arg "-sync-log") (conc *toppath* "/logs/sync-" (current-process-id) "-" (get-host-name) ".log"))) +;; ;; (tmp-area (common:get-db-tmp-area)) +;; ;; (tmp-db (conc tmp-area "/megatest.db")) +;; ;; (staging-file (conc *toppath* "/.megatest.db")) +;; ;; (mtdbfile (conc *toppath* "/megatest.db")) +;; ;; (lockfile (common:get-sync-lock-filepath)) +;; ;; (sync-cmd-core (conc sqlite-exe" " tmp-db " .dump | "sqlite-exe" " staging-file "&>"sync-log)) +;; ;; (sync-cmd (if fork-to-background +;; ;; (conc "/usr/bin/env NBFAKE_LOG="*toppath*"/logs/last-server-sync-"(current-process-id)".log nbfake \""sync-cmd-core" && /bin/mv -f " staging-file " " mtdbfile" \"") +;; ;; sync-cmd-core)) +;; ;; (default-min-intersync-delay 2) +;; ;; (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: default-min-intersync-delay)) +;; ;; (default-duty-cycle 0.1) +;; ;; (duty-cycle (configf:lookup-number *configdat* "server" "sync-duty-cycle" default: default-duty-cycle)) +;; ;; (last-sync-seconds 10) ;; we will adjust this to a measurement and delay last-sync-seconds * (1 - duty-cycle) +;; ;; (calculate-off-time (lambda (work-duration duty-cycle) +;; ;; (* (/ (- 1 duty-cycle) duty-cycle) last-sync-seconds))) +;; ;; (off-time min-intersync-delay) ;; adjusted in closure below. 
+;; ;; (do-a-sync +;; ;; (lambda () +;; ;; (BB> "Start do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync) +;; ;; (let* ((finalres +;; ;; (let retry-loop ((num-tries 0)) +;; ;; (if (common:simple-file-lock lockfile) +;; ;; (begin +;; ;; (cond +;; ;; ((not (or fork-to-background persist-until-sync)) +;; ;; (debug:print 0 *default-log-port* "INFO: syncer thread sleeping for max of (server.minimum-intersync-delay="min-intersync-delay +;; ;; " , off-time="off-time" seconds ]") +;; ;; (thread-sleep! (max off-time min-intersync-delay))) +;; ;; (else +;; ;; (debug:print 0 *default-log-port* "INFO: syncer thread NOT sleeping ; maybe time-to-exit..."))) +;; ;; +;; ;; (if (not (configf:lookup *configdat* "server" "disable-db-snapshot")) +;; ;; (common:snapshot-file mtdbfile subdir: ".db-snapshot")) +;; ;; (delete-file* staging-file) +;; ;; (let* ((start-time (current-milliseconds)) +;; ;; (res (system sync-cmd)) +;; ;; (dbbackupfile (conc mtdbfile ".backup")) +;; ;; (res2 +;; ;; (cond +;; ;; ((eq? 0 res ) +;; ;; (handle-exceptions +;; ;; exn +;; ;; #f +;; ;; (if (file-exists? dbbackupfile) +;; ;; (delete-file* dbbackupfile) +;; ;; ) +;; ;; (if (eq? 0 (file-size sync-log)) +;; ;; (delete-file* sync-log)) +;; ;; (system (conc "/bin/mv " staging-file " " mtdbfile)) +;; ;; +;; ;; (set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000)) +;; ;; (set! off-time (calculate-off-time +;; ;; last-sync-seconds +;; ;; (cond +;; ;; ((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1)) +;; ;; duty-cycle) +;; ;; (else +;; ;; (debug:print 0 *default-log-port* "WARNING: ["(common:human-time)"] server.sync-duty-cycle is invalid. Should be a number between 0 and 1, but "duty-cycle" was specified. 
Using default value: "default-duty-cycle) +;; ;; default-duty-cycle)))) +;; ;; +;; ;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec") +;; ;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" SYNC took "last-sync-seconds" sec ; with duty-cycle of "duty-cycle" off time is now "off-time) +;; ;; 'sync-completed)) +;; ;; (else +;; ;; (system (conc "/bin/cp "sync-log" "sync-log".fail")) +;; ;; (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail") +;; ;; (if (file-exists? (conc mtdbfile ".backup")) +;; ;; (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile))) +;; ;; #f)))) +;; ;; (common:simple-file-release-lock lockfile) +;; ;; (BB> "released lockfile: " lockfile) +;; ;; (when (common:file-exists? lockfile) +;; ;; (BB> "DID NOT ACTUALLY RELEASE LOCKFILE")) +;; ;; res2) ;; end let +;; ;; );; end begin +;; ;; ;; else +;; ;; (cond +;; ;; (persist-until-sync +;; ;; (thread-sleep! 1) +;; ;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; we're in a fork-to-background so we need to succeed. Let's wait a jiffy and and try again. num-tries="num-tries" (waiting for lockfile="lockfile" to disappear)") +;; ;; (retry-loop (add1 num-tries))) +;; ;; (else +;; ;; (thread-sleep! 
(max off-time (+ last-sync-seconds min-intersync-delay))) +;; ;; (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] pid="(current-process-id)" other SYNC in progress; not syncing.") +;; ;; 'parallel-sync-in-progress)) +;; ;; ) ;; end if got lockfile +;; ;; ) +;; ;; )) +;; ;; (BB> "End do-a-sync with fork-to-background="fork-to-background" persist-until-sync="persist-until-sync" and result="finalres) +;; ;; finalres) +;; ;; ) ;; end lambda +;; ;; )) +;; ;; do-a-sync)) +;; ;; +;; ;; + +) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -22,11 +22,11 @@ (import (prefix sqlite3 sqlite3:)) (declare (unit tasks)) (declare (uses dbfile)) (declare (uses db)) -(declare (uses rmt)) +(declare (uses rmtmod)) (declare (uses common)) (declare (uses pgdb)) (import dbfile) ;; (import pgdb) ;; pgdb is a module Index: tcmt.scm ================================================================== --- tcmt.scm +++ tcmt.scm @@ -27,11 +27,11 @@ (use trace) ;; (trace-call-sites #t) (declare (uses margs)) -(declare (uses rmt)) +(declare (uses rmtmod)) (declare (uses common)) ;; (declare (uses megatest-version)) (include "megatest-version.scm") (include "megatest-fossil-hash.scm") Index: tdb.scm ================================================================== --- tdb.scm +++ tdb.scm @@ -29,11 +29,11 @@ (declare (unit tdb)) (declare (uses common)) (declare (uses keys)) (declare (uses ods)) -(declare (uses client)) +(declare (uses clientmod)) (declare (uses mt)) (declare (uses db)) (include "common_records.scm") (include "db_records.scm") Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -30,11 +30,11 @@ (declare (uses commonmod)) ;; (declare (uses dcommon)) ;; needed for the steps processing (declare (uses items)) (declare (uses runconfig)) ;; (declare (uses sdb)) -(declare (uses server)) +(declare (uses servermod)) ;;(declare (uses stml2)) (use 
sqlite3 srfi-1 posix regex regex-case srfi-69 dot-locking tcp directory-utils) (import (prefix sqlite3 sqlite3:)) (import commonmod) Index: tree.scm ================================================================== --- tree.scm +++ tree.scm @@ -30,11 +30,11 @@ (declare (uses margs)) (declare (uses launch)) ;; (declare (uses megatest-version)) (declare (uses gutils)) (declare (uses db)) -(declare (uses server)) +;; (declare (uses server)) ;; (declare (uses synchash)) (declare (uses dcommon)) (include "megatest-version.scm") (include "common_records.scm") ADDED ulex/dbmgr.scm Index: ulex/dbmgr.scm ================================================================== --- /dev/null +++ ulex/dbmgr.scm @@ -0,0 +1,1131 @@ +;;====================================================================== +;; Copyright 2022, Matthew Welland. +;; +;; This file is part of Megatest. +;; +;; Megatest is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. +;; +;; Megatest is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with Megatest. If not, see <http://www.gnu.org/licenses/>.
+
+;;======================================================================
+
+(declare (unit dbmgrmod))
+
+(declare (uses ulex))
+(declare (uses apimod))
+(declare (uses pkts))
+(declare (uses commonmod))
+(declare (uses dbmod))
+(declare (uses mtargs))
+(declare (uses portloggermod))
+(declare (uses debugprint))
+
+(module dbmgrmod
+  *
+
+(import scheme
+  chicken.base
+  chicken.condition
+  chicken.file
+  chicken.format
+  chicken.port
+  chicken.process
+  chicken.process-context
+  chicken.process-context.posix
+  chicken.sort
+  chicken.string
+  chicken.time
+
+  (prefix sqlite3 sqlite3:)
+  matchable
+  md5
+  message-digest
+  regex
+  s11n
+  srfi-1
+  srfi-18
+  srfi-69
+  system-information
+  typed-records
+
+  pkts
+  ulex
+
+  commonmod
+  apimod
+  dbmod
+  debugprint
+  (prefix mtargs args:)
+  portloggermod
+  )
+
+;; Configurations for server
+;; (tcp-buffer-size 2048)
+;; (max-connections 2048)
+
+;; info about me as a listener and my connections to db servers
+;; stored (for now) in *db-serv-info*
+;;
+(defstruct servdat
+  (host #f) ;; joined with port by servdat->url
+  (port #f)
+  (uuid #f)
+  (dbfile #f)
+  (uconn #f) ;; this is the listener *FOR THIS PROCESS*
+  (mode #f) ;; rmt:open-main-connection sets this to 'non-db once its own listener is up
+  (status 'starting)
+  (trynum 0) ;; count the number of ports we've tried
+  (conns (make-hash-table)) ;; apath/dbname => conndat (key is the db:dbname->path result, see rmt:get-conn)
+  )
+
+(define *db-serv-info* (make-servdat)) ;; created with every field at its defstruct default
+
+(define (servdat->url sdat) ;; conc of host, ":" and port
+  (conc (servdat-host sdat)":"(servdat-port sdat)))
+
+;; db servers contact info
+;;
+(defstruct conndat
+  (apath #f)
+  (dbname #f)
+  (fullname #f) ;; rmt:open-main-connection fills this from (db:dbname->path apath dbname)
+  (hostport #f) ;; rmt:open-main-connection fills this from (server-address the-srv)
+  (ipaddr #f)
+  (port #f)
+  (srvpkt #f)
+  (srvkey #f)
+  (lastmsg 0)
+  (expires 0)) ;; compared with (current-seconds); rmt:open-main-connection drops and re-opens the conn once reached
+
+(define *srvpktspec* ;; handed to get-all-server-pkts; presumably (field . short-key) pairs for the pkts reader - TODO confirm
+  `((server (host . h)
+    (port . p)
+    (servkey . k)
+    (pid . i)
+    (ipaddr . a)
+    (dbpath . d))))
+
+;;======================================================================
+;; S U P P O R T F U N C T I O N S
+;;======================================================================
+
+;; set up the api proc, seems like there should be a better place for this?
;;
;; IS THIS NEEDED ANYMORE? TODO - REMOVE IF POSSIBLE
;;
;; (define api-proc (make-parameter conc))
;; (api-proc api:execute-requests)

;; Look up the cached conndat for apath/dbname in remdat (a servdat).
;; Returns the conndat or #f.  This does NOT check expiry and does NOT
;; open a connection; see rmt:open-main-connection and
;; rmt:general-open-connection for that.
;;
(define (rmt:get-conn remdat apath dbname)
  (let* ((fullname (db:dbname->path apath dbname)))
    (hash-table-ref/default (servdat-conns remdat) fullname #f)))

;; Forget the cached conndat for apath/dbname (no-op if none is cached).
;;
(define (rmt:drop-conn remdat apath dbname)
  (let* ((fullname (db:dbname->path apath dbname)))
    (hash-table-delete! (servdat-conns remdat) fullname)))

;; Read the server pkts for apath, keep those registered for dbname and
;; return the first one that answers a ping, or #f if none does.
;;
(define (rmt:find-main-server uconn apath dbname)
  (let* ((pktsdir     (get-pkts-dir apath))
         (all-srvpkts (get-all-server-pkts pktsdir *srvpktspec*))
         (viable-srvs (get-viable-servers all-srvpkts dbname)))
    (get-the-server uconn apath viable-srvs)))


;; Serializes attempts to launch a main server from this process.
(define *connstart-mutex* (make-mutex))
;; (current-seconds) of the last launch attempt made by this process.
(define *last-main-start* 0)

;; Ensure remdat holds an unexpired connection record for the main db
;; (.db/main.db) of area apath.  Returns #t.
;;
;;  - if this process has no listener yet, start one (rmt:run in a
;;    thread, 'non-db mode) and retry
;;  - if a record exists and has not expired, done
;;  - if a record exists but expired, drop it and retry
;;  - otherwise find a live main server via the server pkts, launching
;;    one if none is found, and record it
;;
;; connections for other servers happens by requesting from main
;;
;; TODO: This is unnecessarily re-creating the record in the hash table
;; NOTE(review): the retries are unbounded recursion; see the TODO in
;; start-main-srv.
;;
(define (rmt:open-main-connection remdat apath)
  (let* ((fullpath (db:dbname->path apath ".db/main.db"))
         (conns    (servdat-conns remdat))
         (conn     (rmt:get-conn remdat apath ".db/main.db")) ;; (hash-table-ref/default conns fullpath #f)) ;; TODO - create call for this
         (start-rmt:run (lambda ()
                          (let* ((th1 (make-thread (lambda ()(rmt:run (get-host-name))) "non-db mode server")))
                            (thread-start! th1)
                            (thread-sleep! 1)
                            ;; poll (up to ~30s) until rmt:run has published its listener
                            (let loop ((count 0))
                              (assert (< count 30) "FATAL: responder failed to initialize in rmt:open-main-connection")
                              (if (or (not *db-serv-info*)
                                      (not (servdat-uconn *db-serv-info*)))
                                  (begin
                                    (thread-sleep! 1)
                                    (loop (+ count 1)))
                                  (begin
                                    (servdat-mode-set! *db-serv-info* 'non-db)
                                    (servdat-uconn *db-serv-info*)))))))
         (myconn    (servdat-uconn *db-serv-info*)))
    (cond
     ((not myconn)
      (start-rmt:run)
      (rmt:open-main-connection remdat apath))
     ((and conn                                             ;; conn is NOT a socket, just saying ...
           (< (current-seconds) (conndat-expires conn)))
      #t) ;; we are current and good to go - we'll deal elsewhere with a server that was killed or died
     ((and conn
           (>= (current-seconds)(conndat-expires conn)))
      (debug:print-info 0 *default-log-port* "connection to "fullpath" server expired. Reconnecting.")
      (rmt:drop-conn remdat apath ".db/main.db") ;;
      (rmt:open-main-connection remdat apath))
     (else
      ;; Below we will find or create and connect to main
      (debug:print-info 0 *default-log-port* "rmt:open-main-connection - starting from scratch")
      (let* ((dbname         (db:run-id->dbname #f))
             (the-srv        (rmt:find-main-server myconn apath dbname))
             (start-main-srv (lambda () ;; call IF there is no the-srv found
                               (mutex-lock! *connstart-mutex*)
                               ;; only launch if more than 5 seconds have passed since
                               ;; the last launch attempt; otherwise just wait briefly
                               (if (> (- (current-seconds) *last-main-start*) 5)
                                   (begin
                                     (api:run-server-process apath dbname)
                                     (set! *last-main-start* (current-seconds))
                                     (thread-sleep! 1))
                                   (thread-sleep! 0.25))
                               (mutex-unlock! *connstart-mutex*)
                               (rmt:open-main-connection remdat apath) ;; TODO: Add limit to number of tries
                               )))
        (if (not the-srv) ;; no live main server found, start one and retry
            (start-main-srv)
            ;; have a live server, record how to reach it
            (let* ((srv-addr (server-address the-srv))  ;; need serv
                   (ipaddr   (alist-ref 'ipaddr  the-srv))
                   (port     (alist-ref 'port    the-srv))
                   (srvkey   (alist-ref 'servkey the-srv))
                   (fullpath (db:dbname->path apath dbname))

                   (new-the-srv (make-conndat
                                 apath:    apath
                                 dbname:   dbname
                                 fullname: fullpath
                                 hostport: srv-addr
                                 ;; socket: (open-nn-connection srv-addr) - TODO - open ulex connection?
                                 ipaddr:   ipaddr
                                 port:     port
                                 srvpkt:   the-srv
                                 srvkey:   srvkey ;; generated by rmt:get-signature on the server side
                                 lastmsg:  (current-seconds)
                                 expires:  (+ (current-seconds)
                                              (server:expiration-timeout)
                                              -2) ;; this needs to be gathered during the ping
                                 )))
              (hash-table-set! conns fullpath new-the-srv)))
        #t)))))

;; Ensure sinfo holds a connection record for a non-main db (dbname) of
;; area apath, asking the main server ('get-server) to supply or start
;; the server for it.  Returns #t; exits the process if the server could
;; not be started within num-tries attempts.
;;
;; NB// sinfo is a servdat struct
;;
;; NOTE(review): when dconn exists but has expired (and mconn exists) no
;; cond clause matches and #t is returned without refreshing the record.
;;
(define (rmt:general-open-connection sinfo apath dbname #!key (num-tries 5))
  (assert (not (equal? dbname ".db/main.db")) "ERROR: general-open-connection should never be called with main as the db")
  (let* ((mdbname  ".db/main.db") ;; (db:run-id->dbname #f)) TODO: put this back to the lookup when stable
         (fullname (db:dbname->path apath dbname))
         (conns    (servdat-conns sinfo))
         (mconn    (rmt:get-conn sinfo apath ".db/main.db"))
         (dconn    (rmt:get-conn sinfo apath dbname)))
    #;(if (and mconn
             (not (debug:print-logger)))
        (begin
          (debug:print-info 0 *default-log-port* "Turning on logging to main, look in logs dir for main log.")
          (debug:print-logger rmt:log-to-main)))
    (cond
     ((and mconn
           dconn
           (< (current-seconds)(conndat-expires dconn)))
      #t) ;; good to go
     ((not mconn) ;; no channel open to main? open it...
      (rmt:open-main-connection sinfo apath)
      (rmt:general-open-connection sinfo apath dbname num-tries: (- num-tries 1)))
     ((not dconn)                 ;; no channel open to dbname?
      (let* ((res (rmt:send-receive-real sinfo apath mdbname 'get-server `(,apath ,dbname))))
        (case res
          ((server-started)
           (if (> num-tries 0)
               (begin
                 (thread-sleep! 2)
                 (rmt:general-open-connection sinfo apath dbname num-tries: (- num-tries 1)))
               (begin
                 (debug:print-error 0 *default-log-port* "Failed to start servers needed or open channel to "apath", "dbname)
                 (exit 1))))
          (else
           (if (list? res) ;; server has been registered and the info was returned. pass it on.
               (begin ;;  ("192.168.0.9" 53817
                 ;;  "5e34239f48e8973b3813221e54701a01" "24310"
                 ;;  "192.168.0.9"
                 ;;  "/home/matt/data/megatest/tests/simplerun"
                 ;;  ".db/1.db")
                 (match
                  res
                  ((host port servkey pid ipaddr apath dbname)
                   (debug:print-info 0 *default-log-port* "got "res)
                   (hash-table-set! conns
                                    fullname
                                    (make-conndat
                                     apath: apath
                                     dbname: dbname
                                     hostport: (conc host":"port)
                                     ;; socket: (open-nn-connection (conc host":"port)) ;; TODO - open ulex connection?
                                     ipaddr: ipaddr
                                     port: port
                                     srvkey: servkey
                                     lastmsg: (current-seconds)
                                     expires: (+ (current-seconds)
                                                 (server:expiration-timeout)
                                                 -2))))
                  (else
                   (debug:print-info 0 *default-log-port* "return data from starting server did not match host port servkey pid ipaddr apath dbname " res)))
                 res)
               (begin
                 (debug:print-info 0 *default-log-port* "Unexpected result: " res)
                 res)))))))
    #t))

;;======================================================================

;; FOR DEBUGGING SET TO #t
;; (define *localmode* #t)
(define *localmode* #f)
(define *dbstruct* (make-dbr:dbstruct))

;; Send cmd with params to the server for run id rid (#f => main db) in
;; the current area (*toppath*) and return the reply.  In *localmode*
;; the request is executed in-process instead.
;;
;; attemptnum and area-dat are accepted but not used here.
;;
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f))
  (let* ((apath      *toppath*)
         (sinfo      *db-serv-info*)
         (dbname     (db:run-id->dbname rid)))
    (if *localmode*
        (api:execute-requests *dbstruct* cmd params)
        (begin
          (rmt:open-main-connection sinfo apath)
          (if rid (rmt:general-open-connection sinfo apath dbname))
          #;(if (not (member cmd '(log-to-main)))
              (debug:print-info 0 *default-log-port* "rmt:send-receive "cmd" params="params))
          (rmt:send-receive-real sinfo apath dbname cmd params)))))

;; Low-level send: the connection record for apath/dbname must already
;; be in sinfo (asserted), and the caller must not be the primordial
;; thread (asserted).  Bumps the record's expiry after the exchange and
;; returns the reply.
;;
;; db is at apath/.db/dbname, rid is an intermediary solution and will be removed
;; sometime in the future
;;
(define (rmt:send-receive-real sinfo apath dbname cmd params)
  (assert (not (eq? 'primordial (thread-name (current-thread)))) "FATAL: Do not call rmt:send-receive-real in the primodial thread.")
  (let* ((cdat (rmt:get-conn sinfo apath dbname)))
    (assert cdat "FATAL: rmt:send-receive-real called without the needed channels opened")
    (let* ((uconn (servdat-uconn sinfo)) ;; get the interface to ulex
           ;; then send-receive using the ulex layer to host-port stored in cdat
           (res (send-receive uconn (conndat-hostport cdat) cmd params))
           #;(th1 (make-thread (lambda ()
                               (set! res (send-receive uconn (conndat-hostport cdat) cmd params)))
                             "send-receive thread")))
      ;; (thread-start! th1)
      ;; (thread-join! th1)   ;; gratuitious thread stuff is so that mailbox is not used in primordial thead
      ;; since we accessed the server we can bump the expires time up
      (conndat-expires-set! cdat (+ (current-seconds)
                                    (server:expiration-timeout)
                                    -10)) ;; ten second margin for network time misalignments etc.
      res)))

;; db is at apath/.db/dbname, rid is an intermediary solution and will be removed
;; sometime in the future.
;;
;; Purpose - call the main.db server and request a server be started
;; for the given area path and dbname
;;

;; Log (at debug level 18) one line per command in *db-stats*: count,
;; total time and average, sorted by count, highest first.
;;
(define (rmt:print-db-stats)
  (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f"
    (debug:print 18 *default-log-port* "DB Stats, "(seconds->year-week/day-time (current-seconds))"\n=====================")
    (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg"))
    (for-each (lambda (cmd)
                (let ((cmd-dat (hash-table-ref *db-stats* cmd)))
                  (debug:print 18 *default-log-port* (format #f fmtstr cmd (vector-ref cmd-dat 0) (vector-ref cmd-dat 1) (/ (vector-ref cmd-dat 1)(vector-ref cmd-dat 0))))))
              (sort (hash-table-keys *db-stats*)
                    (lambda (a b)
                      (> (vector-ref (hash-table-ref *db-stats* a) 0)
                         (vector-ref (hash-table-ref *db-stats* b) 0)))))))

;; Among the *db-stats* entries whose key contains "run-id=<run-id> ",
;; return (cmd . avg) for the one with the highest average time, or
;; (none . 0) when there are no such entries.
;;
;; NOTE(review): the "> 10 calls" threshold is applied to the LAST
;; entry examined, not to the entry holding the maximum - confirm intent.
;;
(define (rmt:get-max-query-average run-id)
  (mutex-lock! *db-stats-mutex*)
  (let* ((runkey (conc "run-id=" run-id " "))
         (cmds   (filter (lambda (x)
                           (substring-index runkey x))
                         (hash-table-keys *db-stats*)))
         (res    (if (null? cmds)
                     (cons 'none 0)
                     (let loop ((cmd (car cmds))
                                (tal (cdr cmds))
                                (max-cmd (car cmds))
                                (res 0))
                       (let* ((cmd-dat (hash-table-ref *db-stats* cmd))
                              (tot     (vector-ref cmd-dat 0))
                              (curravg (/ (vector-ref cmd-dat 1) (vector-ref cmd-dat 0))) ;; count is never zero by construction
                              (currmax (max res curravg))
                              (newmax-cmd (if (> curravg res) cmd max-cmd)))
                         (if (null? tal)
                             (if (> tot 10)
                                 (cons newmax-cmd currmax)
                                 (cons 'none 0))
                             (loop (car tal)(cdr tal) newmax-cmd currmax)))))))
    (mutex-unlock! *db-stats-mutex*)
    res))

;; Server-side shutdown: final sync, close the db handle, then either
;; remove our pkt and release the main.db lock (main server) or
;; deregister from main (run-id server).
;;
;; host and port are used to ensure we are remove proper records
;;
(define (rmt:server-shutdown host port)
  (let ((dbfile   (servdat-dbfile *db-serv-info*)))
    (debug:print-info 0 *default-log-port* "dbfile is "dbfile)
    (if dbfile
        (let* ((am-server  (args:get-arg "-server"))
               (dbfile     (args:get-arg "-db")) ;; NB// shadows the servdat dbfile tested above
               (apath      *toppath*)
               #;(sinfo      *remotedat*)) ;; foundation for future fix
          (if *dbstruct-db*
              (let* ((dbdat      (db:get-dbdat *dbstruct-db* apath dbfile))
                     (db         (dbr:dbdat-db dbdat))
                     ;; TODO: this should be the in-memory handle, but the
                     ;; accessor for it is not known here, so for now it is
                     ;; the SAME handle as db.
                     (inmem      (dbr:dbdat-db dbdat)))
                ;; do a final sync here
                (debug:print-info 0 *default-log-port* "Doing final sync for "apath" "dbfile" at "(current-seconds))
                (db:sync-inmem->disk *dbstruct-db* apath dbfile force-sync: #t)
                ;; let's finalize here
                (debug:print-info 0 *default-log-port* "Finalizing db and inmem")
                (if (sqlite3:database? db)
                    (sqlite3:finalize! db)
                    (debug:print-info 0 *default-log-port* "in rmt:server-shutdown, db is not a database, not finalizing..."))
                (cond
                 ((eq? inmem db) #f) ;; same handle, finalized just above - never finalize twice
                 ((sqlite3:database? inmem)
                  (sqlite3:finalize! inmem))
                 (else
                  (debug:print-info 0 *default-log-port* "in rmt:server-shutdown, inmem is not a database, not finalizing...")))
                (debug:print-info 0 *default-log-port* "Finalizing db and inmem complete"))
              (debug:print-info 0 *default-log-port* "Db was never opened, no cleanup to do."))
          (if (not am-server)
              (debug:print-info 0 *default-log-port* "I am not a server, should NOT get here!")
              ;; NOTE(review): dbfile comes from -db here; string-match will
              ;; raise if -db was not given - confirm that cannot happen.
              (if (string-match ".*/main.db$" dbfile)
                  (let ((pkt-file (conc (get-pkts-dir *toppath*)
                                        "/" (servdat-uuid *db-serv-info*)
                                        ".pkt")))
                    (debug:print-info 0 *default-log-port* "removing pkt "pkt-file)
                    (delete-file* pkt-file)
                    (debug:print-info 0 *default-log-port* "Releasing lock (if any) for "dbfile ", host "host", port "port)
                    (db:with-lock-db
                     (servdat-dbfile *db-serv-info*)
                     (lambda (dbh dbfile)
                       (db:release-lock dbh dbfile host port)))) ;; I'm not the server - should not have a lock to remove
                  (let* ((sdat *db-serv-info*) ;; we have a run-id server
                         (host (servdat-host sdat))
                         (port (servdat-port sdat))
                         (uuid (servdat-uuid sdat))
                         (res  (rmt:deregister-server *db-serv-info* *toppath* host port uuid dbfile)))
                    (debug:print-info 0 *default-log-port* "deregistered-server, res="res)
                    (debug:print-info 0 *default-log-port* "deregistering server "host":"port" with uuid "uuid)
                    )))))))


;; True (the value of the -server argument) when running as a server.
;;
(define (common:run-sync?)
  ;; (and (common:on-homehost?)
  (args:get-arg "-server"))

;; Guard so that rmt:run only ever runs once per process.
(define *rmt:run-mutex* (make-mutex))
(define *rmt:run-flag* #f)

;; Main entry point to start a server. was start-server
;;
;; Starts the ulex listener for this process on a port obtained from the
;; portlogger, publishes host/port/uconn in *db-serv-info*, then blocks
;; in wait-and-close until the listener is closed.  A second call while
;; one is already running only logs a warning.
;;
(define (rmt:run hostn)
  (mutex-lock! *rmt:run-mutex*)
  (if *rmt:run-flag*
      (begin
        (debug:print-warn 0 *default-log-port* "rmt:run already running.")
        (mutex-unlock! *rmt:run-mutex*))
      (begin
        (set! *rmt:run-flag* #t)
        (mutex-unlock! *rmt:run-mutex*)
        ;;  ;; Configurations for server
        ;;  (tcp-buffer-size 2048)
        ;;  (max-connections 2048)
        (debug:print 2 *default-log-port* "PID: "(current-process-id)". Attempting to start the server ...")
        (if (and *db-serv-info*
                 (servdat-uconn *db-serv-info*))
            (let* ((uconn (servdat-uconn *db-serv-info*)))
              (wait-and-close uconn))
            (let* ((port            (portlogger:open-run-close portlogger:find-port))
                   ;; called by the listener for every incoming request
                   (handler-proc    (lambda (rem-host-port qrykey cmd params) ;;
                                      (set! *db-last-access* (current-seconds))
                                      (assert (list? params) "FATAL: handler called with non-list params")
                                      (assert (args:get-arg "-server") "FATAL: handler called on non-server side. cmd="cmd", params="params)
                                      (debug:print 0 *default-log-port* "handler call: "cmd", params="params)
                                      (api:execute-requests *dbstruct-db* cmd params))))
              ;; (api:process-request *dbstuct-db*
              (if (not *db-serv-info*)
                  (set! *db-serv-info* (make-servdat host: hostn port: port)))
              (let* ((uconn (run-listener handler-proc port))
                     (rport (udat-port uconn))) ;; the real port
                (servdat-host-set! *db-serv-info* hostn)
                (servdat-port-set! *db-serv-info* rport)
                (servdat-uconn-set! *db-serv-info* uconn)
                (wait-and-close uconn)
                (db:print-current-query-stats)
                )))
        (let* ((host (servdat-host *db-serv-info*))
               (port (servdat-port *db-serv-info*))
               (mode (or (servdat-mode *db-serv-info*)
                         "non-db")))
          ;; server exit stuff here
          ;; (rmt:server-shutdown host port) - always do in on-exit
          ;; (portlogger:open-run-close portlogger:set-port port "released") ;; moved to on-exit
          (debug:print-info 0 *default-log-port* "Server "host":"port" mode "mode" shutdown complete. Exiting")
          ))))

;;======================================================================
;; S E R V E R   U T I L I T I E S
;;======================================================================


;;======================================================================
;;  NEW SERVER METHOD
;;======================================================================

;; Try to take the "I am the server" lock recorded in dbfile for
;; host:port.  Returns a true value only when this process holds the
;; lock afterwards, #f otherwise.
;;
;; db:get-iam-server-lock is expected to return:
;;   list (owner_pid owner_host owner_port event_time) => already locked
;;   #t                                                => got the lock
;;   anything else                                     => failure, treated as #f
;;
;; When the lock is held by another owner that no longer answers a ping
;; the lock is stolen and the result of db:steal-lock-db is returned
;; (presumably true on success - confirm against db:steal-lock-db).
;;
;; only use for main.db - need to re-write some of this :(
;;
(define (get-lock-db sdat dbfile host port)
  (assert host "FATAL: get-lock-db called with host not set.")
  (assert port "FATAL: get-lock-db called with port not set.")
  (let* ((dbh   (db:open-run-db dbfile db:initialize-db)) ;; open-run-db creates a standard db with schema used by all situations
         (res   (db:get-iam-server-lock dbh dbfile host port))
         (uconn (servdat-uconn sdat))
         ;; The value of the match is the answer.  It must be captured
         ;; here: returning res itself would hand back the (truthy)
         ;; owner list when somebody else holds the lock.
         (got-lock
          (match res
            ((owner_pid owner_host owner_port event_time)
             (cond
              ((equal? (conc owner_host":"owner_port) (conc host":"port))
               #t) ;; the lock record names us, we already hold it
              ((server-ready? uconn (conc owner_host":"owner_port) "abc")
               #f) ;; locked by someone else
              (else ;; locked by someone dead and gone
               (debug:print 0 *default-log-port* "WARNING: stale lock - have to steal it. This may fail.")
               (db:steal-lock-db dbh dbfile port))))
            (#t #t)
            (_  #f))))
    (sqlite3:finalize! dbh)
    got-lock))


;; Write a server pkt describing this server into pkts-dir and return
;; the uuid that write-alist->pkt gives back for it.
;;
(define (register-server pkts-dir pkt-spec host port servkey ipaddr dbpath)
  (let* ((pkt-dat `((host    . ,host)
                    (port    . ,port)
                    (servkey . ,servkey)
                    (pid     . ,(current-process-id))
                    (ipaddr  . ,ipaddr)
                    (dbpath  . ,dbpath)))
         (uuid    (write-alist->pkt
                   pkts-dir
                   pkt-dat
                   pktspec: pkt-spec
                   ptype: 'server)))
    (debug:print 0 *default-log-port* "Server on "host":"port" registered in pkt "uuid)
    uuid))

;; Return <toppath>/.meta/srvpkts, creating the directory if needed.
;;
;; NOTE(review): *toppath* takes precedence over the apath argument;
;; apath is only used when *toppath* is not set - confirm that is intended.
;;
(define (get-pkts-dir #!optional (apath #f))
  (let* ((effective-toppath (or *toppath* apath)))
    (assert effective-toppath
            "ERROR: get-pkts-dir called without *toppath* set. Exiting.")
    (let* ((pdir (conc effective-toppath "/.meta/srvpkts")))
      (if (file-exists? pdir)
          pdir
          (begin
            (handle-exceptions ;; this exception handler should NOT be needed but ...
                exn
              pdir
              (create-directory pdir #t))
            pdir)))))

;; Read every *.pkt file in pktsdir-in (created if missing) and return
;; the list of alists produced by read-pkt->alist.
;;
(define (get-all-server-pkts pktsdir-in pktspec)
  (let* ((pktsdir  (if (file-exists? pktsdir-in)
                       pktsdir-in
                       (begin
                         (create-directory pktsdir-in #t)
                         pktsdir-in)))
         (all-pkt-files (glob (conc pktsdir "/*.pkt"))))
    (map (lambda (pkt-file)
           (read-pkt->alist pkt-file pktspec: pktspec))
         all-pkt-files)))

;; "host:port" for the server described by srv-pkt.
;;
(define (server-address srv-pkt)
  (conc (alist-ref 'host srv-pkt) ":"
        (alist-ref 'port srv-pkt)))

;; Ping the server at host-port.  Returns 'ack when it answers with
;; 'ack, #f for any other reply.
;;
(define (server-ready? uconn host-port key) ;; server-address is host:port
  (let* ((params `((cmd . ping)(key . ,key)))
         (data `((cmd . ping)
                 (key . ,key)
                 (params . ,params))) ;; I don't get it.
         (res  (send-receive uconn host-port 'ping data)))
    (if (eq? res 'ack) ;; yep, likely it is who we want on the other end
        res
        #f)))
;; (begin (debug:print-info 0 *default-log-port* "server-ready? => "res) #f))))

;; from the pkts return servers associated with dbpath
;; NOTE: Only one can be alive - have to check on each
;;       in the list of pkts returned
;;
;; The result is in reverse order of serv-pkts; no sorting is done here.
;;
(define (get-viable-servers serv-pkts dbpath)
  (let loop ((tail serv-pkts)
             (res  '()))
    (if (null? tail)
        res ;; NOTE: sort by age so oldest is considered first
        (let* ((spkt (car tail)))
          (loop (cdr tail)
                (if (equal? dbpath (alist-ref 'dbpath spkt))
                    (cons spkt res)
                    res))))))

;; Keep only the pkts whose server answers a ping.  The pkt file of each
;; server that does not answer is deleted.
;;
(define (remove-pkts-if-not-alive uconn serv-pkts)
  (filter (lambda (pkt)
            (let* ((host (alist-ref 'host pkt))
                   (port (alist-ref 'port pkt))
                   (host-port (conc host":"port))
                   (key  (alist-ref 'servkey pkt))
                   (pktz (alist-ref 'Z pkt))
                   (res  (server-ready? uconn host-port key)))
              (if res
                  res
                  (let* ((pktsdir (get-pkts-dir *toppath*))
                         (pktpath (conc pktsdir"/"pktz".pkt")))
                    (debug:print 0 *default-log-port* "WARNING: pkt with no server "pktpath)
                    (delete-file* pktpath)
                    #f))))
          serv-pkts))

;; from viable servers get one that is alive and ready
;;
;; Returns the first pkt in serv-pkts whose server (addressed by its
;; ipaddr and port) answers a ping, or #f.  apath is not used.
;;
(define (get-the-server uconn apath serv-pkts)
  (let loop ((tail serv-pkts))
    (if (null? tail)
        #f
        (let* ((spkt      (car tail))
               (host      (alist-ref 'ipaddr spkt))
               (port      (alist-ref 'port spkt))
               (host-port (conc host":"port))
               (srvkey    (alist-ref 'Z spkt))) ;; (alist-ref 'srvkey spkt))
          (if (server-ready? uconn host-port srvkey)
              spkt
              (loop (cdr tail)))))))

;; am I the "first" in line server? I.e. my D card is smallest
;; use Z card as tie breaker
;;
;; Returns the pkt with the smallest D card, or #f for an empty list.
;; On equal D cards the pkt with the greater Z card is kept; on a full
;; tie the earlier pkt wins.  dbpath is not used.
;;
(define (get-best-candidate serv-pkts dbpath)
  (if (null? serv-pkts)
      #f
      (let loop ((tail (cdr serv-pkts))
                 (best (car serv-pkts)))
        (if (null? tail)
            best
            (let* ((candidate    (car tail))
                   (candidate-bd (string->number (alist-ref 'D candidate)))
                   (best-bd      (string->number (alist-ref 'D best)))
                   ;; bigger number is younger
                   (candidate-z  (alist-ref 'Z candidate))
                   (best-z       (alist-ref 'Z best)))
              (loop (cdr tail)
                    (cond
                     ((> best-bd candidate-bd) ;; best is younger than candidate
                      candidate)
                     ((< best-bd candidate-bd) ;; candidate is younger than best
                      best)
                     ((string>=? best-z candidate-z) ;; use Z card as tie breaker
                      best)
                     (else
                      candidate))))))))


;;======================================================================
;;  END NEW SERVER METHOD
;;======================================================================

;; Main-server startup handshake: wait for the listener info in
;; *db-serv-info*, write our server pkt, then decide whether we are the
;; server for db-file: we must be the best candidate among the live
;; pkts AND obtain the lock via get-lock-db.  Returns the servdat on
;; success; exits the process otherwise.
;;
;; NB// the loop variable `changed` actually carries "sdat was equal to
;; the previous sdat"; the handshake proceeds on a pass where it is #f.
;;
;; if .db/main.db check the pkts
;;
(define (rmt:wait-for-server pkts-dir db-file server-key)
  (let* ((sdat *db-serv-info*))
    (let loop ((start-time (current-seconds))
               (changed    #t)
               (last-sdat  "not this"))
      (begin ;; let ((sdat #f))
        (thread-sleep! 0.01)
        (debug:print-info 0 *default-log-port* "Waiting for server alive signature")
        (mutex-lock! *heartbeat-mutex*)
        (set! sdat *db-serv-info*)
        (mutex-unlock! *heartbeat-mutex*)
        (if (and sdat
                 (not changed)
                 (> (- (current-seconds) start-time) 2))
            (let* ((uconn (servdat-uconn sdat)))
              (servdat-status-set! sdat 'iface-stable)
              (debug:print-info 0 *default-log-port* "Received server alive signature, now attempting to lock in server")
              ;; create a server pkt in *toppath*/.meta/srvpkts

              ;; TODO:
              ;;   1. change sdat to stuct
              ;;   2. add uuid to struct
              ;;   3. update uuid in sdat here
              ;;
              (servdat-uuid-set! sdat
                                 (register-server
                                  pkts-dir *srvpktspec*
                                  (get-host-name)
                                  (servdat-port sdat) server-key
                                  (servdat-host sdat) db-file))
              ;; (set! *my-signature* (servdat-uuid sdat)) ;; replace with Z, no, stick with proper key
              ;; now read pkts and see if we are a contender
              (let* ((all-pkts     (get-all-server-pkts pkts-dir *srvpktspec*))
                     (viables      (get-viable-servers all-pkts db-file))
                     (alive        (remove-pkts-if-not-alive uconn viables))
                     (best-srv     (get-best-candidate alive db-file))
                     (best-srv-key (if best-srv (alist-ref 'servkey best-srv) #f))
                     (i-am-srv     (equal? best-srv-key server-key))
                     (delete-pkt   (lambda ()
                                     (let* ((pktfile (conc (get-pkts-dir *toppath*)
                                                           "/" (servdat-uuid *db-serv-info*)
                                                           ".pkt")))
                                       (debug:print-info 0 *default-log-port* "Attempting to remove bogus pkt file "pktfile)
                                       (delete-file* pktfile)))))   ;; remove immediately instead of waiting for on-exit
                (debug:print 0 *default-log-port* "best-srv-key: "best-srv-key", server-key: "server-key", i-am-srv: "i-am-srv)
                ;; am I the best-srv, compare server-keys to know
                (if i-am-srv
                    (if (get-lock-db sdat db-file (servdat-host sdat)(servdat-port sdat)) ;; (db:get-iam-server-lock *dbstruct-db* *toppath* run-id)
                        (begin
                          (debug:print-info 0 *default-log-port* "I'm the server!")
                          (servdat-dbfile-set! sdat db-file)
                          (servdat-status-set! sdat 'db-locked))
                        (begin
                          (debug:print-info 0 *default-log-port* "I'm not the server, exiting.")
                          (bdat-time-to-exit-set! *bdat* #t)
                          (delete-pkt)
                          (thread-sleep! 0.2)
                          (exit)))
                    (begin
                      (debug:print-info 0 *default-log-port*
                                        "Keys do not match "best-srv-key", "server-key", exiting.")
                      (bdat-time-to-exit-set! *bdat* #t)
                      (delete-pkt)
                      (thread-sleep! 0.2)
                      (exit)))
                sdat))
            (begin ;; sdat not yet contains server info
              (debug:print-info 0 *default-log-port* "Still waiting, last-sdat=" last-sdat)
              (sleep 4)
              (if (> (- (current-seconds) start-time) 120) ;; been waiting for two minutes
                  (begin
                    (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server")
                    (exit))
                  (loop start-time
                        (equal? sdat last-sdat)
                        sdat))))))))

;; Register this (run-id) server with the main server for area apath.
;; Returns the main server's reply.
;;
(define (rmt:register-server sinfo apath iface port server-key dbname)
  (servdat-conns sinfo) ;; just checking types
  (rmt:open-main-connection sinfo apath) ;; we need a channel to main.db
  (rmt:send-receive-real sinfo apath      ;; params: host port servkey pid ipaddr dbpath
                         (db:run-id->dbname #f)
                         'register-server `(,iface
                                            ,port
                                            ,server-key
                                            ,(current-process-id)
                                            ,iface
                                            ,apath
                                            ,dbname)))

;; Ask the main server for area apath how many servers it has registered.
;;
(define (rmt:get-count-servers sinfo apath)
  (servdat-conns sinfo) ;; just checking types
  (rmt:open-main-connection sinfo apath) ;; we need a channel to main.db
  (rmt:send-receive-real sinfo apath      ;; params: host port servkey pid ipaddr dbpath
                         (db:run-id->dbname #f)
                         'get-count-servers `(,apath)))

;; Ask the main server for its info on the servers of area apath.
;;
(define (rmt:get-servers-info apath)
  (rmt:send-receive 'get-servers-info #f `(,apath)))

;; Remove a server's registration from the main server for area apath.
;; Returns the main server's reply.
;;
(define (rmt:deregister-server db-serv-info apath iface port server-key dbname)
  (rmt:open-main-connection db-serv-info apath) ;; we need a channel to main.db
  (rmt:send-receive-real db-serv-info apath      ;; params: host port servkey pid ipaddr dbpath
                         (db:run-id->dbname #f)
                         'deregister-server `(,iface
                                              ,port
                                              ,server-key
                                              ,(current-process-id)
                                              ,iface
                                              ,apath
                                              ,dbname)))

;; Block until the host and port in *db-serv-info* have been set, are
;; unchanged between two consecutive polls and at least one second has
;; passed; then record the signature, mark the status 'interface-stable
;; and return #t.  Exits the process after num-tries-allowed polls.
;;
(define (rmt:wait-for-stable-interface #!optional (num-tries-allowed 100))
  ;; wait until *db-serv-info* stops changing
  (let* ((stime (current-seconds)))
    (let loop ((last-host  #f)
               (last-port  #f)
               (tries      0))
      (let* ((curr-host (and *db-serv-info* (servdat-host *db-serv-info*)))
             (curr-port (and *db-serv-info* (servdat-port *db-serv-info*))))
        ;; first we verify port and interface, update *db-serv-info* in need be.
        (cond
         ((> tries num-tries-allowed)
          (debug:print 0 *default-log-port* "rmt:wait-for-stable-interface, giving up after trying for several minutes.")
          (exit 1))
         ((not *db-serv-info*)
          (thread-sleep! 0.25)
          (loop curr-host curr-port (+ tries 1)))
         ((or (not last-host)(not last-port))
          (debug:print 0 *default-log-port* "rmt:wait-for-stable-interface, still no interface, tries="tries)
          (thread-sleep! 0.25)
          (loop curr-host curr-port (+ tries 1)))
         ((or (not (equal? last-host curr-host))
              (not (equal? last-port curr-port)))
          (debug:print-info 0 *default-log-port* "WARNING: interface changed, refreshing iface and port info")
          (thread-sleep! 0.25)
          (loop curr-host curr-port (+ tries 1)))
         ((< (- (current-seconds) stime) 1) ;; keep up the looping until at least 1 second has passed
          (thread-sleep! 0.5)
          (loop curr-host curr-port (+ tries 1)))
         (else
          (rmt:get-signature) ;; sets *my-signature* as side effect
          (servdat-status-set! *db-serv-info* 'interface-stable)
          (debug:print 0 *default-log-port*
                       "SERVER STARTED: " curr-host
                       ":" curr-port
                       " AT " (current-seconds) " server signature: " *my-signature*
                       " with "(servdat-trynum *db-serv-info*)" port changes")
          (flush-output *default-log-port*)
          #t))))))

;; run rmt:keep-running in a parallel thread to monitor that the db is being
;; used and to shutdown after sometime if it is not.
;;
;; Waits for the server to come up (rmt:wait-for-server for main,
;; rmt:wait-for-stable-interface otherwise), opens the db on the first
;; pass (registering with main when not main), then loops: sync the db
;; to disk, sleep out the rest of a ~4 second period, and exit the
;; process once *server-run* is cleared or the server has timed out.
;;
;; NOTE(review): the (timed-out?) clause comes before the clause holding
;; the "keep main open until all others gone" test, so that test is only
;; evaluated when the server has not timed out and can never postpone a
;; shutdown - confirm intent.
;; NOTE(review): the (loop ...) calls under (if (< count 1) ...) and in
;; the "registered but not alive" branch are not tail calls; the latter
;; is made while *heartbeat-mutex* is still held.
;;
(define (rmt:keep-running dbname)
  ;; if none running or if > 20 seconds since
  ;; server last used then start shutdown
  ;; This thread waits for the server to come alive
  (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server")

  (let* ((sinfo             *db-serv-info*)
         (server-start-time (current-seconds))
         (pkts-dir          (get-pkts-dir))
         (server-key        (rmt:get-signature)) ;; This servers key
         (is-main           (equal? (args:get-arg "-db") ".db/main.db"))
         (last-access       0)
         (server-timeout    (server:expiration-timeout))
         (shutdown-server-sequence (lambda (host port)
                                     (set! *unclean-shutdown* #f) ;; Should not be needed anymore
                                     (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id))
                                     ;; (rmt:server-shutdown host port) -- called in on-exit
                                     ;; (portlogger:open-run-close portlogger:set-port port "released") called in on-exit
                                     (exit)))
         (timed-out?        (lambda ()
                              (<= (+ last-access server-timeout)
                                  (current-seconds)))))
    (servdat-dbfile-set! *db-serv-info* (args:get-arg "-db"))
    ;; main and run db servers have both got wait logic (could/should merge it)
    (if is-main
        (rmt:wait-for-server pkts-dir dbname server-key)
        (rmt:wait-for-stable-interface))
    ;; this is our forever loop
    (let* ((iface (servdat-host *db-serv-info*))
           (port  (servdat-port *db-serv-info*))
           (uconn (servdat-uconn *db-serv-info*)))
      (let loop ((count          0)
                 (bad-sync-count 0)
                 (start-time     (current-milliseconds)))
        (if (and (not is-main)
                 (common:low-noise-print 60 "servdat-status"))
            (debug:print-info 0 *default-log-port* "servdat-status is " (servdat-status *db-serv-info*)))

        (mutex-lock! *heartbeat-mutex*)
        ;; set up the database handle
        (if (not *dbstruct-db*) ;; no db opened yet, open the db and register with main if appropriate
            (let ((watchdog (bdat-watchdog *bdat*)))
              (debug:print 0 *default-log-port* "SERVER: dbprep")
              (db:setup dbname) ;; sets *dbstruct-db* as side effect
              (servdat-status-set! *db-serv-info* 'db-opened)
              ;; IFF I'm not main, call into main and register self
              (if (not is-main)
                  (let ((res (rmt:register-server sinfo
                                                  *toppath* iface port
                                                  server-key dbname)))
                    (if res ;; we are the server
                        (servdat-status-set! *db-serv-info* 'have-interface-and-db)
                        ;; now check that the db locker is alive, clear it out if not
                        (let* ((serv-info (rmt:server-info *toppath* dbname)))
                          (match serv-info
                            ((host port servkey pid ipaddr apath dbpath)
                             (if (not (server-ready? uconn (conc host":"port) servkey))
                                 (begin
                                   (debug:print-info 0 *default-log-port* "Server registered but not alive. Removing and trying again.")
                                   (rmt:deregister-server sinfo apath host port servkey dbpath) ;; servkey pid ipaddr apath dbpath)
                                   (loop (+ count 1) bad-sync-count start-time))))
                            (else
                             (debug:print 0 *default-log-port* "We are not the server for "dbname", exiting. Server info is: "serv-info)
                             (exit)))))))
              (debug:print 0 *default-log-port*
                           "SERVER: running, db "dbname" opened, megatest version: "
                           (common:get-full-version))
              ;; start the watchdog

              ;; is this really needed?

              #;(if watchdog
                  (if (not (member (thread-state watchdog)
                                   '(ready running blocked
                                           sleeping dead)))
                      (begin
                        (debug:print-info 0 *default-log-port* "Starting watchdog thread (in state "(thread-state watchdog)")")
                        (thread-start! watchdog))
                      (debug:print-info 0 *default-log-port* "Not starting watchdog thread (in state "(thread-state watchdog)")"))
                  (debug:print 0 *default-log-port* "ERROR: *watchdog* not setup, cannot start it."))
              #;(loop (+ count 1) bad-sync-count start-time)
              ))

        (db:sync-inmem->disk *dbstruct-db* *toppath* dbname force-sync: #t)

        (mutex-unlock! *heartbeat-mutex*)

        ;; when things go wrong we don't want to be doing the various
        ;; queries too often so we strive to run this stuff only every
        ;; four seconds or so.
        (let* ((sync-time (- (current-milliseconds) start-time))
               (rem-time (quotient (- 4000 sync-time) 1000)))
          (if (and (<= rem-time 4)
                   (> rem-time 0))
              (thread-sleep! rem-time)))

        ;; Transfer *db-last-access* to last-access to use in checking that we are still alive
        (set! last-access *db-last-access*)

        (if (< count 1) ;; 3x3 = 9 secs aprox
            (loop (+ count 1) bad-sync-count (current-milliseconds)))

        (if (common:low-noise-print 60 "dbstats")
            (begin
              (debug:print 0 *default-log-port* "Server stats:")
              (db:print-current-query-stats)))
        (let* ((hrs-since-start  (/ (- (current-seconds) server-start-time) 3600)))
          (cond
           ((not *server-run*)
            (debug:print-info 0 *default-log-port* "*server-run* set to #f. Shutting down.")
            (shutdown-server-sequence (get-host-name) port))
           ((timed-out?)
            (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access))
            (shutdown-server-sequence (get-host-name) port))
           ((and *server-run*
                 (or (not (timed-out?))
                     (if is-main ;; do not exit if there are other servers (keep main open until all others gone)
                         (> (rmt:get-count-servers sinfo *toppath*) 1)
                         #f)))
            (if (common:low-noise-print 120 "server continuing")
                (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)))
            (loop 0 bad-sync-count (current-milliseconds)))
           (else
            (set! *unclean-shutdown* #f)
            (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access))
            (shutdown-server-sequence (get-host-name) port)
            #;(debug:print-info 0 *default-log-port* "Sending 'quit to server, received: "
                              (open-send-receive-nn (conc iface":"port)      ;; do this here and not in server-shutdown
                                                    (sexpr->string 'quit))))))))))

;; The host name to listen on: the value given to -server, or the
;; machine's host name when -server is absent or given as "-".
;;
(define (rmt:get-reasonable-hostname)
  (let* ((inhost (or (args:get-arg "-server") "-")))
    (if (equal? inhost "-")
        (get-host-name)
        inhost)))

;; Call this to start the actual server
;;
;; all routes though here end in exit ...
;;
;; This is the point at which servers are started
;;
;; Runs rmt:run and (only when -server was given) rmt:keep-running in
;; their own threads and joins both.  Returns #f.
;;
(define (rmt:server-launch dbname)
  (debug:print-info 0 *default-log-port* "Entered rmt:server-launch")
  (let* ((th2 (make-thread (lambda ()
                             (debug:print-info 0 *default-log-port* "Server run thread started")
                             (rmt:run (rmt:get-reasonable-hostname)))
                           "Server run"))
         (th3 (make-thread (lambda ()
                             (debug:print-info 0 *default-log-port* "Server monitor thread started")
                             (if (args:get-arg "-server")
                                 (rmt:keep-running dbname)))
                           "Keep running")))
    (thread-start! th2)
    (thread-sleep! 0.252) ;; give the server time to settle before starting the keep-running monitor.
    (thread-start! th3)
    (set! *didsomething* #t)
    (thread-join! th2)
    (thread-join! th3))
  #f)

;;======================================================================
;;  S E R V E R   -  D I R E C T   C A L L S
;;======================================================================

;; Ask main to kill the server for run-id.
(define (rmt:kill-server run-id)
  (rmt:send-receive 'kill-server #f (list run-id)))

;; Ask main to start a server for run-id.
(define (rmt:start-server run-id)
  (rmt:send-receive 'start-server #f (list run-id)))

;; Ask main for its record of the server for apath/dbname.
(define (rmt:server-info apath dbname)
  (rmt:send-receive 'get-server-info #f (list apath dbname)))

;;======================================================================
;; Nanomsg transport
;;======================================================================

#;(define (is-port-in-use port-num)
  (let* ((ret #f))
    (let-values (((inp oup pid)
                  (process "netstat" (list  "-tulpn" ))))
      (let loop ((inl (read-line inp)))
        (if (not (eof-object? inl))
            (begin
              (if (string-search (regexp (conc ":" port-num)) inl)
                  (begin
                    ;(print "Output: "  inl)
                    (set! ret  #t))
                  (loop (read-line inp)))))))
    ret))

#;(define (open-nn-connection host-port)
  (let ((req  (make-req-socket))
        (uri  (conc "tcp://" host-port)))
    (nng-dial req uri)
    (socket-set! req 'nng/recvtimeo 2000)
    req))

#;(define (send-receive-nn req msg)
  (nng-send req msg)
  (nng-recv req))

#;(define (close-nn-connection req)
  (nng-close! req))

;; ;; open connection to server, send message, close connection
;; ;;
;; (define (open-send-close-nn host-port msg #!key (timeout 3) ) ;; default timeout is 3 seconds
;;   (let ((req  (make-req-socket 'req))
;;         (uri  (conc "tcp://" host-port))
;;         (res  #f)
;;         ;; (contacts (alist-ref 'contact attrib))
;;         ;; (mode (alist-ref 'mode attrib))
;;         )
;;     (socket-set!
req 'nng/recvtimeo 2000) +;; (handle-exceptions +;; exn +;; (let ((emsg ((condition-property-accessor 'exn 'message) exn))) +;; ;; Send notification +;; (debug:print 0 *default-log-port* "ERROR: Failed to connect / send to " uri " message was \"" emsg "\"" ) +;; #f) +;; (nng-dial req uri) +;; ;; (print "Connected to the server " ) +;; (nng-send req msg) +;; ;; (print "Request Sent") +;; (let* ((th1 (make-thread (lambda () +;; (let ((resp (nng-recv req))) +;; (nng-close! req) +;; (set! res (if (equal? resp "ok") +;; #t +;; #f)))) +;; "recv thread")) +;; (th2 (make-thread (lambda () +;; (thread-sleep! timeout) +;; (thread-terminate! th1)) +;; "timer thread"))) +;; (thread-start! th1) +;; (thread-start! th2) +;; (thread-join! th1) +;; res)))) +;; +#;(define (open-send-receive-nn host-port msg #!key (timeout 3) ) ;; default timeout is 3 seconds + (let ((req (make-req-socket)) + (uri (conc "tcp://" host-port)) + (res #f)) + (handle-exceptions + exn + (let ((emsg ((condition-property-accessor 'exn 'message) exn))) + ;; Send notification + (debug:print 0 *default-log-port* "ERROR: Failed to connect / send to " uri " message was \"" emsg "\", exn=" exn) + #f) + (nng-dial req uri) + (nng-send req msg) + (let* ((th1 (make-thread (lambda () + (let ((resp (nng-recv req))) + (nng-close! req) + ;; (print resp) + (set! res resp))) + "recv thread")) + (th2 (make-thread (lambda () + (thread-sleep! timeout) + (thread-terminate! th1)) + "timer thread"))) + (thread-start! th1) + (thread-start! th2) + (thread-join! th1) + res)))) + +;;====================================================================== +;; S E R V E R U T I L I T I E S +;;====================================================================== + +;; run ping in separate process, safest way in some cases +;; +#;(define (server:ping-server ifaceport) + (with-input-from-pipe + (conc (common:get-megatest-exe) " -ping " ifaceport) + (lambda () + (let loop ((inl (read-line)) + (res "NOREPLY")) + (if (eof-object? 
inl) + (case (string->symbol res) + ((NOREPLY) #f) + ((LOGIN_OK) #t) + (else #f)) + (loop (read-line) inl)))))) + +;; NOT USED (well, ok, reference in rpc-transport but otherwise not used). +;; +#;(define (server:login toppath) + (lambda (toppath) + (set! *db-last-access* (current-seconds)) ;; might not be needed. + (if (equal? *toppath* toppath) + #t + #f))) + +;; (define server:sync-lock-token "SERVER_SYNC_LOCK") +;; (define (server:release-sync-lock) +;; (db:no-sync-del! *no-sync-db* server:sync-lock-token)) +;; (define (server:have-sync-lock?) +;; (let* ((have-lock-pair (db:no-sync-get-lock *no-sync-db* server:sync-lock-token)) +;; (have-lock? (car have-lock-pair)) +;; (lock-time (cdr have-lock-pair)) +;; (lock-age (- (current-seconds) lock-time))) +;; (cond +;; (have-lock? #t) +;; ((>lock-age +;; (* 3 (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 180))) +;; (server:release-sync-lock) +;; (server:have-sync-lock?)) +;; (else #f)))) + +) Index: ulex/ulex.scm ================================================================== --- ulex/ulex.scm +++ ulex/ulex.scm @@ -1,8 +1,8 @@ ;; ulex: Distributed sqlite3 db ;;; -;; Copyright (C) 2018 Matt Welland +;; Copyright (C) 2018-2021 Matt Welland ;; Redistribution and use in source and binary forms, with or without ;; modification, is permitted. ;; ;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS ;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED @@ -23,330 +23,521 @@ ;; NOTES: ;; Why sql-de-lite and not say, dbi? - performance mostly, then simplicity. ;; ;;====================================================================== -(use mailbox) - -(module ulex - * - -(import scheme posix chicken data-structures ports extras files mailbox) -(import srfi-18 pkts matchable regex - typed-records srfi-69 srfi-1 - srfi-4 regex-case - (prefix sqlite3 sqlite3:) - foreign - tcp6 - ;; ulex-netutil - hostinfo - ) - -;; make it a global? 
Well, it is local to area module - -(define *captain-pktspec* - `((captain (host . h) - (port . p) - (pid . i) - (ipaddr . a) - ) - #;(data (hostname . h) ;; sender hostname - (port . p) ;; sender port - (ipaddr . a) ;; sender ip - (hostkey . k) ;; sending host key - store info at server under this key - (servkey . s) ;; server key - this needs to match at server end or reject the msg - (format . f) ;; sb=serialized-base64, t=text, sx=sexpr, j=json - (data . d) ;; base64 encoded slln data - ))) - -;; struct for keeping track of our world - -(defstruct udat - ;; captain info - (captain-address #f) - (captain-host #f) - (captain-port #f) - (captain-pid #f) - (captain-lease 0) ;; time (unix epoc) seconds when the lease is up - (ulex-dir (conc (get-environment-variable "HOME") "/.ulex")) - (cpkts-dir (conc (get-environment-variable "HOME") "/.ulex/pkts")) - (cpkt-spec *captain-pktspec*) - ;; this processes info - (my-cpkt-key #f) ;; put Z card here when I create a pkt for myself as captain - (my-address #f) - (my-hostname #f) - (my-port #f) - (my-pid (current-process-id)) - (my-dbs '()) - ;; server and handler thread - (serv-listener #f) ;; this processes server info - (handler-thread #f) - (mboxes (make-hash-table)) ;; key => mbox - ;; other servers - (peers (make-hash-table)) ;; host-port => peer record - (dbowners (make-hash-table)) ;; dbfile => host-port - (handlers (make-hash-table)) ;; dbfile => proc - ;; (outgoing-conns (make-hash-table)) ;; host:port -> conn - (work-queue (make-queue)) ;; most stuff goes here - ;; (fast-queue (make-queue)) ;; super quick stuff goes here (e.g. ping) - (busy #f) ;; is either of the queues busy, use to switch between queuing tasks or doing immediately - ;; app info - (appname #f) - (dbtypes (make-hash-table)) ;; this should be an alist but hash is easier. 
dbtype => [ initproc syncproc ] - ;; cookies - (cnum 0) ;; cookie num - ) - -;;====================================================================== -;; NEW APPROACH -;;====================================================================== - -;; start-server-find-port ;; gotta have a server port ready from the very begining - -;; udata - all the connection info, captain, server, ulex db etc. MUST BE PASSED IN -;; dbpath - full path and filename of the db to talk to or a symbol naming the db? -;; callname - the remote call to execute -;; params - parameters to pass to the remote call -;; -(define (remote-call udata dbpath dbtype callname . params) - (start-server-find-port udata) ;; ensure we have a local server - (find-or-setup-captain udata) - ;; look at connect, process-request, send, send-receive - (let-values (((cookie-key host-port)(get-db-owner udata dbpath dbtype))) - (send-receive udata host-port callname cookie-key params))) - -;;====================================================================== -;; KEY FUNCTIONS - THESE ARE TOO BE EXPOSED AND USED -;;====================================================================== - -;; connection setup and management functions - -;; This is the basic setup command. Must always be -;; called before connecting to a db using connect. -;; -;; find or become the captain -;; setup and return a ulex object -;; -(define (find-or-setup-captain udata) - ;; see if we already have a captain and if the lease is ok - (if (and (udat-captain-address udata) - (udat-captain-port udata) - (< (current-seconds) (udat-captain-lease udata))) - udata - (let* ((cpkts (get-all-captain-pkts udata)) ;; read captain pkts - (captn (get-winning-pkt cpkts))) - (if captn - (let* ((port (alist-ref 'port captn)) - (host (alist-ref 'host captn)) - (ipaddr (alist-ref 'ipaddr captn)) - (pid (alist-ref 'pid captn)) - (Z (alist-ref 'Z captn))) - (udat-captain-address-set! udata ipaddr) - (udat-captain-host-set! udata host) - (udat-captain-port-set! 
udata port) - (udat-captain-pid-set! udata pid) - (udat-captain-lease-set! udata (+ (current-seconds) 10)) - (let-values (((success pingtime)(ping udata (conc ipaddr ":" port)))) - (if success - udata - (begin - (print "Found unreachable captain at " ipaddr ":" port ", removing pkt") - (remove-captain-pkt udata captn) - (find-or-setup-captain udata)))) - (begin - (setup-as-captain udata) ;; this saves the thread to captain-thread and starts the thread - (find-or-setup-captain udata))))))) - -;; connect to a specific dbfile -;; - if already connected - return the dbowner host-port -;; - ask the captain who to talk to for this db -;; - put the entry in the dbowners hash as dbfile => host-port -;; -(define (connect udata dbfname dbtype) - (or (hash-table-ref/default (udat-dbowners udata) dbfname #f) - (let-values (((success dbowner-host-port)(get-db-owner udata dbfname dbtype))) - (if success - (begin - ;; just clobber the record, this is the new data no matter what - (hash-table-set! (udat-dbowners udata) dbfname dbowner-host-port) - dbowner-host-port) - #f)))) - -;; returns: success pingtime -;; -;; NOTE: causes the callee to store the info on this host along with the dbs this host currently owns -;; -(define (ping udata host-port) - (let* ((start (current-milliseconds)) - (cookie (make-cookie udata)) - (dbs (udat-my-dbs udata)) - (msg (string-intersperse dbs " ")) - (res (send udata host-port 'ping cookie msg retval: #t)) - (delta (- (current-milliseconds) start))) - (values (equal? res cookie) delta))) - -;; returns: success pingtime -;; -;; NOTE: causes all references to this worker to be wiped out in the -;; callee (ususally the captain) -;; -(define (goodbye-ping udata host-port) - (let* ((start (current-milliseconds)) - (cookie (make-cookie udata)) - (dbs (udat-my-dbs udata)) - (res (send udata host-port 'goodbye cookie "nomsg" retval: #t)) - (delta (- (current-milliseconds) start))) - (values (equal? 
res cookie) delta))) - -(define (goodbye-captain udata) - (let* ((host-port (udat-captain-host-port udata))) - (if host-port - (goodbye-ping udata host-port) - (values #f -1)))) - -(define (get-db-owner udata dbname dbtype) - (let* ((host-port (udat-captain-host-port udata))) - (if host-port - (let* ((cookie (make-cookie udata)) - (msg #f) ;; (conc dbname " " dbtype)) - (params `(,dbname ,dbtype)) - (res (send udata host-port 'db-owner cookie msg - params: params retval: #t))) - (match (string-split res) - ((retcookie owner-host-port) - (values (equal? retcookie cookie) owner-host-port)))) - (values #f -1)))) - -;; called in ulex-handler to dispatch work, called on the workers side -;; calls (proc params data) -;; returns result with cookie -;; -;; pdat is the info of the caller, used to send the result data -;; prockey is key into udat-handlers hash dereferencing a proc -;; procparam is a first param handed to proc - often to do further derefrencing -;; NOTE: params is intended to be a list of strings, encoding on data -;; is up to the user but data must be a single line -;; -(define (process-request udata pdat dbname cookie prockey procparam data) - (let* ((dbrec (ulex-open-db udata dbname)) ;; this will be a dbconn record, looks for in udata first - (proc (hash-table-ref udata prockey))) - (let* ((result (proc dbrec procparam data))) - result))) - -;; remote-request - send to remote to process in process-request -;; uconn comes from a call to connect and can be used instead of calling connect again -;; uconn is the host-port to call -;; we send dbname to the worker so they know which file to open -;; data must be a string with no newlines, it will be handed to the proc -;; at the remote site unchanged. 
It is up to the user to encode/decode it's contents -;; -;; rtype: immediate, read-only, normal, low-priority -;; -(define (remote-request udata uconn rtype dbname prockey procparam data) - (let* ((cookie (make-cookie udata))) - (send-receive udata uconn rtype cookie data `(,prockey procparam)))) - -(define (ulex-open-db udata dbname) - #f) - - -;;====================================================================== -;; Ulex db -;; -;; - track who is captain, lease expire time -;; - track who owns what db, lease -;; -;;====================================================================== - -;; -;; -(define (ulex-dbfname) - (let ((dbdir (conc (get-environment-variable "HOME") "/.ulex"))) - (if (not (file-exists? dbdir)) - (create-directory dbdir #t)) - (conc dbdir "/network.db"))) - -;; always goes in ~/.ulex/network.db -;; role is captain, adjutant, node -;; -(define (ulexdb-setup) - (let* ((dbfname (ulex-dbfname)) - (have-db (file-exists? dbfname)) - (db (sqlite3:open-database dbfname))) - (sqlite3:set-busy-handler! 
db (sqlite3:make-busy-timeout 136000)) - (sqlite3:execute db "PRAGMA synchronous = 0;") - (if (not have-db) - (sqlite3:with-transaction - db - (lambda () - (for-each - (lambda (stmt) - (if stmt (sqlite3:execute db stmt))) - `("CREATE TABLE IF NOT EXISTS nodes - (id INTEGER PRIMARY KEY, - role TEXT NOT NULL, - host TEXT NOT NULL, - port TEXT NOT NULL, - ipadr TEXT NOT NULL, - pid INTEGER NOT NULL, - zcard TEXT NOT NULL, - regtime INTEGER DEFAULT (strftime('%s','now')), - lease_thru INTEGER DEFAULT (strftime('%s','now')), - last_update INTEGER DEFAULT (strftime('%s','now')));" - "CREATE TRIGGER IF NOT EXISTS update_nodes_trigger AFTER UPDATE ON nodes - FOR EACH ROW - BEGIN - UPDATE nodes SET last_update=(strftime('%s','now')) - WHERE id=old.id; - END;" - "CREATE TABLE IF NOT EXISTS dbs - (id INTEGER PRIMARY KEY, - dbname TEXT NOT NULL, - dbfile TEXT NOT NULL, - dbtype TEXT NOT NULL, - host_port TEXT NOT NULL, - regtime INTEGER DEFAULT (strftime('%s','now')), - lease_thru INTEGER DEFAULT (strftime('%s','now')), - last_update INTEGER DEFAULT (strftime('%s','now')));" - "CREATE TRIGGER IF NOT EXISTS update_dbs_trigger AFTER UPDATE ON dbs - FOR EACH ROW - BEGIN - UPDATE dbs SET last_update=(strftime('%s','now')) - WHERE id=old.id; - END;"))))) - db)) - -(define (get-host-port-lease db dbfname) - (sqlite3:fold-row - (lambda (rem host-port lease-thru) - (list host-port lease-thru)) - #f db "SELECT host_port,lease_thru FROM dbs WHERE dbfile = ?" dbfname)) - -(define (register-captain db host ipadr port pid zcard #!key (lease 20)) - (let* ((dbfname (ulex-dbfname)) - (host-port (conc host ":" port))) - (sqlite3:with-transaction - db - (lambda () - (match (get-host-port-lease db dbfname) - ((host-port lease-thru) - (if (> (current-seconds) lease-thru) - (begin - (sqlite3:execute db "UPDATE dbs SET host_port=?,lease_thru=? WHERE dbname=?" 
- (conc host ":" port) - (+ (current-seconds) lease) - dbfname) - #t) - #f)) - (#f (sqlite3:execute db "INSERT INTO dbs (dbname,dbfile,dbtype,host_port,lease_thru) VALUES (?,?,?,?,?)" - "captain" dbfname "captain" host-port (+ (current-seconds) lease))) - (else (print "ERROR: Unrecognised result from fold-row") - (exit 1))))))) - +(module ulex + * + #;( + + ;; NOTE: looking for the handler proc - find the run-listener :) + + run-listener ;; (run-listener handler-proc [port]) => uconn + + ;; NOTE: handler-proc params; + ;; (handler-proc rem-host-port qrykey cmd params) + + send-receive ;; (send-receive uconn host-port cmd data) + + ;; NOTE: cmd can be any plain text symbol except for these; + ;; 'ping 'ack 'goodbye 'response + + set-work-handler ;; (set-work-handler proc) + + wait-and-close ;; (wait-and-close uconn) + + ulex-listener? + + ;; needed to get the interface:port that was automatically found + udat-port + udat-host-port + + ;; for testing only + ;; pp-uconn + + ;; parameters + work-method ;; parameter; 'threads, 'mailbox, 'limited, 'direct + return-method ;; parameter; 'mailbox, 'polling, 'direct + ) + +(import scheme + chicken.base + chicken.file + chicken.io + chicken.time + chicken.condition + chicken.string + chicken.sort + chicken.pretty-print + + address-info + mailbox + matchable + ;; queues + regex + regex-case + simple-exceptions + s11n + srfi-1 + srfi-18 + srfi-4 + srfi-69 + system-information + tcp6 + typed-records + ) + +;; udat struct, used by both caller and callee +;; instantiated as uconn by convention +;; +(defstruct udat + ;; the listener side + (port #f) + (host-port #f) + (socket #f) + ;; the peers + (peers (make-hash-table)) ;; host:port->peer + ;; work handling + (work-queue (make-mailbox)) + (work-proc #f) ;; set by user + (cnum 0) ;; cookie number + (mboxes (make-hash-table)) ;; for the replies + (avail-cmboxes '()) ;; list of ( . 
) for re-use + ;; threads + (numthreads 10) + (cmd-thread #f) + (work-queue-thread #f) + (num-threads-running 0) + ) + +;; Parameters + +;; work-method: +(define work-method (make-parameter 'mailbox)) +;; mailbox - all rdat goes through mailbox +;; threads - all rdat immediately executed in new thread +;; direct - no queuing +;; + +;; return-method, return the result to waiting send-receive: +(define return-method (make-parameter 'mailbox)) +;; mailbox - create a mailbox and use it for passing returning results to send-receive +;; polling - put the result in a hash table keyed by qrykey and send-receive can poll it for result +;; direct - no queuing, result is passed back in single tcp connection +;; + +;; ;; struct for keeping track of others we are talking to +;; ;; +;; (defstruct pdat +;; (host-port #f) +;; (conns '()) ;; list of pcon structs, pop one off when calling the peer +;; ) +;; +;; ;; struct for peer connections, keep track of expiration etc. +;; ;; +;; (defstruct pcon +;; (inp #f) +;; (oup #f) +;; (exp (+ (current-seconds) 59)) ;; expires at this time, set to (+ (current-seconds) 59) +;; (lifetime (+ (current-seconds) 600)) ;; throw away and create new after five minutes +;; ) + +;;====================================================================== +;; listener +;;====================================================================== + +;; is uconn a ulex connector (listener) +;; +(define (ulex-listener? uconn) + (udat? uconn)) + +;; create a tcp listener and return a populated udat struct with +;; my port, address, hostname, pid etc. +;; return #f if fail to find a port to allocate. 
+;; +;; if udata-in is #f create the record +;; if there is already a serv-listener return the udata +;; +(define (setup-listener uconn #!optional (port 4242)) + (handle-exceptions + exn + (if (< port 65535) + (setup-listener uconn (+ port 1)) + #f) + (connect-listener uconn port))) + +(define (connect-listener uconn port) + ;; (tcp-listener-socket LISTENER)(socket-name so) + ;; sockaddr-address, sockaddr-port, sockaddr->string + (let* ((tlsn (tcp-listen port 1000 #f)) ;; (tcp-listen TCPPORT [BACKLOG [HOST]]) + (addr (get-my-best-address))) ;; (hostinfo-addresses (host-information (current-hostname))) + (udat-port-set! uconn port) + (udat-host-port-set! uconn (conc addr":"port)) + (udat-socket-set! uconn tlsn) + uconn)) + +;; run-listener does all the work of starting a listener in a thread +;; it then returns control +;; +(define (run-listener handler-proc #!optional (port-suggestion 4242)) + (let* ((uconn (make-udat))) + (udat-work-proc-set! uconn handler-proc) + (if (setup-listener uconn port-suggestion) + (let* ((th1 (make-thread (lambda ()(ulex-cmd-loop uconn)) "Ulex command loop")) + (th2 (make-thread (lambda () + (case (work-method) + ((mailbox limited) + (process-work-queue uconn)))) + "Ulex work queue processor"))) + ;; (tcp-buffer-size 2048) + (thread-start! th1) + (thread-start! th2) + (udat-cmd-thread-set! uconn th1) + (udat-work-queue-thread-set! uconn th2) + (print "cmd loop and process workers started, listening on "(udat-host-port uconn)".") + uconn) + (assert #f "ERROR: run-listener called without proper setup.")))) + +(define (wait-and-close uconn) + (thread-join! 
(udat-cmd-thread uconn)) + (tcp-close (udat-socket uconn))) + +;;====================================================================== +;; peers and connections +;;====================================================================== + +(define *send-mutex* (make-mutex)) + +;; send structured data to recipient +;; +;; NOTE: qrykey is what was called the "cookie" previously +;; +;; retval tells send to expect and wait for return data (one line) and return it or time out +;; this is for ping where we don't want to necessarily have set up our own server yet. +;; +;; NOTE: see below for beginnings of code to allow re-use of tcp connections +;; - I believe (without substantial evidence) that re-using connections will +;; be beneficial ... +;; +(define (send udata host-port qrykey cmd params) + (let* ((my-host-port (udat-host-port udata)) ;; remote will return to this + (isme #f #;(equal? host-port my-host-port)) ;; calling myself? + ;; dat is a self-contained work block that can be sent or handled locally + (dat (list my-host-port qrykey cmd params #;(cons (current-seconds)(current-milliseconds))))) + (cond + (isme (ulex-handler udata dat)) ;; no transmission needed + (else + (handle-exceptions ;; TODO - MAKE THIS EXCEPTION CMD SPECIFIC? + exn + (message exn) + (begin + ;; (mutex-lock! *send-mutex*) ;; DOESN'T SEEM TO HELP + (let-values (((inp oup)(tcp-connect host-port))) + (let ((res (if (and inp oup) + (begin + (serialize dat oup) + (close-output-port oup) + (deserialize inp) + ) + (begin + (print "ERROR: send called but no receiver has been setup. Please call setup first!") + #f)))) + (close-input-port inp) + ;; (mutex-unlock! 
*send-mutex*) ;; DOESN'T SEEM TO HELP + res)))))))) ;; res will always be 'ack unless return-method is direct + +(define (send-via-polling uconn host-port cmd data) + (let* ((qrykey (make-cookie uconn)) + (sres (send uconn host-port qrykey cmd data))) + (case sres + ((ack) + (let loop ((start-time (current-milliseconds))) + (if (> (current-milliseconds)(+ start-time 10000)) ;; ten seconds timeout + (begin + (print "ULEX ERROR: timed out waiting for response from "host-port", "cmd" "data) + #f) + (let* ((result (hash-table-ref/default (udat-mboxes uconn) qrykey #f))) ;; NOTE: we are re-using mboxes hash + (if result ;; result is '(status . result-data) or #f for nothing yet + (begin + (hash-table-delete! (udat-mboxes uconn) qrykey) + (cdr result)) + (begin + (thread-sleep! 0.01) + (loop start-time))))))) + (else + (print "ULEX ERROR: Communication failed? sres="sres) + #f)))) + +(define (send-via-mailbox uconn host-port cmd data) + (let* ((cmbox (get-cmbox uconn)) ;; would it be better to keep a stack of mboxes to reuse? + (qrykey (car cmbox)) + (mbox (cdr cmbox)) + (mbox-time (current-milliseconds)) + (sres (send uconn host-port qrykey cmd data))) ;; short res + (if (eq? sres 'ack) ;; BUG: change to be less than server:expiration-timeout? + (let* ((mbox-timeout-secs 120 #;(if (eq? 'primordial (thread-name (current-thread))) + #f + 120)) ;; timeout) + (mbox-timeout-result 'MBOX_TIMEOUT) + (res (mailbox-receive! mbox mbox-timeout-secs mbox-timeout-result)) + (mbox-receive-time (current-milliseconds))) + ;; (put-cmbox uconn cmbox) ;; reuse mbox and cookie. is it worth it? + (hash-table-delete! (udat-mboxes uconn) qrykey) + (if (eq? res 'MBOX_TIMEOUT) + (begin + (print "WARNING: mbox timed out for query "cmd", with data "data + ", waiting for response from "host-port".") + + ;; here it might make sense to clean up connection records and force clean start? + ;; NO. The progam using ulex needs to do the reset. 
Right thing here is exception + + #f) ;; convert to raising exception? + res)) + (begin + (print "ERROR: Communication failed? Got "sres) + #f)))) + +;; send a request to the given host-port and register a mailbox in udata +;; wait for the mailbox data and return it +;; +(define (send-receive uconn host-port cmd data) + (let* ((start-time (current-milliseconds)) + (result (cond + ((member cmd '(ping goodbye)) ;; these are immediate + (send uconn host-port 'ping cmd data)) + ((eq? (work-method) 'direct) + ;; the result from send will be the actual result, not an 'ack + (send uconn host-port 'direct cmd data)) + (else + (case (return-method) + ((polling) + (send-via-polling uconn host-port cmd data)) + ((mailbox) + (send-via-mailbox uconn host-port cmd data)) + (else + (print "ULEX ERROR: unrecognised return-method "(return-method)".") + #f))))) + (duration (- (current-milliseconds) start-time))) + ;; this is ONLY for development and debugging. It will be removed once Ulex is stable. + (if (< 5000 duration) + (print "ULEX WARNING: round-trip took "(inexact->exact (round (/ duration 1000))) + " seconds; "cmd", host-port="host-port", data="data)) + result)) + + +;;====================================================================== +;; responder side +;;====================================================================== + +;; take a request, rdat, and if not immediate put it in the work queue +;; +;; Reserved cmds; ack ping goodbye response +;; +(define (ulex-handler uconn rdat) + (assert (list? rdat) "FATAL: ulex-handler give rdat as not list") + (match rdat ;; (string-split controldat) + ((rem-host-port qrykey cmd params);; timedata) + ;; (print "ulex-handler got: "rem-host-port" qrykey: "qrykey" cmd: "cmd" params: "params) + (case cmd + ;; ((ack )(print "Got ack! But why? Should NOT get here.") 'ack) + ((ping) + ;; (print "Got Ping!") + ;; (add-to-work-queue uconn rdat) + 'ack) + ((goodbye) + ;; just clear out references to the caller. 
NOT COMPLETE + (add-to-work-queue uconn rdat) + 'ack) + ((response) ;; this is a result from remote processing, send it as mail ... + (case (return-method) + ((polling) + (hash-table-set! (udat-mboxes uconn) qrykey (cons 'ok params)) + 'ack) + ((mailbox) + (let ((mbox (hash-table-ref/default (udat-mboxes uconn) qrykey #f))) + (if mbox + (begin + (mailbox-send! mbox params) ;; params here is our result + 'ack) + (begin + (print "ERROR: received result but no associated mbox for cookie "qrykey) + 'no-mbox-found)))) + (else (print "ULEX ERROR: unrecognised return-method "(return-method)) + 'bad-return-method))) + (else ;; generic request - hand it to the work queue + (add-to-work-queue uconn rdat) + 'ack))) + (else + (print "ULEX ERROR: bad rdat "rdat) + 'bad-rdat))) + +;; given an already set up uconn start the cmd-loop +;; +(define (ulex-cmd-loop uconn) + (let* ((serv-listener (udat-socket uconn)) + (listener (lambda () + (let loop ((state 'start)) + (let-values (((inp oup)(tcp-accept serv-listener))) + ;; (mutex-lock! *send-mutex*) ;; DOESN'T SEEM TO HELP + (let* ((rdat (deserialize inp)) ;; '(my-host-port qrykey cmd params) + (resp (ulex-handler uconn rdat))) + (serialize resp oup) + (close-input-port inp) + (close-output-port oup) + ;; (mutex-unlock! *send-mutex*) ;; DOESN'T SEEM TO HELP + ) + (loop state)))))) + ;; start N of them + (let loop ((thnum 0) + (threads '())) + (if (< thnum 100) + (let* ((th (make-thread listener (conc "listener" thnum)))) + (thread-start! th) + (loop (+ thnum 1) + (cons th threads))) + (map thread-join! threads))))) + +;; add a proc to the cmd list, these are done symetrically (i.e. in all instances) +;; so that the proc can be dereferenced remotely +;; +(define (set-work-handler uconn proc) + (udat-work-proc-set! 
uconn proc)) + +;;====================================================================== +;; work queues - this is all happening on the listener side +;;====================================================================== + +;; rdat is (rem-host-port qrykey cmd params) + +(define (add-to-work-queue uconn rdat) + #;(queue-add! (udat-work-queue uconn) rdat) + (case (work-method) + ((threads) + (thread-start! (make-thread (lambda () + (do-work uconn rdat)) + "worker thread"))) + ((mailbox) + (mailbox-send! (udat-work-queue uconn) rdat)) + ((direct) + (do-work uconn rdat)) + (else + (print "ULEX ERROR: work-method "(work-method)" not recognised, using mailbox.") + (mailbox-send! (udat-work-queue uconn) rdat)))) + +;; move the logic to return the result somewhere else? +;; +(define (do-work uconn rdat) + (let* ((proc (udat-work-proc uconn))) ;; get it each time - conceivebly it could change + ;; put this following into a do-work procedure + (match rdat + ((rem-host-port qrykey cmd params) + (let* ((start-time (current-milliseconds)) + (result (proc rem-host-port qrykey cmd params)) + (end-time (current-milliseconds)) + (run-time (- end-time start-time))) + (case (work-method) + ((direct) result) + (else + (if (> run-time 1000)(print "ULEX: Warning, work "cmd", "params" done in "run-time" ms")) + ;; send 'response as cmd and result as params + (send uconn rem-host-port qrykey 'response result) ;; could check for ack + (let* ((duration (- (current-milliseconds) end-time))) + (if (> duration 500)(print "ULEX: Warning, response sent back to "rem-host-port" for "qrykey" in "duration"ms"))))))) + (MBOX_TIMEOUT 'do-work-timeout) + (else + (print "ERROR: rdat "rdat", did not match rem-host-port qrykey cmd params"))))) + +;; NEW APPROACH: +;; +(define (process-work-queue uconn) + (let ((wqueue (udat-work-queue uconn)) + (proc (udat-work-proc uconn)) + (numthr (udat-numthreads uconn))) + (let loop ((thnum 1) + (threads '())) + (let ((thlst (cons (make-thread (lambda () + (let 
work-loop () + (let ((rdat (mailbox-receive! wqueue 24000 'MBOX_TIMEOUT))) + (do-work uconn rdat)) + (work-loop))) + (conc "work thread " thnum)) + threads))) + (if (< thnum numthr) + (loop (+ thnum 1) + thlst) + (begin + (print "ULEX: Starting "(length thlst)" worker threads.") + (map thread-start! thlst) + (print "ULEX: Threads started. Joining all.") + (map thread-join! thlst))))))) + +;; below was to enable re-use of connections. This seems non-trivial so for +;; now lets open on each call +;; +;; ;; given host-port get or create peer struct +;; ;; +;; (define (udat-get-peer uconn host-port) +;; (or (hash-table-ref/default (udat-peers uconn) host-port #f) +;; ;; no peer, so create pdat and init it +;; +;; ;; NEED stack of connections, pop and use; inp, oup, +;; ;; creation_time (remove and create new if over 24hrs old +;; ;; +;; (let ((pdat (make-pdat host-port: host-port))) +;; (hash-table-set! (udat-peers uconn) host-port pdat) +;; pdat))) +;; +;; ;; is pcon alive +;; +;; ;; given host-port and pdat get a pcon +;; ;; +;; (define (pdat-get-pcon pdat host-port) +;; (let loop ((conns (pdat-conns pdat))) +;; (if (null? conns) ;; none? make and return - do NOT add - it will be pushed back on list later +;; (init-pcon (make-pcon)) +;; (let* ((conn (pop conns))) +;; +;; ;; given host-port get a pcon struct +;; ;; +;; (define (udat-get-pcon + +;;====================================================================== +;; misc utils +;;====================================================================== + +(define (make-cookie uconn) + (let ((newcnum (+ (udat-cnum uconn) 1))) + (udat-cnum-set! uconn newcnum) + (conc (udat-host-port uconn) ":" + newcnum))) + +;; cookie/mboxes + +;; we store each mbox with a cookie ( . ) +;; +(define (get-cmbox uconn) + (if (null? (udat-avail-cmboxes uconn)) + (let ((cookie (make-cookie uconn)) + (mbox (make-mailbox))) + (hash-table-set! (udat-mboxes uconn) cookie mbox) + `(,cookie . 
,mbox)) + (let ((cmbox (car (udat-avail-cmboxes uconn)))) + (udat-avail-cmboxes-set! uconn (cdr (udat-avail-cmboxes uconn))) + cmbox))) + +(define (put-cmbox uconn cmbox) + (udat-avail-cmboxes-set! uconn (cons cmbox (udat-avail-cmboxes uconn)))) + +(define (pp-uconn uconn) + (pp (udat->alist uconn))) + + ;;====================================================================== ;; network utilities ;;====================================================================== + +;; NOTE: Look at address-info egg as alternative to some of this (define (rate-ip ipaddr) (regex-case ipaddr ( "^127\\..*" _ 0 ) ( "^(10\\.0|192\\.168)\\..*" _ 1 ) @@ -354,1899 +545,26 @@ ;; Change this to bias for addresses with a reasonable broadcast value? ;; (define (ip-pref-less? a b) (> (rate-ip a) (rate-ip b))) - (define (get-my-best-address) - (let ((all-my-addresses (get-all-ips)) - ;;(all-my-addresses-old (vector->list (hostinfo-addresses (hostname->hostinfo (get-host-name))))) - ) + (let ((all-my-addresses (get-all-ips))) (cond ((null? all-my-addresses) (get-host-name)) ;; no interfaces? ((eq? (length all-my-addresses) 1) (car all-my-addresses)) ;; only one to choose from, just go with it - (else - (car (sort all-my-addresses ip-pref-less?))) - ;; (else - ;; (ip->string (car (filter (lambda (x) ;; take any but 127. - ;; (not (eq? 
(u8vector-ref x 0) 127))) - ;; all-my-addresses)))) - - ))) + (car (sort all-my-addresses ip-pref-less?)))))) (define (get-all-ips-sorted) (sort (get-all-ips) ip-pref-less?)) (define (get-all-ips) - (map ip->string (vector->list - (hostinfo-addresses - (host-information (current-hostname)))))) - -(define (udat-my-host-port udata) - (if (and (udat-my-address udata)(udat-my-port udata)) - (conc (udat-my-address udata) ":" (udat-my-port udata)) - #f)) - -(define (udat-captain-host-port udata) - (if (and (udat-captain-address udata)(udat-captain-port udata)) - (conc (udat-captain-address udata) ":" (udat-captain-port udata)) - #f)) - -(define (udat-get-peer udata host-port) - (hash-table-ref/default (udat-peers udata) host-port #f)) - -;; struct for keeping track of others we are talking to - -(defstruct peer - (addr-port #f) - (hostname #f) - (pid #f) - ;; (inp #f) - ;; (oup #f) - (dbs '()) ;; list of databases this peer is currently handling - ) - -(defstruct work - (peer-dat #f) - (handlerkey #f) - (qrykey #f) - (data #f) - (start (current-milliseconds))) - -#;(defstruct dbowner - (pdat #f) - (last-update (current-seconds))) - -;;====================================================================== -;; Captain functions -;;====================================================================== - -;; NB// This needs to be started in a thread -;; -;; setup to be a captain -;; - local server MUST be started already -;; - create pkt -;; - start server port handler -;; -(define (setup-as-captain udata) - (if (create-captain-pkt udata) - (let* ((my-addr (udat-my-address udata)) - (my-port (udat-my-port udata)) - (th (make-thread (lambda () - (ulex-handler-loop udata)) "Captain handler"))) - (udat-handler-thread-set! udata th) - (udat-captain-address-set! udata my-addr) - (udat-captain-port-set! udata my-port) - (thread-start! 
th)) - (begin - (print "ERROR: failed to create captain pkt") - #f))) - -;; given a pkts dir read -;; -(define (get-all-captain-pkts udata) - (let* ((pktsdir (let ((d (udat-cpkts-dir udata))) - (if (file-exists? d) - d - (begin - (create-directory d #t) - d)))) - (all-pkt-files (glob (conc pktsdir "/*.pkt"))) - (pkt-spec (udat-cpkt-spec udata))) - (map (lambda (pkt-file) - (read-pkt->alist pkt-file pktspec: pkt-spec)) - all-pkt-files))) - -;; sort by D then Z, return one, choose the oldest then -;; differentiate if needed using the Z key -;;l -(define (get-winning-pkt pkts) - (if (null? pkts) - #f - (car (sort pkts (lambda (a b) - (let ((ad (string->number (alist-ref 'D a))) - (bd (string->number (alist-ref 'D b)))) - (if (eq? a b) - (let ((az (alist-ref 'Z a)) - (bz (alist-ref 'Z b))) - (string>=? az bz)) - (> ad bd)))))))) - -;; put the host, ip, port and pid into a pkt in -;; the captain pkts dir -;; - assumes user has already fired up a server -;; which will be in the udata struct -;; -(define (create-captain-pkt udata) - (if (not (udat-serv-listener udata)) - (begin - (print "ERROR: create-captain-pkt called with out a listener") - #f) - (let* ((pktdat `((port . ,(udat-my-port udata)) - (host . ,(udat-my-hostname udata)) - (ipaddr . ,(udat-my-address udata)) - (pid . ,(udat-my-pid udata)))) - (pktdir (udat-cpkts-dir udata)) - (pktspec (udat-cpkt-spec udata)) - ) - (udat-my-cpkt-key-set! - udata - (write-alist->pkt - pktdir - pktdat - pktspec: pktspec - ptype: 'captain)) - (udat-my-cpkt-key udata)))) - -;; remove pkt associated with captn (the Z key .pkt) -;; -(define (remove-captain-pkt udata captn) - (let ((Z (alist-ref 'Z captn)) - (cpktdir (udat-cpkts-dir udata))) - (delete-file* (conc cpktdir "/" Z ".pkt")))) - -;; call all known peers and tell them to delete their info on the captain -;; thus forcing them to re-read pkts and connect to a new captain -;; call this when the captain needs to exit and if an older captain is -;; detected. 
Due to delays in sending file meta data in NFS multiple -;; captains can be initiated in a "Storm of Captains", book soon to be -;; on Amazon -;; -(define (drop-captain udata) - (let* ((peers (hash-table-keys (udat-peers udata))) - (cookie (make-cookie udata))) - (for-each - (lambda (host-port) - (send udata host-port 'dropcaptain cookie "nomsg" retval: #t)) - peers))) - -;;====================================================================== -;; server primitives -;;====================================================================== - -(define (make-cookie udata) - (let ((newcnum (+ (udat-cnum udata) 1))) - (udat-cnum-set! udata newcnum) - (conc (udat-my-address udata) ":" - (udat-my-port udata) "-" - (udat-my-pid udata) "-" - newcnum))) - -;; create a tcp listener and return a populated udat struct with -;; my port, address, hostname, pid etc. -;; return #f if fail to find a port to allocate. -;; -;; if udata-in is #f create the record -;; if there is already a serv-listener return the udata -;; -(define (start-server-find-port udata-in #!optional (port 4242)) - (let ((udata (or udata-in (make-udat)))) - (if (udat-serv-listener udata) ;; TODO - add check that the listener is alive and ready? - udata - (handle-exceptions - exn - (if (< port 65535) - (start-server-find-port udata (+ port 1)) - #f) - (connect-server udata port))))) - -(define (connect-server udata port) - ;; (tcp-listener-socket LISTENER)(socket-name so) - ;; sockaddr-address, sockaddr-port, sockaddr->string - (let* ((tlsn (tcp-listen port 1000 #f)) ;; (tcp-listen TCPPORT [BACKLOG [HOST]]) - (addr (get-my-best-address))) ;; (hostinfo-addresses (host-information (current-hostname))) - (udat-my-address-set! udata addr) - (udat-my-port-set! udata port) - (udat-my-hostname-set! udata (get-host-name)) - (udat-serv-listener-set! 
udata tlsn) - udata)) - -(define (get-peer-dat udata host-port #!optional (hostname #f)(pid #f)) - (let* ((pdat (or (udat-get-peer udata host-port) - (handle-exceptions ;; ERROR - MAKE THIS EXCEPTION HANDLER MORE SPECIFIC - exn - #f - (let ((npdat (make-peer addr-port: host-port))) - (if hostname (peer-hostname-set! npdat hostname)) - (if pid (peer-pid-set! npdat pid)) - npdat))))) - pdat)) - -;; send structured data to recipient -;; -;; NOTE: qrykey is what was called the "cookie" previously -;; -;; retval tells send to expect and wait for return data (one line) and return it or time out -;; this is for ping where we don't want to necessarily have set up our own server yet. -;; -(define (send udata host-port handler qrykey data - #!key (hostname #f)(pid #f)(params '())(retval #f)) - (let* ((my-host-port (udat-my-host-port udata)) - (isme (equal? host-port my-host-port)) ;; am I calling - ;; myself? - (dat (list - handler ;; " " - my-host-port ;; " " - (udat-my-pid udata) ;; " " - qrykey - params ;;(if (null? params) "" (conc " " - ;;(string-intersperse params " "))) - ))) - ;; (print "send isme is " (if isme "true!" "false!") ", - ;; my-host-port: " my-host-port ", host-port: " host-port) - (if isme - (ulex-handler udata dat data) - (handle-exceptions ;; ERROR - MAKE THIS EXCEPTION HANDLER MORE - ;; SPECIFIC - exn - #f - (let-values (((inp oup)(tcp-connect host-port))) - ;; - ;; CONTROL LINE: - ;; handlerkey host:port pid qrykey params ... - ;; - (let ((res - (if (and inp oup) - (let* () - (if my-host-port - (begin - (write dat oup) - (write data oup) ;; send as sexpr - ;; (print "Sent dat: " dat " data: " data) - (if retval - (read inp) - #t)) - (begin - (print "ERROR: send called but no receiver has been setup. Please call setup first!") - #f)) - ;; NOTE: DO NOT BE TEMPTED TO LOOK AT ANY DATA ON INP HERE! 
- ;; (there is a listener for handling that) - ) - #f))) ;; #f means failed to connect and send - (close-input-port inp) - (close-output-port oup) - res)))))) - -;; send a request to the given host-port and register a mailbox in udata -;; wait for the mailbox data and return it -;; -(define (send-receive udata host-port handler qrykey data #!key (hostname #f)(pid #f)(params '())(timeout 20)) - (let ((mbox (make-mailbox)) - (mbox-time (current-milliseconds)) - (mboxes (udat-mboxes udata))) - (hash-table-set! mboxes qrykey mbox) - (if (send udata host-port handler qrykey data hostname: hostname pid: pid params: params) - (let* ((mbox-timeout-secs timeout) - (mbox-timeout-result 'MBOX_TIMEOUT) - (res (mailbox-receive! mbox mbox-timeout-secs mbox-timeout-result)) - (mbox-receive-time (current-milliseconds))) - (hash-table-delete! mboxes qrykey) - (if (eq? res 'MBOX_TIMEOUT) - #f - res)) - #f))) ;; #f means failed to communicate - -;; -(define (ulex-handler udata controldat data) - (print "controldat: " controldat " data: " data) - (match controldat ;; (string-split controldat) - ((handlerkey host-port pid qrykey params ...) - ;; (print "handlerkey: " handlerkey " host-port: " host-port " pid: " pid " qrykey: " qrykey " params: " params) - (case handlerkey ;; (string->symbol handlerkey) - ((ack)(print "Got ack!")) - ((ping) ;; special case - return result immediately on the same connection - (let* ((proc (hash-table-ref/default (udat-handlers udata) 'ping #f)) - (val (if proc (proc) "gotping")) - (peer (make-peer addr-port: host-port pid: pid)) - (dbshash (udat-dbowners udata))) - (peer-dbs-set! peer params) ;; params for ping is list of dbs owned by pinger - (for-each (lambda (dbfile) - (hash-table-set! dbshash dbfile host-port)) ;; WRONG? - params) ;; register each db in the dbshash - (if (not (hash-table-exists? (udat-peers udata) host-port)) - (hash-table-set! 
(udat-peers udata) host-port peer)) ;; save the details of this caller in peers - qrykey)) ;; End of ping - ((goodbye) - ;; remove all traces of the caller in db ownership etc. - (let* ((peer (hash-table-ref/default (udat-peers udata) host-port #f)) - (dbs (if peer (peer-dbs peer) '())) - (dbshash (udat-dbowners udata))) - (for-each (lambda (dbfile)(hash-table-delete! dbshash dbfile)) dbs) - (hash-table-delete! (udat-peers udata) host-port) - qrykey)) - ((dropcaptain) - ;; remove all traces of the captain - (udat-captain-address-set! udata #f) - (udat-captain-host-set! udata #f) - (udat-captain-port-set! udata #f) - (udat-captain-pid-set! udata #f) - qrykey) - ((rucaptain) ;; remote is asking if I'm the captain - (if (udat-my-cpkt-key udata) "yes" "no")) - ((db-owner) ;; given a db name who do I send my queries to - ;; look up the file in handlers, if have an entry ping them to be sure - ;; they are still alive and then return that host:port. - ;; if no handler found or if the ping fails pick from peers the oldest that - ;; is managing the fewest dbs - (match params - ((dbfile dbtype) - (let* ((owner-host-port (hash-table-ref/default (udat-dbowners udata) dbfile #f))) - (if owner-host-port - (conc qrykey " " owner-host-port) - (let* ((pdat (or (hash-table-ref/default (udat-peers udata) host-port #f) ;; no owner - caller gets to own it! - (make-peer addr-port: host-port pid: pid dbs: `(,dbfile))))) - (hash-table-set! (udat-peers udata) host-port pdat) - (hash-table-set! (udat-dbowners udata) dbfile host-port) - (conc qrykey " " host-port))))) - (else (conc qrykey " BADDATA")))) - ;; for work items: - ;; handler is one of; immediate, read-only, read-write, high-priority - ((immediate read-only normal low-priority) ;; do this work immediately - ;; host-port (caller), pid (caller), qrykey (cookie), params <= all from first line - ;; data => a single line encoded however you want, or should I build json into it? 
- (print "handlerkey=" handlerkey) - (let* ((pdat (get-peer-dat udata host-port))) - (match params ;; dbfile prockey procparam - ((dbfile prockey procparam) - (case handlerkey - ((immediate read-only) - (process-request udata pdat dbfile qrykey prockey procparam data)) - ((normal low-priority) ;; split off later and add logic to support low priority - (add-to-work-queue udata pdat dbfile qrykey prockey procparam data)) - (else - #f))) - (else - (print "INFO: params=" params " handlerkey=" handlerkey " controldat=" controldat) - #f)))) - (else - ;; (add-to-work-queue udata (get-peer-dat udata host-port) handlerkey qrykey data) - #f))) - (else - (print "BAD DATA? controldat=" controldat " data=" data) - #f)));; handles the incoming messages and dispatches to queues - -;; -(define (ulex-handler-loop udata) - (let* ((serv-listener (udat-serv-listener udata))) - ;; data comes as two lines - ;; handlerkey resp-addr:resp-port hostname pid qrykey [dbpath/dbfile.db] - ;; data - (let loop ((state 'start)) - (let-values (((inp oup)(tcp-accept serv-listener))) - (let* ((controldat (read inp)) - (data (read inp)) - (resp (ulex-handler udata controldat data))) - (if resp (write resp oup)) - (close-input-port inp) - (close-output-port oup)) - (loop state))))) - -;; add a proc to the handler list, these are done symetrically (i.e. in all instances) -;; so that the proc can be dereferenced remotely -;; -(define (register-handler udata key proc) - (hash-table-set! (udat-handlers udata) key proc)) - - -;;====================================================================== -;; work queues -;;====================================================================== - -(define (add-to-work-queue udata peer-dat handlerkey qrykey data) - (let ((wdat (make-work peer-dat: peer-dat handlerkey: handlerkey qrykey: qrykey data: data))) - (if (udat-busy udata) - (queue-add! 
(udat-work-queue udata) wdat) - (process-work udata wdat)) ;; passing in wdat tells process-work to first process the passed in wdat - )) - -(define (do-work udata wdat) - #f) - -(define (process-work udata #!optional wdat) - (if wdat (do-work udata wdat)) ;; process wdat - (let ((wqueue (udat-work-queue udata))) - (if (not (queue-empty? wqueue)) - (let loop ((wd (queue-remove! wqueue))) - (do-work udata wd) - (if (not (queue-empty? wqueue)) - (loop (queue-remove! wqueue))))))) - -;;====================================================================== -;; Generic db handling -;; setup a inmem db instance -;; open connection to on-disk db -;; sync on-disk db to inmem -;; get lock in on-disk db for dbowner of this db -;; put sync-proc, init-proc, on-disk handle, inmem handle in dbconn stuct -;; return the stuct -;;====================================================================== - -(defstruct dbconn - (fname #f) - (inmem #f) - (conn #f) - (sync #f) ;; sync proc - (init #f) ;; init proc - (lastsync (current-seconds)) - ) - -(defstruct dbinfo - (initproc #f) - (syncproc #f)) - -;; open inmem and disk database -;; init with initproc -;; return db struct -;; -;; appname; megatest, ulex or something else. -;; -(define (setup-db-connection udata fname-in appname dbtype) - (let* ((is-ulex (eq? appname 'ulex)) - (dbinf (if is-ulex ;; ulex is a built-in special case - (make-dbinfo initproc: ulexdb-init syncproc: ulexdb-sync) - (hash-table-ref/default (udat-dbtypes udata) dbtype #f))) - (initproc (dbinfo-initproc dbinf)) - (syncproc (dbinfo-syncproc dbinf)) - (fname (if is-ulex - (conc (udat-ulex-dir udata) "/ulex.db") - fname-in)) - (inmem-db (open-and-initdb udata #f 'inmem (dbinfo-initproc dbinf))) - (disk-db (open-and-initdb udata fname 'disk (dbinfo-initproc dbinf)))) - (make-dbconn inmem: inmem-db conn: disk-db sync: syncproc init: initproc))) - -;; dest='inmem or 'disk -;; -(define (open-and-initdb udata filename dest init-proc) - (let* ((inmem (eq? 
dest 'inmem)) - (dbfile (if inmem - ":INMEM:" - filename)) - (dbexists (if inmem #t (file-exists? dbfile))) - (db (sqlite3:open-database dbfile))) - (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)) - (if (not dbexists) - (init-proc db)) - db)) - - -;;====================================================================== -;; Previous Ulex db stuff -;;====================================================================== - -(define (ulexdb-init db inmem) - (sqlite3:with-transaction - db - (lambda () - (for-each - (lambda (stmt) - (if stmt (sqlite3:execute db stmt))) - `("CREATE TABLE IF NOT EXISTS processes - (id INTEGER PRIMARY KEY, - host TEXT NOT NULL, - ipadr TEXT NOT NULL, - port INTEGER NOT NULL, - pid INTEGER NOT NULL, - regtime INTEGER DEFAULT (strftime('%s','now')), - last_update INTEGER DEFAULT (strftime('%s','now')));" - (if inmem - "CREATE TRIGGER IF NOT EXISTS update_proces_trigger AFTER UPDATE ON processes - FOR EACH ROW - BEGIN - UPDATE processes SET last_update=(strftime('%s','now')) - WHERE id=old.id; - END;" - #f)))))) - -;; open databases, do initial sync -(define (ulexdb-sync dbconndat udata) - #f) - - -) ;; END OF ULEX - - -;;; ;;====================================================================== -;;; ;; D E B U G H E L P E R S -;;; ;;====================================================================== -;;; -;;; (define (dbg> . args) -;;; (with-output-to-port (current-error-port) -;;; (lambda () -;;; (apply print "dbg> " args)))) -;;; -;;; (define (debug-pp . args) -;;; (if (get-environment-variable "ULEX_DEBUG") -;;; (with-output-to-port (current-error-port) -;;; (lambda () -;;; (apply pp args))))) -;;; -;;; (define *default-debug-port* (current-error-port)) -;;; -;;; (define (sdbg> fn stage-name stage-start stage-end start-time . 
message) -;;; (if (get-environment-variable "ULEX_DEBUG") -;;; (with-output-to-port *default-debug-port* -;;; (lambda () -;;; (apply print "ulex:" fn " " stage-name " took " (- (if stage-end stage-end (current-milliseconds)) stage-start) " ms. " -;;; (if start-time -;;; (conc "total time " (- (current-milliseconds) start-time) -;;; " ms.") -;;; "") -;;; message -;;; ))))) - -;;====================================================================== -;; M A C R O S -;;====================================================================== -;; iup callbacks are not dumping the stack, this is a work-around -;; - -;; Some of these routines use: -;; -;; http://www.cs.toronto.edu/~gfb/scheme/simple-macros.html -;; -;; Syntax for defining macros in a simple style similar to function definiton, -;; when there is a single pattern for the argument list and there are no keywords. -;; -;; (define-simple-syntax (name arg ...) body ...) -;; -;; -;; (define-syntax define-simple-syntax -;; (syntax-rules () -;; ((_ (name arg ...) body ...) -;; (define-syntax name (syntax-rules () ((name arg ...) 
(begin body ...))))))) -;; -;; (define-simple-syntax (catch-and-dump proc procname) -;; (handle-exceptions -;; exn -;; (begin -;; (print-call-chain (current-error-port)) -;; (with-output-to-port (current-error-port) -;; (lambda () -;; (print ((condition-property-accessor 'exn 'message) exn)) -;; (print "Callback error in " procname) -;; (print "Full condition info:\n" (condition->list exn))))) -;; (proc))) -;; -;; -;;====================================================================== -;; R E C O R D S -;;====================================================================== - -;;; ;; information about me as a server -;;; ;; -;;; (defstruct area -;;; ;; about this area -;;; (useportlogger #f) -;;; (lowport 32768) -;;; (server-type 'auto) ;; auto=create up to five servers/pkts, main=create pkts, passive=no pkt (unless there are no pkts at all) -;;; (conn #f) -;;; (port #f) -;;; (myaddr (get-my-best-address)) -;;; pktid ;; get pkt from hosts table if needed -;;; pktfile -;;; pktsdir -;;; dbdir -;;; (dbhandles (make-hash-table)) ;; fname => list-of-dbh, NOTE: Should really never need more than one? -;;; (mutex (make-mutex)) -;;; (rtable (make-hash-table)) ;; registration table of available actions -;;; (dbs (make-hash-table)) ;; filename => random number, used for choosing what dbs I serve -;;; ;; about other servers -;;; (hosts (make-hash-table)) ;; key => hostdat -;;; (hoststats (make-hash-table)) ;; key => alist of fname => ( qcount . 
qtime ) -;;; (reqs (make-hash-table)) ;; uri => queue -;;; ;; work queues -;;; (wqueues (make-hash-table)) ;; fname => qdat -;;; (stats (make-hash-table)) ;; fname => totalqueries -;;; (last-srvup (current-seconds)) ;; last time we updated the known servers -;;; (cookie2mbox (make-hash-table)) ;; map cookie for outstanding request to mailbox of awaiting call -;;; (ready #f) -;;; (health (make-hash-table)) ;; ipaddr:port => num failed pings since last good ping -;;; ) -;;; -;;; ;; host stats -;;; ;; -;;; (defstruct hostdat -;;; (pkt #f) -;;; (dbload (make-hash-table)) ;; "dbfile.db" => queries/min -;;; (hostload #f) ;; normalized load ( 5min load / numcpus ) -;;; ) -;;; -;;; ;; dbdat -;;; ;; -;;; (defstruct dbdat -;;; (dbh #f) -;;; (fname #f) -;;; (write-access #f) -;;; (sths (make-hash-table)) ;; hash mapping query strings to handles -;;; ) -;;; -;;; ;; qdat -;;; ;; -;;; (defstruct qdat -;;; (writeq (make-queue)) -;;; (readq (make-queue)) -;;; (rwq (make-queue)) -;;; (logq (make-queue)) ;; do we need a queue for logging? yes, if we use sqlite3 db for logging -;;; (osshort (make-queue)) -;;; (oslong (make-queue)) -;;; (misc (make-queue)) ;; used for things like ping-full -;;; ) -;;; -;;; ;; calldat -;;; ;; -;;; (defstruct calldat -;;; (ctype 'dbwrite) -;;; (obj #f) ;; this would normally be an SQL statement e.g. SELECT, INSERT etc. -;;; (rtime (current-milliseconds))) -;;; -;;; ;; make it a global? Well, it is local to area module -;;; -;;; (define *pktspec* -;;; `((server (hostname . h) -;;; (port . p) -;;; (pid . i) -;;; (ipaddr . a) -;;; ) -;;; (data (hostname . h) ;; sender hostname -;;; (port . p) ;; sender port -;;; (ipaddr . a) ;; sender ip -;;; (hostkey . k) ;; sending host key - store info at server under this key -;;; (servkey . s) ;; server key - this needs to match at server end or reject the msg -;;; (format . f) ;; sb=serialized-base64, t=text, sx=sexpr, j=json -;;; (data . 
d) ;; base64 encoded slln data -;;; ))) -;;; -;;; ;; work item -;;; ;; -;;; (defstruct witem -;;; (rhost #f) ;; return host -;;; (ripaddr #f) ;; return ipaddr -;;; (rport #f) ;; return port -;;; (servkey #f) ;; the packet representing the client of this workitem, used by final send-message -;;; (rdat #f) ;; the request - usually an sql query, type is rdat -;;; (action #f) ;; the action: immediate, dbwrite, dbread,oslong, osshort -;;; (cookie #f) ;; cookie id for response -;;; (data #f) ;; the data payload, i.e. parameters -;;; (result #f) ;; the result from processing the data -;;; (caller #f)) ;; the calling peer according to rpc itself -;;; -;;; (define (trim-pktid pktid) -;;; (if (string? pktid) -;;; (substring pktid 0 4) -;;; "nopkt")) -;;; -;;; (define (any->number num) -;;; (cond -;;; ((number? num) num) -;;; ((string? num) (string->number num)) -;;; (else num))) -;;; -;;; (use trace) -;;; (trace-call-sites #t) -;;; -;;; ;;====================================================================== -;;; ;; D A T A B A S E H A N D L I N G -;;; ;;====================================================================== -;;; -;;; ;; look in dbhandles for a db, return it, else return #f -;;; ;; -;;; (define (get-dbh acfg fname) -;;; (let ((dbh-lst (hash-table-ref/default (area-dbhandles acfg) fname '()))) -;;; (if (null? dbh-lst) -;;; (begin -;;; ;; (print "opening db for " fname) -;;; (open-db acfg fname)) ;; Note that the handles get put back in the queue in the save-dbh calls -;;; (let ((rem-lst (cdr dbh-lst))) -;;; ;; (print "re-using saved connection for " fname) -;;; (hash-table-set! (area-dbhandles acfg) fname rem-lst) -;;; (car dbh-lst))))) -;;; -;;; (define (save-dbh acfg fname dbdat) -;;; ;; (print "saving dbh for " fname) -;;; (hash-table-set! (area-dbhandles acfg) fname (cons dbdat (hash-table-ref/default (area-dbhandles acfg) fname '())))) -;;; -;;; ;; open the database, if never before opened init it. 
put the handle in the -;;; ;; open db's hash table -;;; ;; returns: the dbdat -;;; ;; -;;; (define (open-db acfg fname) -;;; (let* ((fullname (conc (area-dbdir acfg) "/" fname)) -;;; (exists (file-exists? fullname)) -;;; (write-access (if exists -;;; (file-write-access? fullname) -;;; (file-write-access? (area-dbdir acfg)))) -;;; (db (sqlite3:open-database fullname)) -;;; (handler (sqlite3:make-busy-timeout 136000)) -;;; ) -;;; (sqlite3:set-busy-handler! db handler) -;;; (sqlite3:execute db "PRAGMA synchronous = 0;") -;;; (if (not exists) ;; need to init the db -;;; (if write-access -;;; (let ((isql (get-rsql acfg 'dbinitsql))) ;; get the init sql statements -;;; ;; (sqlite3:with-transaction -;;; ;; db -;;; ;; (lambda () -;;; (if isql -;;; (for-each -;;; (lambda (sql) -;;; (sqlite3:execute db sql)) -;;; isql))) -;;; (print "ERROR: no write access to " (area-dbdir acfg)))) -;;; (make-dbdat dbh: db fname: fname write-access: write-access))) -;;; -;;; ;; This is a low-level command to retrieve or to prepare, save and return a prepared statment -;;; ;; you must extract the db handle -;;; ;; -;;; (define (get-sth db cache stmt) -;;; (if (hash-table-exists? cache stmt) -;;; (begin -;;; ;; (print "Reusing cached stmt for " stmt) -;;; (hash-table-ref/default cache stmt #f)) -;;; (let ((sth (sqlite3:prepare db stmt))) -;;; (hash-table-set! cache stmt sth) -;;; ;; (print "prepared stmt for " stmt) -;;; sth))) -;;; -;;; ;; a little more expensive but does all the tedious deferencing - only use if you don't already -;;; ;; have dbdat and db sitting around -;;; ;; -;;; (define (full-get-sth acfg fname stmt) -;;; (let* ((dbdat (get-dbh acfg fname)) -;;; (db (dbdat-dbh dbdat)) -;;; (sths (dbdat-sths dbdat))) -;;; (get-sth db sths stmt))) -;;; -;;; ;; write to a db -;;; ;; acfg: area data -;;; ;; rdat: request data -;;; ;; hdat: (host . 
port) -;;; ;; -;;; ;; (define (dbwrite acfg rdat hdat data-in) -;;; ;; (let* ((dbname (car data-in)) -;;; ;; (dbdat (get-dbh acfg dbname)) -;;; ;; (db (dbdat-dbh dbdat)) -;;; ;; (sths (dbdat-sths dbdat)) -;;; ;; (stmt (calldat-obj rdat)) -;;; ;; (sth (get-sth db sths stmt)) -;;; ;; (data (cdr data-in))) -;;; ;; (print "dbname: " dbname " acfg: " acfg " rdat: " (calldat->alist rdat) " hdat: " hdat " data: " data) -;;; ;; (print "dbdat: " (dbdat->alist dbdat)) -;;; ;; (apply sqlite3:execute sth data) -;;; ;; (save-dbh acfg dbname dbdat) -;;; ;; #t -;;; ;; )) -;;; -;;; (define (finalize-all-db-handles acfg) -;;; (let* ((dbhandles (area-dbhandles acfg)) ;; dbhandles is hash of fname ==> dbdat -;;; (num 0)) -;;; (for-each -;;; (lambda (area-name) -;;; (print "Closing handles for " area-name) -;;; (let ((dbdats (hash-table-ref/default dbhandles area-name '()))) -;;; (for-each -;;; (lambda (dbdat) -;;; ;; first close all statement handles -;;; (for-each -;;; (lambda (sth) -;;; (sqlite3:finalize! sth) -;;; (set! num (+ num 1))) -;;; (hash-table-values (dbdat-sths dbdat))) -;;; ;; now close the dbh -;;; (set! num (+ num 1)) -;;; (sqlite3:finalize! (dbdat-dbh dbdat))) -;;; dbdats))) -;;; (hash-table-keys dbhandles)) -;;; (print "FINALIZED " num " dbhandles"))) -;;; -;;; ;;====================================================================== -;;; ;; W O R K Q U E U E H A N D L I N G -;;; ;;====================================================================== -;;; -;;; (define (register-db-as-mine acfg dbname) -;;; (let ((ht (area-dbs acfg))) -;;; (if (not (hash-table-ref/default ht dbname #f)) -;;; (hash-table-set! ht dbname (random 10000))))) -;;; -;;; (define (work-queue-add acfg fname witem) -;;; (let* ((work-queue-start (current-milliseconds)) -;;; (action (witem-action witem)) ;; NB the action is the index into the rdat actions -;;; (qdat (or (hash-table-ref/default (area-wqueues acfg) fname #f) -;;; (let ((newqdat (make-qdat))) -;;; (hash-table-set! 
(area-wqueues acfg) fname newqdat) -;;; newqdat))) -;;; (rdat (hash-table-ref/default (area-rtable acfg) action #f))) -;;; (if rdat -;;; (queue-add! -;;; (case (calldat-ctype rdat) -;;; ((dbwrite) (register-db-as-mine acfg fname)(qdat-writeq qdat)) -;;; ((dbread) (register-db-as-mine acfg fname)(qdat-readq qdat)) -;;; ((dbrw) (register-db-as-mine acfg fname)(qdat-rwq qdat)) -;;; ((oslong) (qdat-oslong qdat)) -;;; ((osshort) (qdat-osshort qdat)) -;;; ((full-ping) (qdat-misc qdat)) -;;; (else -;;; (print "ERROR: no queue for " action ". Adding to dbwrite queue.") -;;; (qdat-writeq qdat))) -;;; witem) -;;; (case action -;;; ((full-ping)(qdat-misc qdat)) -;;; (else -;;; (print "ERROR: No action " action " was registered")))) -;;; (sdbg> "work-queue-add" "queue-add" work-queue-start #f #f) -;;; #t)) ;; for now, simply return #t to indicate request got to the queue -;;; -;;; (define (doqueue acfg q fname dbdat dbh) -;;; ;; (print "doqueue: " fname) -;;; (let* ((start-time (current-milliseconds)) -;;; (qlen (queue-length q))) -;;; (if (> qlen 1) -;;; (print "Processing queue of length " qlen)) -;;; (let loop ((count 0) -;;; (responses '())) -;;; (let ((delta (- (current-milliseconds) start-time))) -;;; (if (or (queue-empty? q) -;;; (> delta 400)) ;; stop working on this queue after 400ms have passed -;;; (list count delta responses) ;; return count, delta and responses list -;;; (let* ((witem (queue-remove! q)) -;;; (action (witem-action witem)) -;;; (rdat (witem-rdat witem)) -;;; (stmt (calldat-obj rdat)) -;;; (sth (full-get-sth acfg fname stmt)) -;;; (ctype (calldat-ctype rdat)) -;;; (data (witem-data witem)) -;;; (cookie (witem-cookie witem))) -;;; ;; do the processing and save the result in witem-result -;;; (witem-result-set! 
-;;; witem -;;; (case ctype ;; action -;;; ((noblockwrite) ;; blind write, no ack of success returned -;;; (apply sqlite3:execute sth data) -;;; (sqlite3:last-insert-rowid dbh)) -;;; ((dbwrite) ;; blocking write -;;; (apply sqlite3:execute sth data) -;;; #t) -;;; ((dbread) ;; TODO: consider breaking this up and shipping in pieces for large query -;;; (apply sqlite3:map-row (lambda x x) sth data)) -;;; ((full-ping) 'full-ping) -;;; (else (print "Not ready for action " action) #f))) -;;; (loop (add1 count) -;;; (if cookie -;;; (cons witem responses) -;;; responses)))))))) -;;; -;;; ;; do up to 400ms of processing on each queue -;;; ;; - the work-queue-processor will allow the max 1200ms of work to complete but it will flag as overloaded -;;; ;; -;;; (define (process-db-queries acfg fname) -;;; (if (hash-table-exists? (area-wqueues acfg) fname) -;;; (let* ((process-db-queries-start-time (current-milliseconds)) -;;; (qdat (hash-table-ref/default (area-wqueues acfg) fname #f)) -;;; (queue-sym->queue (lambda (queue-sym) -;;; (case queue-sym ;; lookup the queue from qdat given a name (symbol) -;;; ((wqueue) (qdat-writeq qdat)) -;;; ((rqueue) (qdat-readq qdat)) -;;; ((rwqueue) (qdat-rwq qdat)) -;;; ((misc) (qdat-misc qdat)) -;;; (else #f)))) -;;; (dbdat (get-dbh acfg fname)) -;;; (dbh (if (dbdat? dbdat)(dbdat-dbh dbdat) #f)) -;;; (nowtime (current-seconds))) -;;; ;; handle the queues that require a transaction -;;; ;; -;;; (map ;; -;;; (lambda (queue-sym) -;;; ;; (print "processing queue " queue-sym) -;;; (let* ((queue (queue-sym->queue queue-sym))) -;;; (if (not (queue-empty? queue)) -;;; (let ((responses -;;; (sqlite3:with-transaction ;; todo - catch exceptions... -;;; dbh -;;; (lambda () -;;; (let* ((res (doqueue acfg queue fname dbdat dbh))) ;; this does the work! 
-;;; ;; (print "res=" res) -;;; (match res -;;; ((count delta responses) -;;; (update-stats acfg fname queue-sym delta count) -;;; (sdbg> "process-db-queries" "sqlite3-transaction" process-db-queries-start-time #f #f) -;;; responses) ;; return responses -;;; (else -;;; (print "ERROR: bad return data from doqueue " res))) -;;; ))))) -;;; ;; having completed the transaction, send the responses. -;;; ;; (print "INFO: sending " (length responses) " responses.") -;;; (let loop ((responses-left responses)) -;;; (cond -;;; ((null? responses-left) #t) -;;; (else -;;; (let* ((witem (car responses-left)) -;;; (response (cdr responses-left))) -;;; (call-deliver-response acfg (witem-ripaddr witem)(witem-rport witem) -;;; (witem-cookie witem)(witem-result witem))) -;;; (loop (cdr responses-left)))))) -;;; ))) -;;; '(wqueue rwqueue rqueue)) -;;; -;;; ;; handle misc queue -;;; ;; -;;; ;; (print "processing misc queue") -;;; (let ((queue (queue-sym->queue 'misc))) -;;; (doqueue acfg queue fname dbdat dbh)) -;;; ;; .... -;;; (save-dbh acfg fname dbdat) -;;; #t ;; just to let the tests know we got here -;;; ) -;;; #f ;; nothing processed -;;; )) -;;; -;;; ;; run all queues in parallel per db but sequentially per queue for that db. -;;; ;; - process the queues every 500 or so ms -;;; ;; - allow for long running queries to continue but all other activities for that -;;; ;; db will be blocked. -;;; ;; -;;; (define (work-queue-processor acfg) -;;; (let* ((threads (make-hash-table))) ;; fname => thread -;;; (let loop ((fnames (hash-table-keys (area-wqueues acfg))) -;;; (target-time (+ (current-milliseconds) 50))) -;;; ;;(if (not (null? 
fnames))(print "Processing for these databases: " fnames)) -;;; (for-each -;;; (lambda (fname) -;;; ;; (print "processing for " fname) -;;; ;;(process-db-queries acfg fname)) -;;; (let ((th (hash-table-ref/default threads fname #f))) -;;; (if (and th (not (member (thread-state th) '(dead terminated)))) -;;; (begin -;;; (print "WARNING: worker thread for " fname " is taking a long time.") -;;; (print "Thread is in state " (thread-state th))) -;;; (let ((th1 (make-thread (lambda () -;;; ;; (catch-and-dump -;;; ;; (lambda () -;;; ;; (print "Process queries for " fname) -;;; (let ((start-time (current-milliseconds))) -;;; (process-db-queries acfg fname) -;;; ;; (thread-sleep! 0.01) ;; need the thread to take at least some time -;;; (hash-table-delete! threads fname)) ;; no mutexes? -;;; fname) -;;; "th1"))) ;; )) -;;; (hash-table-set! threads fname th1) -;;; (thread-start! th1))))) -;;; fnames) -;;; ;; (thread-sleep! 0.1) ;; give the threads some time to process requests -;;; ;; burn time until 400ms is up -;;; (let ((now-time (current-milliseconds))) -;;; (if (< now-time target-time) -;;; (let ((delta (- target-time now-time))) -;;; (thread-sleep! (/ delta 1000))))) -;;; (loop (hash-table-keys (area-wqueues acfg)) -;;; (+ (current-milliseconds) 50))))) -;;; -;;; ;;====================================================================== -;;; ;; S T A T S G A T H E R I N G -;;; ;;====================================================================== -;;; -;;; (defstruct stat -;;; (qcount-avg 0) ;; coarse running average -;;; (qtime-avg 0) ;; coarse running average -;;; (qcount 0) ;; total -;;; (qtime 0) ;; total -;;; (last-qcount 0) ;; last -;;; (last-qtime 0) ;; last -;;; (dbs '()) ;; list of db files handled by this node -;;; (when 0)) ;; when the last query happened - seconds -;;; -;;; -;;; (define (update-stats acfg fname bucket duration numqueries) -;;; (let* ((key fname) ;; for now do not use bucket. 
Was: (conc fname "-" bucket)) ;; lazy but good enough -;;; (stats (or (hash-table-ref/default (area-stats acfg) key #f) -;;; (let ((newstats (make-stat))) -;;; (hash-table-set! (area-stats acfg) key newstats) -;;; newstats)))) -;;; ;; when the last query happended (used to remove the fname from the active list) -;;; (stat-when-set! stats (current-seconds)) -;;; ;; last values -;;; (stat-last-qcount-set! stats numqueries) -;;; (stat-last-qtime-set! stats duration) -;;; ;; total over process lifetime -;;; (stat-qcount-set! stats (+ (stat-qcount stats) numqueries)) -;;; (stat-qtime-set! stats (+ (stat-qtime stats) duration)) -;;; ;; coarse average -;;; (stat-qcount-avg-set! stats (/ (+ (stat-qcount-avg stats) numqueries) 2)) -;;; (stat-qtime-avg-set! stats (/ (+ (stat-qtime-avg stats) duration) 2)) -;;; -;;; ;; here is where we add the stats for a given dbfile -;;; (if (not (member fname (stat-dbs stats))) -;;; (stat-dbs-set! stats (cons fname (stat-dbs stats)))) -;;; -;;; )) -;;; -;;; ;;====================================================================== -;;; ;; S E R V E R S T U F F -;;; ;;====================================================================== -;;; -;;; ;; this does NOT return! -;;; ;; -;;; (define (find-free-port-and-open acfg) -;;; (let ((port (or (area-port acfg) 3200))) -;;; (handle-exceptions -;;; exn -;;; (begin -;;; (print "INFO: cannot bind to port " (rpc:default-server-port) ", trying next port") -;;; (area-port-set! acfg (+ port 1)) -;;; (find-free-port-and-open acfg)) -;;; (rpc:default-server-port port) -;;; (area-port-set! acfg port) -;;; (tcp-read-timeout 120000) -;;; ;; ((rpc:make-server (tcp-listen port)) #t) -;;; (tcp-listen (rpc:default-server-port) -;;; )))) -;;; -;;; ;; register this node by putting a packet into the pkts dir. 
-;;; ;; look for other servers -;;; ;; contact other servers and compile list of servers -;;; ;; there are two types of server -;;; ;; main servers - dashboards, runners and dedicated servers - need pkt -;;; ;; passive servers - test executers, step calls, list-runs - no pkt -;;; ;; -;;; (define (register-node acfg hostip port-num) -;;; ;;(mutex-lock! (area-mutex acfg)) -;;; (let* ((server-type (area-server-type acfg)) ;; auto, main, passive (no pkt created) -;;; (best-ip (or hostip (get-my-best-address))) -;;; (mtdir (area-dbdir acfg)) -;;; (pktdir (area-pktsdir acfg))) ;; conc mtdir "/.server-pkts"))) -;;; (print "Registering node " best-ip ":" port-num) -;;; (if (not mtdir) ;; require a home for this node to put or find databases -;;; #f -;;; (begin -;;; (if (not (directory? pktdir))(create-directory pktdir)) -;;; ;; server is started, now create pkt if needed -;;; (print "Starting server in " server-type " mode with port " port-num) -;;; (if (member server-type '(auto main)) ;; TODO: if auto, count number of servers registers, if > 3 then don't put out a pkt -;;; (begin -;;; (area-pktid-set! acfg -;;; (write-alist->pkt -;;; pktdir -;;; `((hostname . ,(get-host-name)) -;;; (ipaddr . ,best-ip) -;;; (port . ,port-num) -;;; (pid . ,(current-process-id))) -;;; pktspec: *pktspec* -;;; ptype: 'server)) -;;; (area-pktfile-set! acfg (conc pktdir "/" (area-pktid acfg) ".pkt")))) -;;; (area-port-set! acfg port-num) -;;; #;(mutex-unlock! (area-mutex acfg)))))) -;;; -;;; (define *cookie-seqnum* 0) -;;; (define (make-cookie key) -;;; (set! *cookie-seqnum* (add1 *cookie-seqnum*)) -;;; ;;(print "MAKE COOKIE CALLED -- on "servkey"-"*cookie-seqnum*) -;;; (conc key "-" *cookie-seqnum*) -;;; ) -;;; -;;; ;; dispatch locally if possible -;;; ;; -;;; (define (call-deliver-response acfg ipaddr port cookie data) -;;; (if (and (equal? (area-myaddr acfg) ipaddr) -;;; (equal? 
(area-port acfg) port)) -;;; (deliver-response acfg cookie data) -;;; ((rpc:procedure 'response ipaddr port) cookie data))) -;;; -;;; (define (deliver-response acfg cookie data) -;;; (let ((deliver-response-start (current-milliseconds))) -;;; (thread-start! (make-thread -;;; (lambda () -;;; (let loop ((tries-left 5)) -;;; ;;(print "TOP OF DELIVER_RESPONSE LOOP; triesleft="tries-left) -;;; ;;(pp (hash-table->alist (area-cookie2mbox acfg))) -;;; (let* ((mbox (hash-table-ref/default (area-cookie2mbox acfg) cookie #f))) -;;; (cond -;;; ((eq? 0 tries-left) -;;; (print "ulex:deliver-response: I give up. Mailbox never appeared. cookie="cookie) -;;; ) -;;; (mbox -;;; ;;(print "got mbox="mbox" got data="data" send.") -;;; (mailbox-send! mbox data)) -;;; (else -;;; ;;(print "no mbox yet. look for "cookie) -;;; (thread-sleep! (/ (- 6 tries-left) 10)) -;;; (loop (sub1 tries-left)))))) -;;; ;; (debug-pp (list (conc "ulex:deliver-response took " (- (current-milliseconds) deliver-response-start) " ms, cookie=" cookie " data=") data)) -;;; (sdbg> "deliver-response" "mailbox-send" deliver-response-start #f #f cookie) -;;; ) -;;; (conc "deliver-response thread for cookie="cookie)))) -;;; #t) -;;; -;;; ;; action: -;;; ;; immediate - quick actions, no need to put in queues -;;; ;; dbwrite - put in dbwrite queue -;;; ;; dbread - put in dbread queue -;;; ;; oslong - os actions, e.g. du, that could take a long time -;;; ;; osshort - os actions that should be quick, e.g. 
df -;;; ;; -;;; (define (request acfg from-ipaddr from-port servkey action cookie fname params) ;; std-peer-handler -;;; ;; NOTE: Use rpc:current-peer for getting return address -;;; (let* ((std-peer-handler-start (current-milliseconds)) -;;; ;; (raw-data (alist-ref 'data dat)) -;;; (rdat (hash-table-ref/default -;;; (area-rtable acfg) action #f)) ;; this looks up the sql query or other details indexed by the action -;;; (witem (make-witem ripaddr: from-ipaddr ;; rhost: from-host -;;; rport: from-port action: action -;;; rdat: rdat cookie: cookie -;;; servkey: servkey data: params ;; TODO - rename data to params -;;; caller: (rpc:current-peer)))) -;;; (if (not (equal? servkey (area-pktid acfg))) -;;; `(#f . ,(conc "I don't know you servkey=" servkey ", pktid=" (area-pktid acfg))) ;; immediately return this -;;; (let* ((ctype (if rdat -;;; (calldat-ctype rdat) ;; is this necessary? these should be identical -;;; action))) -;;; (sdbg> "std-peer-handler" "immediate" std-peer-handler-start #f #f) -;;; (case ctype -;;; ;; (dbwrite acfg rdat (cons from-ipaddr from-port) data))) -;;; ((full-ping) `(#t "ack to full ping" ,(work-queue-add acfg fname witem) ,cookie)) -;;; ((response) `(#t "ack from requestor" ,(deliver-response acfg fname params))) -;;; ((dbwrite) `(#t "db write submitted" ,(work-queue-add acfg fname witem) ,cookie)) -;;; ((dbread) `(#t "db read submitted" ,(work-queue-add acfg fname witem) ,cookie )) -;;; ((dbrw) `(#t "db read/write submitted" ,cookie)) -;;; ((osshort) `(#t "os short submitted" ,cookie)) -;;; ((oslong) `(#t "os long submitted" ,cookie)) -;;; (else `(#f "unrecognised action" ,ctype))))))) -;;; -;;; ;; Call this to start the actual server -;;; ;; -;;; ;; start_server -;;; ;; -;;; ;; mode: ' -;;; ;; handler: proc which takes pktrecieved as argument -;;; ;; -;;; -;;; (define (start-server acfg) -;;; (let* ((conn (find-free-port-and-open acfg)) -;;; (port (area-port acfg))) -;;; (rpc:publish-procedure! 
-;;; 'delist-db -;;; (lambda (fname) -;;; (hash-table-delete! (area-dbs acfg) fname))) -;;; (rpc:publish-procedure! -;;; 'calling-addr -;;; (lambda () -;;; (rpc:current-peer))) -;;; (rpc:publish-procedure! -;;; 'ping -;;; (lambda ()(real-ping acfg))) -;;; (rpc:publish-procedure! -;;; 'request -;;; (lambda (from-addr from-port servkey action cookie dbname params) -;;; (request acfg from-addr from-port servkey action cookie dbname params))) -;;; (rpc:publish-procedure! -;;; 'response -;;; (lambda (cookie res-dat) -;;; (deliver-response acfg cookie res-dat))) -;;; (area-ready-set! acfg #t) -;;; (area-conn-set! acfg conn) -;;; ((rpc:make-server conn) #f)));; ((tcp-listen (rpc:default-server-port)) #t) -;;; -;;; -;;; (define (launch acfg) ;; #!optional (proc std-peer-handler)) -;;; (print "starting launch") -;;; (update-known-servers acfg) ;; gotta do this on every start (thus why limit number of publicised servers) -;;; #;(let ((original-handler (current-exception-handler))) ;; is th -;;; (lambda (exception) -;;; (server-exit-procedure) -;;; (original-handler exception))) -;;; (on-exit (lambda () -;;; (shutdown acfg))) ;; (finalize-all-db-handles acfg))) -;;; ;; set up the rpc handler -;;; (let* ((th1 (make-thread -;;; (lambda ()(start-server acfg)) -;;; "server thread")) -;;; (th2 (make-thread -;;; (lambda () -;;; (print "th2 starting") -;;; (let loop () -;;; (work-queue-processor acfg) -;;; (print "work-queue-processor crashed!") -;;; (loop))) -;;; "work queue thread"))) -;;; (thread-start! th1) -;;; (thread-start! th2) -;;; (let loop () -;;; (thread-sleep! 0.025) -;;; (if (area-ready acfg) -;;; #t -;;; (loop))) -;;; ;; attempt to fix my address -;;; (let* ((all-addr (get-all-ips-sorted))) ;; could use (tcp-addresses conn)? -;;; (let loop ((rem-addrs all-addr)) -;;; (if (null? rem-addrs) -;;; (begin -;;; (print "ERROR: Failed to figure out the ip address of myself as a server. 
Giving up.") -;;; (exit 1)) ;; BUG Changeme to raising an exception -;;; -;;; (let* ((addr (car rem-addrs)) -;;; (good-addr (handle-exceptions -;;; exn -;;; #f -;;; ((rpc:procedure 'calling-addr addr (area-port acfg)))))) -;;; (if good-addr -;;; (begin -;;; (print "Got good-addr of " good-addr) -;;; (area-myaddr-set! acfg good-addr)) -;;; (loop (cdr rem-addrs))))))) -;;; (register-node acfg (area-myaddr acfg)(area-port acfg)) -;;; (print "INFO: Server started on " (area-myaddr acfg) ":" (area-port acfg)) -;;; ;; (update-known-servers acfg) ;; gotta do this on every start (thus why limit number of publicised servers) -;;; )) -;;; -;;; (define (clear-server-pkt acfg) -;;; (let ((pktf (area-pktfile acfg))) -;;; (if pktf (delete-file* pktf)))) -;;; -;;; (define (shutdown acfg) -;;; (let (;;(conn (area-conn acfg)) -;;; (pktf (area-pktfile acfg)) -;;; (port (area-port acfg))) -;;; (if pktf (delete-file* pktf)) -;;; (send-all "imshuttingdown") -;;; ;; (rpc:close-all-connections!) ;; don't know if this is actually needed -;;; (finalize-all-db-handles acfg))) -;;; -;;; (define (send-all msg) -;;; #f) -;;; -;;; ;; given a area record look up all the packets -;;; ;; -;;; (define (get-all-server-pkts acfg) -;;; (let ((all-pkt-files (glob (conc (area-pktsdir acfg) "/*.pkt")))) -;;; (map (lambda (pkt-file) -;;; (read-pkt->alist pkt-file pktspec: *pktspec*)) -;;; all-pkt-files))) -;;; -;;; #;((Z . "9a0212302295a19610d5796fce0370fa130758e9") -;;; (port . "34827") -;;; (pid . "28748") -;;; (hostname . "zeus") -;;; (T . "server") -;;; (D . "1549427032.0")) -;;; -;;; #;(define (get-my-best-address) -;;; (let ((all-my-addresses (get-all-ips))) ;; (vector->list (hostinfo-addresses (hostname->hostinfo (get-host-name)))))) -;;; (cond -;;; ((null? all-my-addresses) -;;; (get-host-name)) ;; no interfaces? -;;; ((eq? 
(length all-my-addresses) 1) -;;; (ip->string (car all-my-addresses))) ;; only one to choose from, just go with it -;;; (else -;;; (ip->string (car (filter (lambda (x) ;; take any but 127. -;;; (not (eq? (u8vector-ref x 0) 127))) -;;; all-my-addresses))))))) -;;; -;;; ;; whoami? I am my pkt -;;; ;; -;;; (define (whoami? acfg) -;;; (hash-table-ref/default (area-hosts acfg)(area-pktid acfg) #f)) -;;; -;;; ;;====================================================================== -;;; ;; "Client side" operations -;;; ;;====================================================================== -;;; -;;; (define (safe-call call-key host port . params) -;;; (handle-exceptions -;;; exn -;;; (begin -;;; (print "Call " call-key " to " host ":" port " failed") -;;; #f) -;;; (apply (rpc:procedure call-key host port) params))) -;;; -;;; ;; ;; convert to/from string / sexpr -;;; ;; -;;; ;; (define (string->sexpr str) -;;; ;; (if (string? str) -;;; ;; (with-input-from-string str read) -;;; ;; str)) -;;; ;; -;;; ;; (define (sexpr->string s) -;;; ;; (with-output-to-string (lambda ()(write s)))) -;;; -;;; ;; is the server alive? -;;; ;; -;;; (define (ping acfg host port) -;;; (let* ((myaddr (area-myaddr acfg)) -;;; (myport (area-port acfg)) -;;; (start-time (current-milliseconds)) -;;; (res (if (and (equal? myaddr host) -;;; (equal? myport port)) -;;; (real-ping acfg) -;;; ((rpc:procedure 'ping host port))))) -;;; (cons (- (current-milliseconds) start-time) -;;; res))) -;;; -;;; ;; returns ( ipaddr port alist-fname=>randnum ) -;;; (define (real-ping acfg) -;;; `(,(area-myaddr acfg) ,(area-port acfg) ,(get-host-stats acfg))) -;;; -;;; ;; is the server alive AND the queues processing? -;;; ;; -;;; #;(define (full-ping acfg servpkt) -;;; (let* ((start-time (current-milliseconds)) -;;; (res (send-message acfg servpkt '(full-ping) 'full-ping))) -;;; (cons (- (current-milliseconds) start-time) -;;; res))) ;; (equal? 
res "got ping")))) -;;; -;;; -;;; ;; look up all pkts and get the server id (the hash), port, host/ip -;;; ;; store this info in acfg -;;; ;; return the number of responsive servers found -;;; ;; -;;; ;; DO NOT VERIFY THAT THE SERVER IS ALIVE HERE. This is called at times where the current server is not yet alive and cannot ping itself -;;; ;; -;;; (define (update-known-servers acfg) -;;; ;; readll all pkts -;;; ;; foreach pkt; if it isn't me ping the server; if alive, add to hosts hash, else rm the pkt -;;; (let* ((start-time (current-milliseconds)) -;;; (all-pkts (delete-duplicates -;;; (append (get-all-server-pkts acfg) -;;; (hash-table-values (area-hosts acfg))))) -;;; (hostshash (area-hosts acfg)) -;;; (my-id (area-pktid acfg)) -;;; (pktsdir (area-pktsdir acfg)) ;; needed to remove pkts from non-responsive servers -;;; (numsrvs 0) -;;; (delpkt (lambda (pktsdir sid) -;;; (print "clearing out server " sid) -;;; (delete-file* (conc pktsdir "/" sid ".pkt")) -;;; (hash-table-delete! hostshash sid)))) -;;; (area-last-srvup-set! acfg (current-seconds)) -;;; (for-each -;;; (lambda (servpkt) -;;; (if (list? servpkt) -;;; ;; (pp servpkt) -;;; (let* ((shost (alist-ref 'ipaddr servpkt)) -;;; (sport (any->number (alist-ref 'port servpkt))) -;;; (res (handle-exceptions -;;; exn -;;; (begin -;;; ;; (print "INFO: bad server on " shost ":" sport) -;;; #f) -;;; (ping acfg shost sport))) -;;; (sid (alist-ref 'Z servpkt)) ;; Z code is our name for the server -;;; (url (conc shost ":" sport)) -;;; ) -;;; #;(if (or (not res) -;;; (null? res)) -;;; (begin -;;; (print "STRANGE: ping of " url " gave " res))) -;;; -;;; ;; (print "Got " res " from " shost ":" sport) -;;; (match res -;;; ((qduration . 
payload) -;;; ;; (print "Server pkt:" (alist-ref 'ipaddr servpkt) ":" (alist-ref 'port servpkt) -;;; ;; (if payload -;;; ;; "Success" "Fail")) -;;; (match payload -;;; ((host port stats) -;;; ;; (print "From " host ":" port " got stats: " stats) -;;; (if (and host port stats) -;;; (let ((url (conc host ":" port))) -;;; (hash-table-set! hostshash sid servpkt) -;;; ;; store based on host:port -;;; (hash-table-set! (area-hoststats acfg) sid stats)) -;;; (print "missing data from the server, not sure what that means!")) -;;; (set! numsrvs (+ numsrvs 1))) -;;; (#f -;;; (print "Removing pkt " sid " due to #f from server or failed ping") -;;; (delpkt pktsdir sid)) -;;; (else -;;; (print "Got ")(pp res)(print " from server ")(pp servpkt) " but response did not match (#f/#t . msg)"))) -;;; (else -;;; ;; here we delete the pkt - can't reach the server, remove it -;;; ;; however this logic is inadequate. we should mark the server as checked -;;; ;; and not good, if it happens a second time - then remove the pkt -;;; ;; or something similar. I.e. don't be too quick to assume the server is wedged or dead -;;; ;; could be it is simply too busy to reply -;;; (let ((bad-pings (hash-table-ref/default (area-health acfg) url 0))) -;;; (if (> bad-pings 1) ;; two bad pings - remove pkt -;;; (begin -;;; (print "INFO: " bad-pings " bad responses from " url ", deleting pkt " sid) -;;; (delpkt pktsdir sid)) -;;; (begin -;;; (print "INFO: " bad-pings " bad responses from " shost ":" sport " not deleting pkt yet") -;;; (hash-table-set! (area-health acfg) -;;; url -;;; (+ (hash-table-ref/default (area-health acfg) url 0) 1)) -;;; )) -;;; )))) -;;; ;; servpkt is not actually a pkt? 
-;;; (begin -;;; (print "Bad pkt " servpkt)))) -;;; all-pkts) -;;; (sdbg> "update-known-servers" "end" start-time #f #f " found " numsrvs -;;; " servers, pkts: " (map (lambda (p) -;;; (alist-ref 'Z p)) -;;; all-pkts)) -;;; numsrvs)) -;;; -;;; (defstruct srvstat -;;; (numfiles 0) ;; number of db files handled by this server - subtract 1 for the db being currently looked at -;;; (randnum #f) ;; tie breaker number assigned to by the server itself - applies only to the db under consideration -;;; (pkt #f)) ;; the server pkt -;;; -;;; ;;(define (srv->srvstat srvpkt) -;;; -;;; ;; Get the server best for given dbname and key -;;; ;; -;;; ;; NOTE: key is not currently used. The key points to the kind of query, this may be useful for directing read-only queries. -;;; ;; -;;; (define (get-best-server acfg dbname key) -;;; (let* (;; (servers (hash-table-values (area-hosts acfg))) -;;; (servers (area-hosts acfg)) -;;; (skeys (sort (hash-table-keys servers) string>=?)) ;; a stable listing -;;; (start-time (current-milliseconds)) -;;; (srvstats (make-hash-table)) ;; srvid => srvstat -;;; (url (conc (area-myaddr acfg) ":" (area-port acfg)))) -;;; ;; (print "scores for " dbname ": " (map (lambda (k)(cons k (calc-server-score acfg dbname k))) skeys)) -;;; (if (null? 
skeys) -;;; (if (> (update-known-servers acfg) 0) -;;; (get-best-server acfg dbname key) ;; some risk of infinite loop here, TODO add try counter -;;; (begin -;;; (print "ERROR: no server found!") ;; since this process is also a server this should never happen -;;; #f)) -;;; (begin -;;; ;; (print "in get-best-server with skeys=" skeys) -;;; (if (> (- (current-seconds) (area-last-srvup acfg)) 10) -;;; (begin -;;; (update-known-servers acfg) -;;; (sdbg> "get-best-server" "update-known-servers" start-time #f #f))) -;;; -;;; ;; for each server look at the list of dbfiles, total number of dbs being handled -;;; ;; and the rand number, save the best host -;;; ;; also do a delist-db for each server dbfile not used -;;; (let* ((best-server #f) -;;; (servers-to-delist (make-hash-table))) -;;; (for-each -;;; (lambda (srvid) -;;; (let* ((server (hash-table-ref/default servers srvid #f)) -;;; (stats (hash-table-ref/default (area-hoststats acfg) srvid '(())))) -;;; ;; (print "stats: " stats) -;;; (if server -;;; (let* ((dbweights (car stats)) -;;; (srvload (length (filter (lambda (x)(not (equal? dbname (car x)))) dbweights))) -;;; (dbrec (alist-ref dbname dbweights equal?)) ;; get the pair with fname . randscore -;;; (randnum (if dbrec -;;; dbrec ;; (cdr dbrec) -;;; 0))) -;;; (hash-table-set! srvstats srvid (make-srvstat numfiles: srvload randnum: randnum pkt: server)))))) -;;; skeys) -;;; -;;; (let* ((sorted (sort (hash-table-values srvstats) -;;; (lambda (a b) -;;; (let ((numfiles-a (srvstat-numfiles a)) -;;; (numfiles-b (srvstat-numfiles b)) -;;; (randnum-a (srvstat-randnum a)) -;;; (randnum-b (srvstat-randnum b))) -;;; (if (< numfiles-a numfiles-b) ;; Note, I don't think adding an offset works here. Goal was only move file handling to a different server if it has 2 less -;;; #t -;;; (if (and (equal? numfiles-a numfiles-b) -;;; (< randnum-a randnum-b)) -;;; #t -;;; #f)))))) -;;; (best (if (null? 
sorted) -;;; (begin -;;; (print "ERROR: should never be null due to self as server.") -;;; #f) -;;; (srvstat-pkt (car sorted))))) -;;; #;(print "SERVER(" url "): " dbname ": " (map (lambda (srv) -;;; (let ((p (srvstat-pkt srv))) -;;; (conc (alist-ref 'ipaddr p) ":" (alist-ref 'port p) -;;; "(" (srvstat-numfiles srv)","(srvstat-randnum srv)")"))) -;;; sorted)) -;;; best)))))) -;;; -;;; ;; send out an "I'm about to exit notice to all known servers" -;;; ;; -;;; (define (death-imminent acfg) -;;; '()) -;;; -;;; ;;====================================================================== -;;; ;; U L E X - T H E I N T E R E S T I N G S T U F F ! ! -;;; ;;====================================================================== -;;; -;;; ;; register a handler -;;; ;; NOTES: -;;; ;; dbinitsql is reserved for a list of sql statements for initializing the db -;;; ;; dbinitfn is reserved for a db init function, if exists called after dbinitsql -;;; ;; -;;; (define (register acfg key obj #!optional (ctype 'dbwrite)) -;;; (let ((ht (area-rtable acfg))) -;;; (if (hash-table-exists? ht key) -;;; (print "WARNING: redefinition of entry " key)) -;;; (hash-table-set! ht key (make-calldat obj: obj ctype: ctype)))) -;;; -;;; ;; usage: register-batch acfg '((key1 . sql1) (key2 . sql2) ... ) -;;; ;; NB// obj is often an sql query -;;; ;; -;;; (define (register-batch acfg ctype data) -;;; (let ((ht (area-rtable acfg))) -;;; (map (lambda (dat) -;;; (hash-table-set! 
ht (car dat)(make-calldat obj: (cdr dat) ctype: ctype))) -;;; data))) -;;; -;;; (define (initialize-area-calls-from-specfile area specfile) -;;; (let* ((callspec (with-input-from-file specfile read ))) -;;; (for-each (lambda (group) -;;; (register-batch -;;; area -;;; (car group) -;;; (cdr group))) -;;; callspec))) -;;; -;;; ;; get-rentry -;;; ;; -;;; (define (get-rentry acfg key) -;;; (hash-table-ref/default (area-rtable acfg) key #f)) -;;; -;;; (define (get-rsql acfg key) -;;; (let ((cdat (get-rentry acfg key))) -;;; (if cdat -;;; (calldat-obj cdat) -;;; #f))) -;;; -;;; -;;; -;;; ;; blocking call: -;;; ;; client server -;;; ;; ------ ------ -;;; ;; call() -;;; ;; send-message() -;;; ;; nmsg-send() -;;; ;; nmsg-receive() -;;; ;; nmsg-respond(ack,cookie) -;;; ;; ack, cookie -;;; ;; mbox-thread-wait(cookie) -;;; ;; nmsg-send(client,cookie,result) -;;; ;; nmsg-respond(ack) -;;; ;; return result -;;; ;; -;;; ;; reserved action: -;;; ;; 'immediate -;;; ;; 'dbinitsql -;;; ;; -;;; (define (call acfg dbname action params #!optional (count 0)) -;;; (let* ((call-start-time (current-milliseconds)) -;;; (srv (get-best-server acfg dbname action)) -;;; (post-get-start-time (current-milliseconds)) -;;; (rdat (hash-table-ref/default (area-rtable acfg) action #f)) -;;; (myid (trim-pktid (area-pktid acfg))) -;;; (srvid (trim-pktid (alist-ref 'Z srv))) -;;; (cookie (make-cookie myid))) -;;; (sdbg> "call" "get-best-server" call-start-time #f call-start-time " from: " myid " to server: " srvid " for " dbname " action: " action " params: " params " rdat: " rdat) -;;; (print "INFO: call to " (alist-ref 'ipaddr srv) ":" (alist-ref 'port srv) " from " (area-myaddr acfg) ":" (area-port acfg) " for " dbname) -;;; (if (and srv rdat) ;; need both to dispatch a request -;;; (let* ((ripaddr (alist-ref 'ipaddr srv)) -;;; (rsrvid (alist-ref 'Z srv)) -;;; (rport (any->number (alist-ref 'port srv))) -;;; (res-full (if (and (equal? ripaddr (area-myaddr acfg)) -;;; (equal? 
rport (area-port acfg))) -;;; (request acfg ripaddr rport (area-pktid acfg) action cookie dbname params) -;;; (safe-call 'request ripaddr rport -;;; (area-myaddr acfg) -;;; (area-port acfg) -;;; #;(area-pktid acfg) -;;; rsrvid -;;; action cookie dbname params)))) -;;; ;; (print "res-full: " res-full) -;;; (match res-full -;;; ((response-ok response-msg rem ...) -;;; (let* ((send-message-time (current-milliseconds)) -;;; ;; (match res-full -;;; ;; ((response-ok response-msg) -;;; ;; (response-ok (car res-full)) -;;; ;; (response-msg (cadr res-full) -;;; ) -;;; ;; (res (take res-full 3))) ;; ctype == action, TODO: converge on one term <<=== what was this? BUG -;;; ;; (print "ulex:call: send-message took " (- send-message-time post-get-start-time) " ms params=" params) -;;; (sdbg> "call" "send-message" post-get-start-time #f call-start-time) -;;; (cond -;;; ((not response-ok) #f) -;;; ((member response-msg '("db read submitted" "db write submitted")) -;;; (let* ((cookie-id (cadddr res-full)) -;;; (mbox (make-mailbox)) -;;; (mbox-time (current-milliseconds))) -;;; (hash-table-set! (area-cookie2mbox acfg) cookie-id mbox) -;;; (let* ((mbox-timeout-secs 20) -;;; (mbox-timeout-result 'MBOX_TIMEOUT) -;;; (res (mailbox-receive! mbox mbox-timeout-secs mbox-timeout-result)) -;;; (mbox-receive-time (current-milliseconds))) -;;; (hash-table-delete! (area-cookie2mbox acfg) cookie-id) -;;; (sdbg> "call" "mailbox-receive" mbox-time #f call-start-time " from: " myid " to server: " srvid " for " dbname) -;;; ;; (print "ulex:call mailbox-receive took " (- mbox-receive-time mbox-time) "ms params=" params) -;;; res))) -;;; (else -;;; (print "Unhandled response \""response-msg"\"") -;;; #f)) -;;; ;; depending on what action (i.e. 
ctype) is we will block here waiting for -;;; ;; all the data (mechanism to be determined) -;;; ;; -;;; ;; if res is a "working on it" then wait -;;; ;; wait for result -;;; ;; mailbox thread wait on -;;; -;;; ;; if res is a "can't help you" then try a different server -;;; ;; if res is a "ack" (e.g. for one-shot requests) then return res -;;; )) -;;; (else -;;; (if (< count 10) -;;; (let* ((url (conc (alist-ref 'ipaddr srv) ":" (alist-ref 'port srv)))) -;;; (thread-sleep! 1) -;;; (print "ERROR: Bad result from " url ", dbname: " dbname ", action: " action ", params: " params ". Trying again in 1 second.") -;;; (call acfg dbname action params (+ count 1))) -;;; (begin -;;; (error (conc "ERROR: " count " tries, still have improper response res-full=" res-full))))))) -;;; (begin -;;; (if (not rdat) -;;; (print "ERROR: action " action " not registered.") -;;; (if (< count 10) -;;; (begin -;;; (thread-sleep! 1) -;;; (area-hosts-set! acfg (make-hash-table)) ;; clear out all known hosts -;;; (print "ERROR: no server found, srv=" srv ", trying again in 1 seconds") -;;; (call acfg dbname action params (+ count 1))) -;;; (begin -;;; (error (conc "ERROR: no server found after 10 tries, srv=" srv ", giving up.")) -;;; #;(error "No server available")))))))) -;;; -;;; -;;; ;;====================================================================== -;;; ;; U T I L I T I E S -;;; ;;====================================================================== -;;; -;;; ;; get a signature for identifing this process -;;; ;; -;;; (define (get-process-signature) -;;; (cons (get-host-name)(current-process-id))) -;;; -;;; ;;====================================================================== -;;; ;; S Y S T E M S T U F F -;;; ;;====================================================================== -;;; -;;; ;; get normalized cpu load by reading from /proc/loadavg and -;;; ;; /proc/cpuinfo return all three values and the number of real cpus -;;; ;; and the number of threads returns alist 
'((adj-cpu-load -;;; ;; . normalized-proc-load) ... etc. keys: adj-proc-load, -;;; ;; adj-core-load, 1m-load, 5m-load, 15m-load -;;; ;; -;;; (define (get-normalized-cpu-load) -;;; (let ((res (get-normalized-cpu-load-raw)) -;;; (default `((adj-proc-load . 2) ;; there is no right answer -;;; (adj-core-load . 2) -;;; (1m-load . 2) -;;; (5m-load . 0) ;; causes a large delta - thus causing default of throttling if stuff goes wrong -;;; (15m-load . 0) -;;; (proc . 1) -;;; (core . 1) -;;; (phys . 1) -;;; (error . #t)))) -;;; (cond -;;; ((and (list? res) -;;; (> (length res) 2)) -;;; res) -;;; ((eq? res #f) default) ;; add messages? -;;; ((eq? res #f) default) ;; this would be the #eof -;;; (else default)))) -;;; -;;; (define (get-normalized-cpu-load-raw) -;;; (let* ((actual-host (get-host-name))) ;; #f is localhost -;;; (let ((data (append -;;; (with-input-from-file "/proc/loadavg" read-lines) -;;; (with-input-from-file "/proc/cpuinfo" read-lines) -;;; (list "end"))) -;;; (load-rx (regexp "^([\\d\\.]+)\\s+([\\d\\.]+)\\s+([\\d\\.]+)\\s+.*$")) -;;; (proc-rx (regexp "^processor\\s+:\\s+(\\d+)\\s*$")) -;;; (core-rx (regexp "^core id\\s+:\\s+(\\d+)\\s*$")) -;;; (phys-rx (regexp "^physical id\\s+:\\s+(\\d+)\\s*$")) -;;; (max-num (lambda (p n)(max (string->number p) n)))) -;;; ;; (print "data=" data) -;;; (if (null? data) ;; something went wrong -;;; #f -;;; (let loop ((hed (car data)) -;;; (tal (cdr data)) -;;; (loads #f) -;;; (proc-num 0) ;; processor includes threads -;;; (phys-num 0) ;; physical chip on motherboard -;;; (core-num 0)) ;; core -;;; ;; (print hed ", " loads ", " proc-num ", " phys-num ", " core-num) -;;; (if (null? 
tal) ;; have all our data, calculate normalized load and return result -;;; (let* ((act-proc (+ proc-num 1)) -;;; (act-phys (+ phys-num 1)) -;;; (act-core (+ core-num 1)) -;;; (adj-proc-load (/ (car loads) act-proc)) -;;; (adj-core-load (/ (car loads) act-core)) -;;; (result -;;; (append (list (cons 'adj-proc-load adj-proc-load) -;;; (cons 'adj-core-load adj-core-load)) -;;; (list (cons '1m-load (car loads)) -;;; (cons '5m-load (cadr loads)) -;;; (cons '15m-load (caddr loads))) -;;; (list (cons 'proc act-proc) -;;; (cons 'core act-core) -;;; (cons 'phys act-phys))))) -;;; result) -;;; (regex-case -;;; hed -;;; (load-rx ( x l1 l5 l15 ) (loop (car tal)(cdr tal)(map string->number (list l1 l5 l15)) proc-num phys-num core-num)) -;;; (proc-rx ( x p ) (loop (car tal)(cdr tal) loads (max-num p proc-num) phys-num core-num)) -;;; (phys-rx ( x p ) (loop (car tal)(cdr tal) loads proc-num (max-num p phys-num) core-num)) -;;; (core-rx ( x c ) (loop (car tal)(cdr tal) loads proc-num phys-num (max-num c core-num))) -;;; (else -;;; (begin -;;; ;; (print "NO MATCH: " hed) -;;; (loop (car tal)(cdr tal) loads proc-num phys-num core-num)))))))))) -;;; -;;; (define (get-host-stats acfg) -;;; (let ((stats-hash (area-stats acfg))) -;;; ;; use this opportunity to remove references to dbfiles which have not been accessed in a while -;;; (for-each -;;; (lambda (dbname) -;;; (let* ((stats (hash-table-ref stats-hash dbname)) -;;; (last-access (stat-when stats))) -;;; (if (and (> last-access 0) ;; if zero then there has been no access -;;; (> (- (current-seconds) last-access) 10)) ;; not used in ten seconds -;;; (begin -;;; (print "Removing " dbname " from stats list") -;;; (hash-table-delete! stats-hash dbname) ;; remove from stats hash -;;; (stat-dbs-set! 
stats (hash-table-keys stats)))))) -;;; (hash-table-keys stats-hash)) -;;; -;;; `(,(hash-table->alist (area-dbs acfg)) ;; dbname => randnum -;;; ,(map (lambda (dbname) ;; dbname is the db name -;;; (cons dbname (stat-when (hash-table-ref stats-hash dbname)))) -;;; (hash-table-keys stats-hash)) -;;; (cpuload . ,(get-normalized-cpu-load))))) -;;; #;(stats . ,(map (lambda (k) ;; create an alist from the stats data -;;; (cons k (stat->alist (hash-table-ref (area-stats acfg) k)))) -;;; (hash-table-keys (area-stats acfg)))) -;;; -;;; #;(trace -;;; ;; assv -;;; ;; cdr -;;; ;; caar -;;; ;; ;; cdr -;;; ;; call -;;; ;; finalize-all-db-handles -;;; ;; get-all-server-pkts -;;; ;; get-normalized-cpu-load -;;; ;; get-normalized-cpu-load-raw -;;; ;; launch -;;; ;; nmsg-send -;;; ;; process-db-queries -;;; ;; receive-message -;;; ;; std-peer-handler -;;; ;; update-known-servers -;;; ;; work-queue-processor -;;; ) -;;; -;;; ;;====================================================================== -;;; ;; netutil -;;; ;; move this back to ulex-netutil.scm someday? 
-;;; ;;====================================================================== -;;; -;;; ;; #include -;;; ;; #include -;;; ;; #include -;;; ;; #include -;;; -;;; (foreign-declare "#include \"sys/types.h\"") -;;; (foreign-declare "#include \"sys/socket.h\"") -;;; (foreign-declare "#include \"ifaddrs.h\"") -;;; (foreign-declare "#include \"arpa/inet.h\"") -;;; -;;; ;; get IP addresses from ALL interfaces -;;; (define get-all-ips -;;; (foreign-safe-lambda* scheme-object () -;;; " -;;; -;;; // from https://stackoverflow.com/questions/17909401/linux-c-get-default-interfaces-ip-address : -;;; -;;; -;;; C_word lst = C_SCHEME_END_OF_LIST, len, str, *a; -;;; // struct ifaddrs *ifa, *i; -;;; // struct sockaddr *sa; -;;; -;;; struct ifaddrs * ifAddrStruct = NULL; -;;; struct ifaddrs * ifa = NULL; -;;; void * tmpAddrPtr = NULL; -;;; -;;; if ( getifaddrs(&ifAddrStruct) != 0) -;;; C_return(C_SCHEME_FALSE); -;;; -;;; // for (i = ifa; i != NULL; i = i->ifa_next) { -;;; for (ifa = ifAddrStruct; ifa != NULL; ifa = ifa->ifa_next) { -;;; if (ifa->ifa_addr->sa_family==AF_INET) { // Check it is -;;; // a valid IPv4 address -;;; tmpAddrPtr = &((struct sockaddr_in *)ifa->ifa_addr)->sin_addr; -;;; char addressBuffer[INET_ADDRSTRLEN]; -;;; inet_ntop(AF_INET, tmpAddrPtr, addressBuffer, INET_ADDRSTRLEN); -;;; // printf(\"%s IP Address %s\\n\", ifa->ifa_name, addressBuffer); -;;; len = strlen(addressBuffer); -;;; a = C_alloc(C_SIZEOF_PAIR + C_SIZEOF_STRING(len)); -;;; str = C_string(&a, len, addressBuffer); -;;; lst = C_a_pair(&a, str, lst); -;;; } -;;; -;;; // else if (ifa->ifa_addr->sa_family==AF_INET6) { // Check it is -;;; // // a valid IPv6 address -;;; // tmpAddrPtr = &((struct sockaddr_in6 *)ifa->ifa_addr)->sin6_addr; -;;; // char addressBuffer[INET6_ADDRSTRLEN]; -;;; // inet_ntop(AF_INET6, tmpAddrPtr, addressBuffer, INET6_ADDRSTRLEN); -;;; //// printf(\"%s IP Address %s\\n\", ifa->ifa_name, addressBuffer); -;;; // len = strlen(addressBuffer); -;;; // a = C_alloc(C_SIZEOF_PAIR + 
C_SIZEOF_STRING(len)); -;;; // str = C_string(&a, len, addressBuffer); -;;; // lst = C_a_pair(&a, str, lst); -;;; // } -;;; -;;; // else { -;;; // printf(\" not an IPv4 address\\n\"); -;;; // } -;;; -;;; } -;;; -;;; freeifaddrs(ifa); -;;; C_return(lst); -;;; -;;; ")) -;;; -;;; ;; Change this to bias for addresses with a reasonable broadcast value? -;;; ;; -;;; (define (ip-pref-less? a b) -;;; (let* ((rate (lambda (ipstr) -;;; (regex-case ipstr -;;; ( "^127\\." _ 0 ) -;;; ( "^(10\\.0|192\\.168\\.)\\..*" _ 1 ) -;;; ( else 2 ) )))) -;;; (< (rate a) (rate b)))) -;;; -;;; -;;; (define (get-my-best-address) -;;; (let ((all-my-addresses (get-all-ips)) -;;; ;;(all-my-addresses-old (vector->list (hostinfo-addresses (hostname->hostinfo (get-host-name))))) -;;; ) -;;; (cond -;;; ((null? all-my-addresses) -;;; (get-host-name)) ;; no interfaces? -;;; ((eq? (length all-my-addresses) 1) -;;; (car all-my-addresses)) ;; only one to choose from, just go with it -;;; -;;; (else -;;; (car (sort all-my-addresses ip-pref-less?))) -;;; ;; (else -;;; ;; (ip->string (car (filter (lambda (x) ;; take any but 127. -;;; ;; (not (eq? (u8vector-ref x 0) 127))) -;;; ;; all-my-addresses)))) -;;; -;;; ))) -;;; -;;; (define (get-all-ips-sorted) -;;; (sort (get-all-ips) ip-pref-less?)) -;;; -;;; - + (map address-info-host + (filter (lambda (x) + (equal? (address-info-type x) "tcp")) + (address-infos (get-host-name))))) + +)