Index: Makefile
==================================================================
--- Makefile
+++ Makefile
@@ -36,31 +36,30 @@
subrun.scm archive.scm env.scm \
diff-report.scm cgisetup/models/pgdb.scm
# module source files
MSRCFILES = dbfile.scm debugprint.scm mtargs.scm commonmod.scm dbmod.scm \
- tcp-transportmod.scm rmtmod.scm portlogger.scm
+ tcp-transportmod.scm rmtmod.scm portlogger.scm apimod.scm
transport-mode.scm : transport-mode.scm.template
- @if [[ -e transport-mode.scm ]];then \
- echo "WARNING: transport-mode.scm.template is newer than transport-mode.scm"; else \
- cp transport-mode.scm.template transport-mode.scm; fi
+ cp transport-mode.scm.template transport-mode.scm
dashboard-transport-mode.scm : dashboard-transport-mode.scm.template
- @if [[ -e dashboard-transport-mode.scm ]];then \
- echo "WARNING: dashboard-transport-mode.scm.template is newer than dashboard-transport-mode.scm"; else \
- cp dashboard-transport-mode.scm.template dashboard-transport-mode.scm; fi
+ cp dashboard-transport-mode.scm.template dashboard-transport-mode.scm
-megatest.scm : transport-mode.scm
-dashboard.scm : dashboard-transport-mode.scm
+mtest : transport-mode.scm
+dboard : dashboard-transport-mode.scm
# dbmod.import.o is just a hack here
mofiles/portlogger.o : mofiles/dbmod.o
mofiles/dbfile.o : \
- mofiles/debugprint.o mofiles/commonmod.o
-
+ mofiles/debugprint.o mofiles/commonmod.o
+mofiles/apimod.o : mofiles/commonmod.o mofiles/tcp-transportmod.o
+mofiles/dbmod.o : mofiles/dbfile.o
+mofiles/api.o : mofiles/apimod.o
+mofiles/commonmod.o : mofiles/debugprint.o
configf.o : commonmod.import.o
mofiles/dbfile.o : mofiles/debugprint.o
mofiles/rmtmod.o mofiles/dbmod.o : mofiles/dbfile.o mofiles/commonmod.o mofiles/debugprint.o
db.o : mofiles/dbmod.o mofiles/dbfile.o
mofiles/debugprint.o : mofiles/mtargs.o
Index: TODO
==================================================================
--- TODO
+++ TODO
@@ -16,10 +16,51 @@
# along with Megatest. If not, see .
TODO
====
+23WW48
+. Add calls-per-minute to db access stats
+. Find out why start-server calls are taking 250ms and fix
+. Allow two or three servers to run for any given db
+. Update avg call count/sec every 30 sec in no-sync
+. get server uses no-sync process info to decide which server to suggest
+. Use process table to decide who will do sync back
+. Fix metadat being synced over and over
+
+23WW47
+. Finding server
+.. look at .servinfo for likely prime main
+.. ask the .servinfo prime main for real prime main
+.. save prime main (for how long, 10 seconds or 10 minutes?)
+
+. Starting prime main
+.. get servinfo files - START
+.. no files? create my servinfo file, goto START
+.. have files? am I the prime main according to servinfo files?
+.. no, I'm not the prime main, ping prime main
+.. ping is good, prime main exists, register self as server if on same host as prime main DONE
+.. no pirng response, remove the .servinfo file - goto START
+.. if I am prime main according to .servinfo files, register directly in no-sync
+
+. Starting non-main
+.. get servinfo files
+.. no files? launch server for main.db
+.. have files? pick out prime main
+.. register self as server with prime main
+
+23WW46 - v1.80 branch
+. Use file semaphore to kill tests, eliminate db load of the KILLREQ query
+. Merge this change to revolution branch
+23WW45 - the revolution branch
+. Add "fast" db start option (no handshaking over NFS)
+. Add server-ro to server types (just "server" is fine for read/write).
+. [DONE] Create pause-server and resume-server calls
+. Create rsync or cp sync to MTRAH function
+. Change rmt:send-receive to divert calls to read-only server when possible
+. [DONE] Change start server to call main.db server for 1..N.db servers, block until server is read for use.
+
23WW21
. Dashboard needs its own cache db in /tmp
23WW07
. Remove use of *dbstruct-dbs*
Index: api.scm
==================================================================
--- api.scm
+++ api.scm
@@ -18,317 +18,146 @@
;;
;;======================================================================
(declare (unit api))
(declare (uses db))
+(declare (uses apimod))
+
(declare (uses debugprint))
(declare (uses commonmod))
(declare (uses dbmod))
(declare (uses dbfile))
(declare (uses tasks))
(declare (uses tcp-transportmod))
(import commonmod)
+(import apimod)
(import dbmod)
(import dbfile)
(import debugprint)
(import tcp-transportmod)
(use srfi-69
srfi-18
posix
matchable
- s11n)
-
-;; allow these queries through without starting a server
-;;
-(define api:read-only-queries
- '(get-key-val-pairs
- get-var
- get-keys
- get-key-vals
- test-toplevel-num-items
- get-test-info-by-id
- get-test-state-status-by-id
- get-steps-info-by-id
- get-data-info-by-id
- test-get-rundir-from-test-id
- get-count-tests-running-for-testname
- get-count-tests-running
- get-count-tests-running-in-jobgroup
- get-previous-test-run-record
- get-matching-previous-test-run-records
- test-get-logfile-info
- test-get-records-for-index-file
- get-testinfo-state-status
- test-get-top-process-pid
- test-get-paths-matching-keynames-target-new
- get-prereqs-not-met
- get-count-tests-running-for-run-id
- get-run-info
- get-run-status
- get-run-state
- get-run-stats
- get-run-times
- get-target
- get-targets
- ;; register-run
- get-tests-tags
- get-test-times
- get-tests-for-run
- get-tests-for-run-state-status
- get-test-id
- get-tests-for-runs-mindata
- get-tests-for-run-mindata
- get-run-name-from-id
- get-runs
- simple-get-runs
- get-num-runs
- get-runs-cnt-by-patt
- get-all-run-ids
- get-prev-run-ids
- get-run-ids-matching-target
- get-runs-by-patt
- get-steps-data
- get-steps-for-test
- read-test-data
- read-test-data-varpatt
- login
- tasks-get-last
- testmeta-get-record
- have-incompletes?
- get-changed-record-ids
- get-all-runids
- get-changed-record-test-ids
- get-changed-record-run-ids
- get-run-record-ids
- get-not-completed-cnt))
-
-(define api:write-queries
- '(
- get-keys-write ;; dummy "write" query to force server start
-
- ;; SERVERS
- ;; start-server
- ;; kill-server
-
- ;; TESTS
- test-set-state-status-by-id
- delete-test-records
- delete-old-deleted-test-records
- test-set-state-status
- test-set-top-process-pid
- set-state-status-and-roll-up-items
-
- update-pass-fail-counts
- top-test-set-per-pf-counts ;; (db:top-test-set-per-pf-counts (db:get-db *db* 5) 5 "runfirst")
-
- ;; RUNS
- register-run
- set-tests-state-status
- delete-run
- lock/unlock-run
- update-run-event_time
- mark-incomplete
- set-state-status-and-roll-up-run
- ;; STEPS
- teststep-set-status!
- delete-steps-for-test
- ;; TEST DATA
- test-data-rollup
- csv->test-data
-
- ;; MISC
- sync-cachedb->db
- drop-all-triggers
- create-all-triggers
- update-tesdata-on-repilcate-db
-
- ;; TESTMETA
- testmeta-add-record
- testmeta-update-field
-
- ;; TASKS
- tasks-add
- tasks-set-state-given-param-key
- ))
-
-(define *db-write-mutexes* (make-hash-table))
-(define *server-signature* #f)
-;; ;; These are called by the server on recipt of /api calls
-;; ;; - keep it simple, only return the actual result of the call, i.e. no meta info here
-;; ;;
-;; ;; - returns #( flag result )
-;; ;;
-;; (define (api:execute-requests dbstruct dat)
-;; (if (> *api-process-request-count* 50)
-;; (begin
-;; (if (common:low-noise-print 30 "too many threads")
-;; (debug:print 0 *default-log-port* "WARNING: "*api-process-request-count*" threads, potential overload, adding 0.5 sec delay."))
-;; ;; (thread-sleep! 0.5) ;; take a nap - no, the napping is moved to the clients via tt:backoff-incr
-;; ))
-;; (cond
-;; ((not (vector? dat)) ;; it is an error to not receive a vector
-;; (vector #f (vector #f "remote must be called with a vector")))
-;; (else
-;; (let* ((cmd-in (vector-ref dat 0))
-;; (cmd (if (symbol? cmd-in)
-;; cmd-in
-;; (string->symbol cmd-in)))
-;; (params (vector-ref dat 1))
-;; (run-id (if (null? params)
-;; 0
-;; (car params)))
-;; (write-mutex (if (hash-table-exists? *db-write-mutexes* run-id)
-;; (hash-table-ref *db-write-mutexes* run-id)
-;; (let* ((newmutex (make-mutex)))
-;; (hash-table-set! *db-write-mutexes* run-id newmutex)
-;; newmutex)))
-;; (start-t (current-milliseconds))
-;; (readonly-mode (dbr:dbstruct-read-only dbstruct))
-;; (readonly-command (member cmd api:read-only-queries))
-;; (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))))
-;; (if (not readonly-command)
-;; (mutex-lock! write-mutex))
-;; (let* ((tmppath (dbr:dbstruct-tmppath dbstruct))
-;; (clean-run-id (cond
-;; ((number? run-id) run-id)
-;; ((equal? run-id #f) "main")
-;; (else "other")))
-;; (crumbfile (dbfile:wait-for-qif tmppath clean-run-id (cons cmd params)))
-;; (res
-;; (if writecmd-in-readonly-mode
-;; (conc "attempt to run write command "cmd" on a read-only database")
-;; (api:dispatch-request dbstruct cmd run-id params))))
-;; (delete-file* crumbfile)
-;; (if (not readonly-command)
-;; (mutex-unlock! write-mutex))
-;;
-;; ;; save all stats
-;; (let ((delta-t (- (current-milliseconds)
-;; start-t))
-;; (modified-cmd (if (eq? cmd 'general-call)
-;; (string->symbol (conc "general-call-" (car params)))
-;; cmd)))
-;; (hash-table-set! *db-api-call-time* modified-cmd
-;; (cons delta-t (hash-table-ref/default *db-api-call-time* modified-cmd '()))))
-;; (if writecmd-in-readonly-mode
-;; (begin
-;; #;(common:telemetry-log (conc "api-out:"(->string cmd))
-;; payload: `((params . ,params)
-;; (ok-res . #t)))
-;; (vector #f res))
-;; (begin
-;; #;(common:telemetry-log (conc "api-out:"(->string cmd))
-;; payload: `((params . ,params)
-;; (ok-res . #f)))
-;; (vector #t res))))))))
-
-(define *api-threads* '())
-(define (api:register-thread th-in)
- (set! *api-threads* (cons (cons th-in (current-seconds)) *api-threads*)))
-
-(define (api:unregister-thread th-in)
- (set! *api-threads* (filter (lambda (thdat)
- (not (eq? th-in (car thdat))))
- *api-threads*)))
-
-(define (api:remove-dead-or-terminated)
- (set! *api-threads* (filter (lambda (thdat)
- (not (member (thread-state (car thdat)) '(terminated dead))))
- *api-threads*)))
-
-(define (api:get-count-threads-alive)
- (length *api-threads*))
-
+ s11n
+ typed-records)
+
+
+;; QUEUE METHOD
+
+(define (api:tcp-dispatch-request-make-handler-new dbstruct) ;; cmd run-id params)
+ (api:tcp-dispatch-request-make-handler-core dbstruct api:dispatch-request))
+
;; indat is (cmd run-id params meta)
;;
;; WARNING: Do not print anything in the lambda of this function as it
;; reads/writes to current in/out port
;;
-(define (api:tcp-dispatch-request-make-handler dbstruct) ;; cmd run-id params)
+(define (api:tcp-dispatch-request-make-handler-old dbstruct) ;; cmd run-id params)
(assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.")
(if (not *server-signature*)
(set! *server-signature* (tt:mk-signature *toppath*)))
(lambda (indat)
(api:register-thread (current-thread))
- (let* (;; (indat (deserialize))
- (newcount (+ *api-process-request-count* 1))
- (numthreads (api:get-count-threads-alive))
- (delay-wait (if (> newcount 10)
- (- newcount 10)
- 0))
- (normal-proc (lambda (cmd run-id params)
- (case cmd
- ((ping) *server-signature*)
- (else
- (api:dispatch-request dbstruct cmd run-id params))))))
- (set! *api-process-request-count* newcount)
- (set! *db-last-access* (current-seconds))
- (if (not (eq? newcount numthreads))
- (begin
- (api:remove-dead-or-terminated)
- (let ((threads-now (api:get-count-threads-alive)))
- (debug:print 0 *default-log-port* "WARNING: newcount="newcount", numthreads="numthreads", remaining="threads-now)
- (set! newcount threads-now))))
- (match indat
- ((cmd run-id params meta)
- (let* ((db-ok (let* ((dbfname (dbmod:run-id->dbfname run-id))
- (ok (equal? dbfname (dbr:dbstruct-dbfname dbstruct))))
- (case cmd
- ((ping) #t) ;; we are fine
- (else
- (if (not ok)(debug:print 0 *default-log-port* "ERROR: "cmd", run-id "run-id", not correct for dbfname "(dbr:dbstruct-dbfname dbstruct)))
- (assert ok "FATAL: database file and run-id not aligned.")))))
- (ttdat *server-info*)
- (server-state (tt-state ttdat))
- (status (cond
- ((> newcount 3) 'busy)
- ;; ((> newcount 5) 'loaded) ;; this gets transmitted to the client which calls tt:backoff-incr to slow stuff down.
- (else 'ok)))
- (errmsg (case status
- ((busy) (conc "Server overloaded, "newcount" threads in flight"))
- ((loaded) (conc "Server loaded, "newcount" threads in flight"))
- (else #f)))
- (result (case status
- ((busy)
- (if (eq? cmd 'ping)
- (normal-proc cmd run-id params)
- ;; newcount must be greater than 5 for busy
- (* 1 (- newcount 3)) ;; was 15
- )) ;; (- newcount 29)) ;; call back in as many seconds
- ((loaded)
-;; (if (eq? (rmt:transport-mode) 'tcp)
-;; (thread-sleep! 0.5))
- (normal-proc cmd run-id params))
- (else
- (normal-proc cmd run-id params))))
- (meta (case cmd
- ((ping) `((sstate . ,server-state)))
- (else `((wait . ,delay-wait)))))
- (payload (list status errmsg result meta)))
- (set! *api-process-request-count* (- *api-process-request-count* 1))
- ;; (serialize payload)
- (api:unregister-thread (current-thread))
- payload))
- (else
- (assert #f "FATAL: failed to deserialize indat "indat))))))
-
+ (let* ((result
+ (let* ((numthreads (api:get-count-threads-alive))
+ (delay-wait (if (> numthreads 10)
+ (- numthreads 10)
+ 0))
+ (normal-proc (lambda (cmd run-id params)
+ (case cmd
+ ((ping) *server-signature*)
+ (else
+ (api:dispatch-request dbstruct cmd run-id params))))))
+ (set! *api-process-request-count* numthreads)
+ (set! *db-last-access* (current-seconds))
+;; (if (not (eq? numthreads numthreads))
+;; (begin
+;; (api:remove-dead-or-terminated)
+;; (let ((threads-now (api:get-count-threads-alive)))
+;; (debug:print 0 *default-log-port* "WARNING: numthreads="numthreads", numthreads="numthreads", remaining="threads-now)
+;; (set! numthreads threads-now))))
+ (match indat
+ ((cmd run-id params meta)
+ (let* ((start-t (current-milliseconds))
+ (db-ok (let* ((dbfname (dbmod:run-id->dbfname run-id))
+ (ok (equal? dbfname (dbr:dbstruct-dbfname dbstruct))))
+ (case cmd
+ ((ping) #t) ;; we are fine
+ (else
+ (assert ok "FATAL: database file and run-id not aligned.")))))
+ (ttdat *server-info*)
+ (server-state (tt-state ttdat))
+ (maxthreads 20) ;; make this a parameter?
+ (status (cond
+ ((and (> numthreads maxthreads)
+ (> (random 100) 70)) ;; allow a 30% probability to go through so we can figure out what is going wrong in main.db server.
+ 'busy)
+ ;; ((> numthreads 5) 'loaded) ;; this gets transmitted to the client which calls tt:backoff-incr to slow stuff down.
+ (else 'ok)))
+ (errmsg (case status
+ ((busy) (conc "Server overloaded, "numthreads" threads in flight"))
+ ((loaded) (conc "Server loaded, "numthreads" threads in flight"))
+ (else #f)))
+ (result (case status
+ ((busy)
+ (if (eq? cmd 'ping)
+ (normal-proc cmd run-id params)
+ ;; numthreads must be greater than 5 for busy
+ (* 0.1 (- numthreads maxthreads)) ;; was 15 - return a number for the remote to delay
+ )) ;; (- numthreads 29)) ;; call back in as many seconds
+ ((loaded)
+ ;; (if (eq? (rmt:transport-mode) 'tcp)
+ ;; (thread-sleep! 0.5))
+ (normal-proc cmd run-id params))
+ (else
+ (normal-proc cmd run-id params))))
+ (meta (case cmd
+ ((ping) `((sstate . ,server-state)))
+ (else `((wait . ,delay-wait)))))
+ (payload (list status errmsg result meta)))
+ ;; (cmd run-id params meta)
+ (db:add-stats cmd run-id params (- (current-milliseconds) start-t))
+ payload))
+ (else
+ (assert #f "FATAL: failed to deserialize indat "indat))))))
+ ;; (set! *api-process-request-count* (- *api-process-request-count* 1))
+ ;; (serialize payload)
+
+ (api:unregister-thread (current-thread))
+ result)))
+
+(define api:tcp-dispatch-request-make-handler api:tcp-dispatch-request-make-handler-old) ;; choose -old or -new
+
+(define *api-halt-writes* #f)
(define (api:dispatch-request dbstruct cmd run-id params)
(if (not *no-sync-db*)
(db:open-no-sync-db))
+ (let* ((start-time (current-milliseconds)))
+ (if (member cmd api:write-queries)
+ (let loop ()
+ (if *api-halt-writes*
+ (begin
+ (thread-sleep! 0.2)
+ (if (< (- (current-milliseconds) start-time)
+ 5000) ;; hope it don't take more than five seconds to sync
+ (loop-time)
+ #;(debug:print 0 *default-log-port* "ERROR: writes halted for more than 5 seconds, sync might be taking too long"))))))
+ (db:add-stats 'api-write-blocking-for-sync run-id params (- (current-milliseconds) start-time)))
(case cmd
;;===============================================
;; READ/WRITE QUERIES
;;===============================================
((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl
;; SERVERS
- ((start-server) (apply server:kind-run params))
+ ((start-server) (apply tt:server-process-run params))
((kill-server) (set! *server-run* #f))
;; TESTS
;;((test-set-state-status-by-id) (apply mt:test-set-state-status-by-id dbstruct params))
@@ -513,43 +342,5 @@
((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params))
(else
(debug:print 0 *default-log-port* "ERROR: bad api call " cmd)
(conc "ERROR: BAD api call " cmd))))
-;; http-server send-response
-;; api:process-request
-;; db:*
-;;
-;; NB// Runs on the server as part of the server loop
-;;
-(define (api:process-request dbstruct $) ;; the $ is the request vars proc
- (debug:print 4 *default-log-port* "server-id:" *server-id*)
- (let* ((cmd ($ 'cmd))
- (paramsj ($ 'params))
- (key ($ 'key))
- (params (db:string->obj paramsj transport: 'http))) ;; incoming data from the POST (or is it a GET?)
- (debug:print 4 *default-log-port* "cmd:" cmd " with params " params "key " key)
- (if (equal? key *server-id*)
- (begin
- (set! *api-process-request-count* (+ *api-process-request-count* 1))
- (let* ((resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result )
- (success (vector-ref resdat 0))
- (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?)
- (debug:print 4 *default-log-port* "res:" res)
- (if (not success)
- (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params))
- (if (> *api-process-request-count* *max-api-process-requests*)
- (set! *max-api-process-requests* *api-process-request-count*))
- (set! *api-process-request-count* (- *api-process-request-count* 1))
- ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds
- ;; (rmt:dat->json-str
- ;; (if (or (string? res)
- ;; (list? res)
- ;; (number? res)
- ;; (boolean? res))
- ;; res
- ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res)))))
- (db:obj->string res transport: 'http)))
- (begin
- (debug:print 0 *default-log-port* "Server refused to process request. Server id mismatch. recived " key " expected: " *server-id* ".\nOther arguments recived: cmd=" cmd " params = " params)
- (db:obj->string (conc "Server refused to process request server-id mismatch: " key ", " *server-id*) transport: 'http)))))
-
Index: apimod.scm
==================================================================
--- apimod.scm
+++ apimod.scm
@@ -18,15 +18,315 @@
;;======================================================================
(declare (unit apimod))
(declare (uses commonmod))
+(declare (uses debugprint))
+(declare (uses dbmod))
+(declare (uses dbfile))
+(declare (uses tcp-transportmod))
(module apimod
*
(import scheme chicken data-structures extras)
-(import (prefix sqlite3 sqlite3:) posix typed-records srfi-18)
+(import (prefix sqlite3 sqlite3:) posix matchable typed-records srfi-1 srfi-18 srfi-69 )
(import commonmod)
+(import debugprint)
+(import dbmod)
+(import dbfile)
+(import tcp-transportmod)
+
+;; allow these queries through without starting a server
+;;
+(define api:read-only-queries
+ '(get-key-val-pairs
+ get-var
+ get-keys
+ get-key-vals
+ test-toplevel-num-items
+ get-test-info-by-id
+ get-test-state-status-by-id
+ get-steps-info-by-id
+ get-data-info-by-id
+ test-get-rundir-from-test-id
+ get-count-tests-running-for-testname
+ get-count-tests-running
+ get-count-tests-running-in-jobgroup
+ get-previous-test-run-record
+ get-matching-previous-test-run-records
+ test-get-logfile-info
+ test-get-records-for-index-file
+ get-testinfo-state-status
+ test-get-top-process-pid
+ test-get-paths-matching-keynames-target-new
+ get-prereqs-not-met
+ get-count-tests-running-for-run-id
+ get-run-info
+ get-run-status
+ get-run-state
+ get-run-stats
+ get-run-times
+ get-target
+ get-targets
+ ;; register-run
+ get-tests-tags
+ get-test-times
+ get-tests-for-run
+ get-tests-for-run-state-status
+ get-test-id
+ get-tests-for-runs-mindata
+ get-tests-for-run-mindata
+ get-run-name-from-id
+ get-runs
+ simple-get-runs
+ get-num-runs
+ get-runs-cnt-by-patt
+ get-all-run-ids
+ get-prev-run-ids
+ get-run-ids-matching-target
+ get-runs-by-patt
+ get-steps-data
+ get-steps-for-test
+ read-test-data
+ read-test-data-varpatt
+ login
+ tasks-get-last
+ testmeta-get-record
+ have-incompletes?
+ get-changed-record-ids
+ get-all-runids
+ get-changed-record-test-ids
+ get-changed-record-run-ids
+ get-run-record-ids
+ get-not-completed-cnt))
+
+(define api:write-queries
+ '(
+ get-keys-write ;; dummy "write" query to force server start
+
+ ;; SERVERS
+ ;; start-server
+ ;; kill-server
+
+ ;; TESTS
+ test-set-state-status-by-id
+ delete-test-records
+ delete-old-deleted-test-records
+ test-set-state-status
+ test-set-top-process-pid
+ set-state-status-and-roll-up-items
+
+ update-pass-fail-counts
+ top-test-set-per-pf-counts ;; (db:top-test-set-per-pf-counts (db:get-db *db* 5) 5 "runfirst")
+
+ ;; RUNS
+ register-run
+ set-tests-state-status
+ delete-run
+ lock/unlock-run
+ update-run-event_time
+ mark-incomplete
+ set-state-status-and-roll-up-run
+ ;; STEPS
+ teststep-set-status!
+ delete-steps-for-test
+ ;; TEST DATA
+ test-data-rollup
+ csv->test-data
+
+ ;; MISC
+ sync-cachedb->db
+ drop-all-triggers
+ create-all-triggers
+ update-tesdata-on-repilcate-db
+
+ ;; TESTMETA
+ testmeta-add-record
+ testmeta-update-field
+
+ ;; TASKS
+ tasks-add
+ tasks-set-state-given-param-key
+ ))
+
+(define *db-write-mutexes* (make-hash-table))
+(define *server-signature* #f)
+
+(define *api-threads* '())
+(define (api:register-thread th-in)
+ (set! *api-threads* (cons (cons th-in (current-seconds)) *api-threads*)))
+
+(define (api:unregister-thread th-in)
+ (set! *api-threads* (filter (lambda (thdat)
+ (not (eq? th-in (car thdat))))
+ *api-threads*)))
+
+(define (api:remove-dead-or-terminated)
+ (set! *api-threads* (filter (lambda (thdat)
+ (not (member (thread-state (car thdat)) '(terminated dead))))
+ *api-threads*)))
+
+(define (api:get-count-threads-alive)
+ (length *api-threads*))
+
+(define *api:last-stats-print* 0)
+(define *api-print-db-stats-mutex* (make-mutex))
+(define (api:print-db-stats)
+ (debug:print-info 0 *default-log-port* "Started periodic db stats printer")
+ (let loop ()
+ (mutex-lock! *api-print-db-stats-mutex*)
+ (if (> (- (current-seconds) *api:last-stats-print*) 15)
+ (begin
+ (dbmod:print-db-stats)
+ (set! *api:last-stats-print* (current-seconds))))
+ (mutex-unlock! *api-print-db-stats-mutex*)
+ (thread-sleep! 5)
+ (loop)))
+
+;; QUEUE METHOD
+
+(define *api:queue-mutex* (make-mutex))
+(define *api:queue-id* 0)
+
+(define *api:in-queue* '())
+(define *api:results* (make-hash-table)) ;; id->queue-item
+
+(defstruct api:queue-item
+ (proc #f)
+ (cmd #f)
+ (run-id #f)
+ (params #f)
+ (start-time (current-seconds))
+ (end-time #f)
+ (id #f)
+ (results #f))
+
+;; Add an item to the incoming queue.
+;;
+(define (api:add-queue-item proc cmd run-id params)
+ (mutex-lock! *api:queue-mutex*)
+ (set! *api:queue-id* (+ *api:queue-id* 1))
+ (set! *api:in-queue*
+ (cons (make-api:queue-item
+ proc: proc
+ cmd: cmd
+ run-id: run-id
+ params: params
+ id: *api:queue-id*
+ )
+ *api:in-queue*))
+ (let ((id *api:queue-id*))
+ (mutex-unlock! *api:queue-mutex*)
+ id)) ;; return id so calling proc can find the result in *api:results*
+
+;; get a queue item from the end of the queue.
+;; return #f if there are no items to be processed.
+;;
+(define (api:get-queue-item)
+ (mutex-lock! *api:queue-mutex*)
+ (let* ((res (if (null? *api:in-queue*)
+ #f
+ (let* ((revlist (reverse *api:in-queue*)))
+ (set! *api:in-queue* (reverse (cdr revlist)))
+ (car revlist)))))
+ (mutex-unlock! *api:queue-mutex*)
+ res))
+
+(define (api:put-item-in-results id item)
+ (hash-table-set! *api:results* id item))
+
+(define (api:retrieve-result-item id)
+ (let ((res (hash-table-ref/default *api:results* id #f)))
+ (if res
+ (begin
+ (hash-table-delete! *api:results* id)
+ res)
+ #f)))
+
+;; timeout is in ms, poll less frequently over time
+;;
+;; Yes, it would be better to do this with mailboxes. My last attempt to use
+;; mailboxes resulted in erratic behavior but that was likely due to something
+;; unrelated. Just to eliminate uncertainty we'll start with polling and switch
+;; to mailboxes laters.
+;;
+(define (api:wait-for-result id #!key (timeout 30000))
+ (let loop ((start (current-milliseconds)))
+ (thread-sleep! (let ((delta (- (current-milliseconds) start)))
+ (cond
+ ((< delta 500) 0.01)
+ ((< delta 5000) 0.1)
+ ((< delta 10000) 0.25)
+ (else 1.25))))
+ (let ((res (api:retrieve-result-item id)))
+ (if res
+ (api:queue-item-results res)
+ (loop start)))))
+
+(define (api:queue-run-one)
+ (let* ((item (api:get-queue-item))) ;; this removes it from the in-queue
+ (if item
+ (let* ((id (api:queue-item-id item))
+ (proc (api:queue-item-proc item))
+ (result (proc)))
+ (api:queue-item-end-time-set! item (current-seconds))
+ (api:queue-item-results-set! item result)
+ (api:put-item-in-results id item)))))
+(define (api:queue-processor)
+ (let* ((thproc (lambda ()
+ (let loop ()
+ (api:queue-run-one)
+ (thread-sleep! 0.1)
+ (loop)))))
+ (let loop ((thnum 0))
+ (thread-start! (make-thread thproc (conc "queue-thread-" thnum)))
+ (thread-sleep! 0.05)
+ (if (< thnum 20)
+ (loop (+ thnum 1))))))
+(define (api:tcp-dispatch-request-make-handler-core dbstruct api:dispatch-request)
+ (assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.")
+ (if (not *server-signature*)
+ (set! *server-signature* (tt:mk-signature *toppath*)))
+ (lambda (indat)
+ (let* ((outer-proc (lambda (cmd run-id params)
+ (case cmd
+ ((ping) *server-signature*) ;; but ping in api:dispatch-request is (current-process-id)?
+ (else
+ (let* ((id (api:add-queue-item
+ (lambda ()
+ (api:dispatch-request dbstruct cmd run-id params))
+ cmd run-id params)))
+ (api:wait-for-result id)))))))
+ ;; (set! *api-process-request-count* numthreads)
+ (set! *db-last-access* (current-seconds))
+ (match indat
+ ((cmd run-id params meta)
+ (let* ((start-t (current-milliseconds))
+ ;; factor this out and move before this let, it is just
+ ;; an assert if not ping and dbfname is not correct
+ (db-ok (let* ((dbfname (dbmod:run-id->dbfname run-id))
+ (ok (equal? dbfname (dbr:dbstruct-dbfname dbstruct))))
+ (case cmd
+ ((ping) #t) ;; we are fine
+ (else
+ (assert ok "FATAL: database file and run-id not aligned.")))))
+ (ttdat *server-info*)
+ (server-state (tt-state ttdat))
+ (status 'ok) ;; anything legit we can do with status?
+ (delay-wait 0)
+ (result (if (eq? cmd 'ping)
+ *server-signature* ;; (current-process-id) ;; process id or server-signature?
+ (outer-proc cmd run-id params)))
+ (meta (case cmd
+ ((ping) `((sstate . ,server-state)))
+ (else `((wait . ,delay-wait)))))
+ (errmsg "")
+ (payload (list status errmsg result meta)))
+ ;; (cmd run-id params meta)
+ (db:add-stats cmd run-id params (- (current-milliseconds) start-t))
+ payload))
+ (else
+ (assert #f "FATAL: failed to deserialize indat "indat))))))
)
Index: archive.scm
==================================================================
--- archive.scm
+++ archive.scm
@@ -359,11 +359,11 @@
(archive-dir (if archive-info (cdr archive-info) #f))
(archive-id (if archive-info (car archive-info) -1))
(home-host (server:choose-server *toppath* 'homehost))
(archive-time (seconds->std-time-str (current-seconds)))
(archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time))
- (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db"))
+ (tmp-db-path (conc (dbfile:make-tmpdir-name *toppath* "") "/megatest.db"))
(dbfile (conc archive-staging-db "/megatest.db")))
(create-directory archive-staging-db #t)
(let-values (((pid-val exit-status exit-code) (run-n-wait rsync-exe params: (list "-v" (conc (car home-host) ":"tmp-db-path) archive-staging-db) print-cmd: print-prefix)))
(if (eq? exit-code 0)
(case archiver
@@ -407,11 +407,11 @@
(bup-restore-params (list "-d" archive-path "restore" "-C" *toppath* archive-internal-path)))
(debug:print-info 2 *default-log-port* "Restoring archived data to " *toppath* " from archive in " archive-path " ... " archive-internal-path)
(run-n-wait bup-exe params: bup-restore-params print-cmd: "Running:"))
(sleep 2)
(db:multi-db-sync
- (db:setup #t) ;; (db:setup-db *dbstruct-dbs* *toppath* #f)
+ (db:setup) ;; (db:setup-db *dbstruct-dbs* *toppath* #f)
'killservers
;'dejunk
;'adj-testids
'old2new
)
Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -21,10 +21,11 @@
(declare (unit common))
(declare (uses commonmod))
(declare (uses rmtmod))
(declare (uses debugprint))
(declare (uses mtargs))
+
(use srfi-1 data-structures posix regex-case (prefix base64 base64:)
format dot-locking csv-xml z3 udp ;; sql-de-lite
hostinfo md5 message-digest typed-records directory-utils stack
matchable regex posix (srfi 18) extras ;; tcp
@@ -139,11 +140,11 @@
(define *pkts-info* (make-hash-table)) ;; store stuff like the last parent here
(define *configinfo* #f) ;; raw results from setup, includes toppath and table from megatest.config
(define *runconfigdat* #f) ;; run configs data
(define *configdat* #f) ;; megatest.config data
(define *configstatus* #f) ;; status of data; 'fulldata : all processing done, #f : no data yet, 'partialdata : partial read done
-(define *toppath* #f)
+;; (define *toppath* #f) ;; moved to commonmod
(define *already-seen-runconfig-info* #f)
(define *test-meta-updated* (make-hash-table))
(define *globalexitstatus* 0) ;; attempt to work around possible thread issues
(define *passnum* 0) ;; when running track calls to run-tests or similar
@@ -153,14 +154,10 @@
(define *time-zero* (current-seconds)) ;; for the watchdog
(define *on-exit-procs* '()) ;; add procs to this list to be executed on exit
(define *default-area-tag* "local")
;; DATABASE
-;; (define *dbstruct-dbs* #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this.
-;; db stats
-(define *db-stats* (make-hash-table)) ;; hash of vectors < count duration-total >
-(define *db-stats-mutex* (make-mutex))
;; db access
(define *db-last-access* (current-seconds)) ;; last db access, used in server
;; (define *db-write-access* #t)
;; db sync
;; (define *db-last-sync* 0) ;; last time the sync to megatest.db happened
@@ -179,13 +176,12 @@
(define *transport-type* 'http) ;; override with [server] transport http|rpc|nmsg
(define *runremote* #f) ;; if set up for server communication this will hold
;; (define *max-cache-size* 0)
(define *logged-in-clients* (make-hash-table))
(define *server-id* #f)
-(define *server-info* #f) ;; good candidate for easily convert to non-global
+;; (define *server-info* #f) ;; good candidate for easily convert to non-global
(define *time-to-exit* #f)
-(define *server-run* #t)
(define *run-id* #f)
(define *server-kind-run* (make-hash-table))
(define *home-host* #f)
;; (define *total-non-write-delay* 0)
(define *heartbeat-mutex* (make-mutex))
@@ -247,11 +243,11 @@
(define *common:this-exe-fullpath* (common:get-this-exe-fullpath))
(define *common:this-exe-dir* (pathname-directory *common:this-exe-fullpath*))
(define *common:this-exe-name* (pathname-strip-directory *common:this-exe-fullpath*))
(define (common:get-sync-lock-filepath)
- (let* ((tmp-area (common:get-db-tmp-area))
+ (let* ((tmp-area (common:make-tmpdir-name *toppath* ""))
(lockfile (conc tmp-area "/megatest.db.lock")))
lockfile))
(define *common:logpro-exit-code->status-sym-alist*
'( ( 0 . pass )
@@ -429,12 +425,11 @@
'adj-target
'new2old
'(dejunk)
))
((tcp nfs)
- (debug:print 0 *default-log-port* "WARNING: cleanup-db NOT implemented yet for tcp and nfs.")
- #;(apply db:multi-db-sync
+ (apply db:multi-db-sync
dbstruct
'schema
'killservers
'adj-target
'new2old
@@ -618,11 +613,11 @@
(common:on-homehost?))
(if (common:api-changed?)
(let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config"))
(dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") ".mtdb/main.db"))
(read-only (not (file-write-access? dbfile)))
- (dbstruct (db:setup #t))) ;; (db:setup-db *dbstruct-dbs* *toppath* #f))) ;; #t)))
+ (dbstruct (db:setup))) ;; (db:setup-db *dbstruct-dbs* *toppath* #f))) ;; #t)))
(debug:print 0 *default-log-port*
"WARNING: Version mismatch!\n"
" expected: " (common:version-signature) "\n"
" got: " (common:get-last-run-version))
(cond
@@ -1533,11 +1528,11 @@
;;
(define (common:lazy-modification-time fpath)
(handle-exceptions
exn
(begin
- (debug:print 0 *default-log-port* "Failed to get modifcation time for " fpath ", treating it as zero. exn=" exn)
+ (debug:print 2 *default-log-port* "Failed to get modification time for " fpath ", treating it as zero. exn=" exn)
0)
(if (file-exists? fpath)
(file-modification-time fpath)
0)))
@@ -1649,12 +1644,12 @@
;; (let loop ((x 0))
;; (print x "," (common:get-delay x 1))
;; (if (< x 2)
;; (loop (+ x 0.1)))))
-(define (get-cpu-load #!key (remote-host #f))
- (car (common:get-cpu-load remote-host)))
+;; (define (get-cpu-load #!key (remote-host #f))
+;; (car (common:get-cpu-load remote-host)))
;;======================================================================
;; (let* ((load-res (process:cmd-run->list "uptime"))
;; (load-rx (regexp "load average:\\s+(\\d+)"))
;; (cpu-load #f))
@@ -1666,18 +1661,18 @@
;; (set! cpu-load newval))))))
;; (car load-res))
;; cpu-load))
;;======================================================================
-;; get values from cached info from dropping file in logs dir
+;; get values from cached info from dropping file in .sysdata dir
;; e.g. key is host and dtype is normalized-load
;;
(define (common:get-cached-info key dtype #!key (age 10))
(if *toppath*
(let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log"))
(delfile (lambda (exn)
- (debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn)
+ (debug:print-info 2 *default-log-port* " removing bad file " fullpath ", exn=" exn)
(delete-file* fullpath)
#f)))
(if (and (file-exists? fullpath)
(file-read-access? fullpath))
(handle-exceptions
@@ -2280,11 +2275,11 @@
(define (common:check-db-dir-space)
(let* ((required (string->number
;; default is 1GB (or actually a billion bytes) This is the number of 1 kB blocks.
(or (configf:lookup *configdat* "setup" "dbdir-space-required")
"1000000")))
- (dbdir (common:get-db-tmp-area)) ;; (db:get-dbdir))
+ (dbdir (common:make-tmpdir-name *toppath* "")) ;; (db:get-dbdir))
(tdbspace (common:check-space-in-dir dbdir required))
(mdbspace (common:check-space-in-dir *toppath* required)))
(sort (list tdbspace mdbspace) (lambda (a b)
(< (cadr a)(cadr b))))))
Index: commonmod.scm
==================================================================
--- commonmod.scm
+++ commonmod.scm
@@ -64,10 +64,11 @@
chicken.condition
chicken.file
chicken.file.posix
chicken.io
chicken.pathname
+ chicken.port
chicken.process
chicken.process-context
chicken.process-context.posix
chicken.sort
chicken.string
@@ -83,10 +84,12 @@
srfi-1
srfi-18
srfi-69
typed-records
system-information
+
+ debugprint
)))
;;======================================================================
;; CONTENTS
;;
@@ -132,10 +135,13 @@
(define (client:get-signature)
(if *my-client-signature* *my-client-signature*
(let ((sig (conc (get-host-name) " " (current-process-id))))
(set! *my-client-signature* sig)
*my-client-signature*)))
+
+(define *server-info* #f)
+(define *toppath* #f)
;;======================================================================
;; config file utils
;;======================================================================
@@ -160,10 +166,17 @@
'()))) ;; should it return empty list or #f to indicate not set?
(define (get-section cfgdat section)
(hash-table-ref/default cfgdat section '()))
+
+(define (common:make-tmpdir-name areapath tmpadj)
+ (let* ((area (pathname-file areapath))
+ (dname (conc "/tmp/"(current-user-name)"/megatest_localdb/" area "/" (string-translate areapath "/" ".") tmpadj "/.mtdb")))
+ (unless (directory-exists? dname)
+ (create-directory dname #t))
+ dname))
;; dot-locking egg seems not to work, using this for now
;; if lock is older than expire-time then remove it and try again
;; to get the lock
;;
@@ -287,13 +300,16 @@
(filter (lambda (x)
(not (string-match "^\\s*" x)))
val-list))
'())))
-(define (get-cpu-load)
- (let* ((load-info (with-input-from-file "/proc/loadavg" read-lines)))
- (map string->number (string-split load-info))))
+(define (commonmod:get-cpu-load)
+ (let* ((load-info (with-input-from-file "/proc/loadavg" read-lines))
+ (res (map string->number (string-split (car load-info)))))
+ (if (null? res)
+ #f ;; something is wrong
+ (car res))))
(define *current-host-cores* #f)
(define (get-current-host-cores)
(or *current-host-cores*
@@ -315,11 +331,11 @@
(string->number (read-line)))))
;; get the normalized (i.e. load / numcpus) for *this* host
;;
(define (get-normalized-cpu-load)
- (/ (get-cpu-load)(get-current-host-cores)))
+ (/ (commonmod:get-cpu-load)(get-current-host-cores)))
;;======================================================================
;; testsuite and area utilites
;;======================================================================
@@ -399,14 +415,13 @@
((d) 86400)
((w) 604800)
((M) 2628000) ;; aproximately one month
((y) 31536000)
(else
- 0)))))))
- ;; (print "ERROR: can't parse timestring "tstr", component "part)
- ;; can't (yet) use debugprint. rely on -show-config for user to find errors
- )))
+ 0)))))
+ (debug:print 0 *default-log-port* "ERROR: can't parse timestring "tstr", component "part", string: "(cadr match))))
+ (debug:print 0 *default-log-port* "ERROR: can't parse timestring "tstr", component "part))))
parts)
time-secs))
(define (seconds->hr-min-sec secs)
(let* ((hrs (quotient secs 3600))
Index: configf.scm
==================================================================
--- configf.scm
+++ configf.scm
@@ -31,10 +31,15 @@
(declare (uses mtargs))
(declare (uses mtargs.import))
(declare (uses common))
(declare (uses commonmod))
(declare (uses commonmod.import))
+(declare (uses dbfile))
+(declare (uses dbfile.import))
+(declare (uses dbmod))
+(declare (uses dbmod.import))
+
(import commonmod
(prefix mtargs args:)
debugprint)
(include "common_records.scm")
Index: dashboard-tests.scm
==================================================================
--- dashboard-tests.scm
+++ dashboard-tests.scm
@@ -23,10 +23,11 @@
;;======================================================================
(declare (unit dashboard-tests))
(declare (uses common))
(declare (uses commonmod))
+(declare (uses dcommon))
(declare (uses db))
(declare (uses gutils))
(declare (uses rmt))
(declare (uses ezsteps))
(declare (uses subrun))
@@ -463,11 +464,11 @@
;;======================================================================
;;
;;======================================================================
(define (dashboard-tests:examine-test run-id test-id) ;; run-id run-key origtest)
- (let* ((db-path (db:dbfile-path)) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/db/" run-id ".db"))
+ (let* ((db-path (common:make-tmpdir-name *toppath* "")) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/db/" run-id ".db"))
(dbstruct #f) ;; NOT USED
(testdat (rmt:get-test-info-by-id run-id test-id)) ;; (db:get-test-info-by-id dbstruct run-id test-id))
(db-mod-time 0) ;; (file-modification-time db-path))
(last-update 0) ;; (current-seconds))
(request-update #t))
ADDED dashboard-transport-mode.scm
Index: dashboard-transport-mode.scm
==================================================================
--- /dev/null
+++ dashboard-transport-mode.scm
@@ -0,0 +1,22 @@
+;;======================================================================
+;; set up transport, db cache and sync methods
+;;
+;; sync-method: 'original, 'attach or 'none
+;; cache-method: 'tmp or 'none
+;; rmt:transport-mode: 'http, 'tcp, 'nfs
+;;
+;; NOTE: NOT ALL COMBINATIONS WORK
+;;
+;;======================================================================
+
+;; uncomment this block to test without tcp or cachedb
+;; (dbfile:sync-method 'none)
+;; (dbfile:cache-method 'none)
+;; (rmt:transport-mode 'nfs)
+
+;; uncomment this block to test with tcp and cachedb
+(dbfile:sync-method 'none) ;; original was causing crash on start.
+(dbfile:cache-method 'none)
+(rmt:transport-mode 'nfs)
+
+
Index: dashboard-transport-mode.scm.template
==================================================================
--- dashboard-transport-mode.scm.template
+++ dashboard-transport-mode.scm.template
@@ -13,10 +13,10 @@
;; (dbfile:sync-method 'none)
;; (dbfile:cache-method 'none)
;; (rmt:transport-mode 'nfs)
;; uncomment this block to test with tcp and cachedb
-(dbfile:sync-method 'attach) ;; original was causing crash on start.
+(dbfile:sync-method 'none) ;; original was causing crash on start.
(dbfile:cache-method 'none)
(rmt:transport-mode 'nfs)
Index: dashboard.scm
==================================================================
--- dashboard.scm
+++ dashboard.scm
@@ -36,12 +36,14 @@
(declare (uses dashboard-context-menu))
(declare (uses vg))
(declare (uses subrun))
(declare (uses mt))
(declare (uses dbmod))
-(declare (uses rmtmod))
(declare (uses dbfile))
+(declare (uses dbfile.import))
+(declare (uses rmtmod))
+(declare (uses rmtmod.import))
(declare (uses commonmod))
(declare (uses commonmod.import))
(use format)
@@ -74,10 +76,12 @@
;; executables such as dashboard and mtutil
;;
(include "dashboard-transport-mode.scm")
(dbfile:db-init-proc db:initialize-main-db)
(set! rmtmod:send-receive rmt:send-receive)
+
+(debug:print-info 0 *default-log-port* "transport-mode="(rmt:transport-mode))
(define help (conc
"Megatest Dashboard, documentation at http://www.kiatoa.com/fossils/megatest version " megatest-version
" license GPL, Copyright (C) Matt Welland 2012-2017
@@ -117,12 +121,12 @@
args:arg-hash
0))
(if (args:get-arg "-mode")
(let* ((mode (string->symbol (args:get-arg "-mode"))))
- (rmt:transport-mode mode))
- (rmt:transport-mode 'tcp))
+ (rmt:transport-mode mode)))
+;; (rmt:transport-mode 'tcp))
(if (args:get-arg "-test") ;; need to use tcp for test control panel
(rmt:transport-mode 'tcp))
;; RA => Might require revert for filters
@@ -139,33 +143,35 @@
;; (hash-table-set! args:arg-hash "-use-db-cache" #t)));;;)
;;)
;; data common to all tabs goes here
;;
-(defstruct dboard:commondat
- ((curr-tab-num 0) : number)
- please-update
- tabdats
- update-mutex
- updaters
- updating
- uidat ;; needs to move to tabdat at some time
- hide-not-hide-tabs
- target
- )
-
-(define (dboard:commondat-make)
- (make-dboard:commondat
- curr-tab-num: 0
- tabdats: (make-hash-table)
- please-update: #t
- update-mutex: (make-mutex)
- updaters: (make-hash-table)
- updating: #f
- hide-not-hide-tabs: #f
- target: ""
- ))
+;; Moved to dcommon.scm
+;;
+;; (defstruct dboard:commondat
+;; ((curr-tab-num 0) : number)
+;; please-update
+;; tabdats
+;; update-mutex
+;; updaters
+;; updating
+;; uidat ;; needs to move to tabdat at some time
+;; hide-not-hide-tabs
+;; target
+;; )
+;;
+;; (define (dboard:commondat-make)
+;; (make-dboard:commondat
+;; curr-tab-num: 0
+;; tabdats: (make-hash-table)
+;; please-update: #t
+;; update-mutex: (make-mutex)
+;; updaters: (make-hash-table)
+;; updating: #f
+;; hide-not-hide-tabs: #f
+;; target: ""
+;; ))
;;======================================================================
;; buttons color using image
;;======================================================================
@@ -208,39 +214,17 @@
;; (iup:attribute-set! img1 "2" "255 0 0")
(hash-table-set! images name img1)
name)))
-;; RA => returns the tabdat stored at hashkey passed in commondat-tabdats table (e.g. 0 gives summary)
-;;
-(define (dboard:common-get-tabdat commondat #!key (tab-num #f))
- (let* ((tnum (or tab-num
- (dboard:commondat-curr-tab-num commondat)
- 0)) ;; tab-num value is curr-tab-num value in passed commondat
- (ht (dboard:commondat-tabdats commondat))
- (res (hash-table-ref/default ht tnum #f)))
- (or res
- (let ((new-tabdat (dboard:tabdat-make-data)))
- (hash-table-set! ht tnum new-tabdat)
- new-tabdat))))
-
-;; RA => sets the tabdat passed to the hashkey at commondat:tabdats hash table
-;;
-(define (dboard:common-set-tabdat! commondat tabnum tabdat)
- (hash-table-set!
- (dboard:commondat-tabdats commondat)
- tabnum
- tabdat))
-
;; gets and calls updater list based on curr-tab-num
;;
(define (dboard:common-run-curr-updaters commondat #!key (tab-num #f))
;; (sync-db-to-tmp (dboard:common-get-tabdat commondat tab-num: tab-num)) ;; no longer applies
;; maybe need sleep here?
-
(if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat
(let* ((tnum (or tab-num (dboard:commondat-curr-tab-num commondat)))
(updaters (hash-table-ref/default (dboard:commondat-updaters commondat)
tnum
'())))
@@ -402,12 +386,12 @@
(dboard:setup-tabdat dat)
(dboard:setup-num-rows dat)
dat))
(define (dboard:setup-tabdat tabdat)
- (dboard:tabdat-dbdir-set! tabdat (db:dbfile-path)) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/.db"))
- (dboard:tabdat-dbfpath-set! tabdat (db:dbfile-path))
+ (dboard:tabdat-dbdir-set! tabdat (common:make-tmpdir-name *toppath* "")) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/.db"))
+ (dboard:tabdat-dbfpath-set! tabdat (common:make-tmpdir-name *toppath* ""))
(dboard:tabdat-monitor-db-path-set! tabdat (conc (dboard:tabdat-dbdir tabdat) "/monitor.db"))
;; HACK ALERT: this is a hack, please fix.
(dboard:tabdat-ro-set! tabdat (not (file-read-access? (dboard:tabdat-dbfpath tabdat))))
@@ -671,11 +655,11 @@
;;
(define (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals)
(let* ((start-time (current-seconds))
(access-mode (dboard:tabdat-access-mode tabdat))
(num-to-get (string->number (or (configf:lookup *configdat* "setup" "num-tests-to-get")
- "200")))
+ "1000")))
(states (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat)))
(statuses (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat)))
(do-not-use-db-file-timestamps #f) ;; (configf:lookup *configdat* "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab
(do-not-use-query-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-query-timestamps")) ;; this no longer troubles runs-summary-tab
(sort-info (get-curr-sort))
@@ -749,16 +733,16 @@
(for-each
(lambda (tdat)
(let ((test-id (db:test-get-id tdat))
(state (db:test-get-state tdat)))
- (dboard:rundat-data-changed-set! run-dat #t)
- (if (equal? state "DELETED")
- (hash-table-delete! tests-ht test-id)
- (hash-table-set! tests-ht test-id tdat))))
- tmptests)
-
+ (dboard:rundat-data-changed-set! run-dat #t)
+ (if (equal? state "DELETED")
+ (hash-table-delete! tests-ht test-id)
+ (hash-table-set! tests-ht test-id tdat))))
+ tmptests)
+
tests-ht))
;; tmptests - new tests data
;; prev-tests - old tests data
;;
@@ -853,10 +837,16 @@
(loop run tal new-res newmaxtests) ;; not done getting data for this run
(loop (car tal)(cdr tal) new-res newmaxtests)))))))
(dboard:tabdat-filters-changed-set! tabdat #f)
(dboard:update-tree tabdat runs-hash header tb)))
+
+(define *dashboard-last-run-id-update* (make-hash-table)) ;; id => seconds
+
+(define (dboard:clear-run-id-update-hash)
+ (hash-table-clear! *dashboard-last-run-id-update*))
+
;; this calls dboard:get-tests-for-run-duplicate for each run
;;
;; create a virtual table of all the tests
;; keypatts: ( (KEY1 "abc%def")(KEY2 "%") )
;;
@@ -889,63 +879,82 @@
(dboard:tabdat-item-test-names-set! tabdat '())
(hash-table-clear! (dboard:tabdat-allruns-by-id tabdat)))
(let loop ((run (car runs))
(tal (cdr runs))
(res '())
- (maxtests 0))
+ (maxtests 0)
+ (cont-run #f))
(let* ((run-id (db:get-value-by-header run header "id"))
+ (recently-done (< (- (current-seconds)
+ (hash-table-ref/default *dashboard-last-run-id-update* run-id 0)) 1))
(run-struct (hash-table-ref/default (dboard:tabdat-allruns-by-id tabdat) run-id #f))
;; (last-update (if run-struct (dboard:rundat-last-update run-struct) 0))
(key-vals (rmt:get-key-vals run-id))
- (tests-ht (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals))
+ (tests-ht (let* ((tht (if (and recently-done run-struct)
+ (let ((rht (dboard:rundat-tests run-struct))) ;; (dboard:tabdat-allruns-by-id tabdat)))
+ (or rht
+ (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals)))
+ (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals))))
+ (assert (hash-table? tht) "FATAL: But here tht should be a hash-table")
+ tht))
;; GET RID OF dboard:get-tests-dat - it is superceded by dboard:get-tests-for-run-duplicate
;; dboard:get-tests-for-run-duplicate - returns a hash table
;; (dboard:get-tests-dat tabdat run-id last-update))
(all-test-ids (hash-table-keys tests-ht))
- (num-tests (length all-test-ids)))
- ;; (print "run-struct: " run-struct)
- ;; NOTE: bubble-up also sets the global (dboard:tabdat-item-test-names tabdat)
- ;; (tests (bubble-up tmptests priority: bubble-type))
- ;; NOTE: 11/01/2013 This routine is *NOT* getting called excessively.
- ;; (debug:print 0 *default-log-port* "Getting data for run " run-id " with key-vals=" key-vals)
- ;; Not sure this is needed?
- (let* ((newmaxtests (max num-tests maxtests))
- ;; (last-update (- (current-seconds) 10))
- (run-struct (or run-struct
- (dboard:rundat-make-init
- run: run
- tests: tests-ht
- key-vals: key-vals)))
- (new-res (if (null? all-test-ids)
- res
- (delete-duplicates
- (cons run-struct res)
- (lambda (a b)
- (eq? (db:get-value-by-header (dboard:rundat-run a) header "id")
- (db:get-value-by-header (dboard:rundat-run b) header "id"))))))
- (elapsed-time (- (current-seconds) start-time)))
- (if (null? all-test-ids)
+ (num-tests (length all-test-ids))
+ ;; (print "run-struct: " run-struct)
+ ;; NOTE: bubble-up also sets the global (dboard:tabdat-item-test-names tabdat)
+ ;; (tests (bubble-up tmptests priority: bubble-type))
+ ;; NOTE: 11/01/2013 This routine is *NOT* getting called excessively.
+ ;; (debug:print 0 *default-log-port* "Getting data for run " run-id " with key-vals=" key-vals)
+ ;; Not sure this is needed?
+ (newmaxtests (max num-tests maxtests))
+ ;; (last-update (- (current-seconds) 10))
+ (run-struct (or run-struct
+ (dboard:rundat-make-init
+ run: run
+ tests: tests-ht
+ key-vals: key-vals)))
+ (new-res (if (null? all-test-ids)
+ res
+ (delete-duplicates
+ (cons run-struct res)
+ (lambda (a b)
+ (eq? (db:get-value-by-header (dboard:rundat-run a) header "id")
+ (db:get-value-by-header (dboard:rundat-run b) header "id"))))))
+ (elapsed-time (- (current-seconds) start-time)))
+ (if (null? all-test-ids)
(hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id)
(hash-table-set! (dboard:tabdat-allruns-by-id tabdat) run-id run-struct))
- (if (or (null? tal)
- (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update
- (begin
- (when (> elapsed-time 2)
- (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")
- (let* ((old-val (iup:attribute *tim* "TIME"))
- (new-val (number->string (inexact->exact (floor (* 2 (string->number old-val)))))))
- (if (< (string->number new-val) 5000)
- (begin
- (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
- (iup:attribute-set! *tim* "TIME" new-val)))))
- (dboard:tabdat-allruns-set! tabdat new-res)
- maxtests)
- (if (> (dboard:rundat-run-data-offset run-struct) 0)
- (loop run tal new-res newmaxtests) ;; not done getting data for this run
- (loop (car tal)(cdr tal) new-res newmaxtests)))))))
- (dboard:tabdat-filters-changed-set! tabdat #f)
- (dboard:update-tree tabdat runs-hash header tb)))
+
+ (if (or (null? tal)
+ (> elapsed-time 2)) ;; stop loading data after 5
+ ;; seconds, on the next call
+ ;; more data *should* be
+ ;; loaded since
+ ;; get-tests-for-run uses last
+ ;; update
+ (begin
+ (when (> elapsed-time 2)
+ (debug:print 2 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")
+ (let* ((old-val (iup:attribute *tim* "TIME"))
+ (new-val (number->string (inexact->exact (floor (* 2 (string->number old-val)))))))
+ (if (< (string->number new-val) 5000)
+ (begin
+ (debug:print 2 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
+ (iup:attribute-set! *tim* "TIME" new-val)))))
+ (dboard:tabdat-allruns-set! tabdat new-res)
+ maxtests)
+ (if (> (dboard:rundat-run-data-offset run-struct) 0)
+ (begin
+ (thread-sleep! 0.2) ;; let the gui re-draw
+ (loop run tal new-res newmaxtests #t)) ;; not done getting data for this run
+ (begin
+ (hash-table-set! *dashboard-last-run-id-update* run-id (current-seconds))
+ (loop (car tal)(cdr tal) new-res newmaxtests #f)))))))
+ (dboard:tabdat-filters-changed-set! tabdat #f)
+ (dboard:update-tree tabdat runs-hash header tb)))
(define *collapsed* (make-hash-table))
(define (toggle-hide lnum uidat) ; fulltestname)
(let* ((btn (vector-ref (dboard:uidat-get-lftcol uidat) lnum))
@@ -1153,15 +1162,15 @@
(drop (dboard:tabdat-all-test-names tabdat)
(dboard:tabdat-start-test-offset tabdat))
'())))
(append xl (make-list (- (dboard:tabdat-num-tests tabdat) (length xl)) ""))))
(update-labels uidat (dboard:tabdat-all-test-names tabdat))
- (for-each
+ (for-each ;;run
(lambda (rundat)
- ;; if rundat is junk clobber it with a decent placeholder
(if (or (not rundat) ;; handle padded runs
(not (dboard:rundat-run rundat)))
+ ;; Need to put an empty column in to erase previous contents.
(set! rundat (dboard:rundat-make-init
key-vals: (map (lambda (x) "")(dboard:tabdat-keys tabdat)))))
(let* ((run (dboard:rundat-run rundat))
(testsdat-by-name (dboard:rundat-tests-by-name rundat))
(key-val-dat (dboard:rundat-key-vals rundat))
@@ -1168,23 +1177,22 @@
(run-id (db:get-value-by-header run (dboard:tabdat-header tabdat) "id"))
(key-vals (append key-val-dat
(list (let ((x (db:get-value-by-header run (dboard:tabdat-header tabdat) "runname")))
(if (string? x) x "")))))
(run-key (string-intersperse key-vals "\n")))
-
+
;; fill in the run header key values
;;
- (let ((rown 0)
+ (let ((rown 0)
(headercol (vector-ref tableheader coln)))
(for-each (lambda (kval)
(let* ((labl (vector-ref headercol rown)))
(if (not (equal? kval (iup:attribute labl "TITLE")))
(iup:attribute-set! (vector-ref headercol rown) "TITLE" kval))
(set! rown (+ rown 1))))
key-vals))
-
- ;; For this run now fill in the buttons for each test
+ ;; For this run now fill in the buttons for each test
;;
(let ((rown 0)
(columndat (vector-ref table coln)))
(for-each
(lambda (testname)
@@ -1197,17 +1205,12 @@
;; testsdat)))
(if (not matching)
(vector -1 -1 "" "" "" 0 "" "" 0 "" "" "" 0 "" "")
;; (car matching))))
matching)))
- (testname (db:test-get-testname testdat))
- (itempath (db:test-get-item-path testdat))
- (testfullname (test:test-get-fullname testdat))
(teststatus (db:test-get-status testdat))
(teststate (db:test-get-state testdat))
- ;;(teststart (db:test-get-event_time test))
- ;;(runtime (db:test-get-run_duration test))
(buttontxt (cond
((member teststate '("COMPLETED" "ARCHIVED")) teststatus)
((and (equal? teststate "NOT_STARTED")
(member teststatus '("ZERO_ITEMS" "BLOCKED" "PREQ_FAIL" "PREQ_DISCARDED" "TIMED_OUT" "KEEP_TRYING" "TEN_STRIKES")))
teststatus)
@@ -1402,11 +1405,15 @@
tp)))
(states (dboard:tabdat-states tabdat))
(statuses (dboard:tabdat-statuses tabdat))
(target (let ((targ-list (dboard:tabdat-target tabdat)))
(if targ-list (string-intersperse targ-list "/") "no-target-selected")))
- (run-name (dboard:tabdat-run-name tabdat))
+ (run-name (let ((run-input (dboard:tabdat-run-name tabdat))
+ )
+ (if (equal? run-input "")
+ "no-runname-specified"
+ run-input)))
(states-str (if (or (not states)
(null? states))
""
(conc " -state " (string-intersperse states ","))))
(statuses-str (if (or (not statuses)
@@ -2404,20 +2411,21 @@
)
)) "runs-summary-click-callback"))))
(runs-summary-updater
(lambda ()
- (mutex-lock! update-mutex)
+ ;; (mutex-lock! update-mutex)
(if (or (dashboard:database-changed? commondat tabdat context-key: 'runs-summary-updater)
(dboard:tabdat-view-changed tabdat))
(debug:catch-and-dump
(lambda () ;; check that run-matrix is initialized before calling the updater
(if run-matrix
(dashboard:runs-summary-updater commondat tabdat tb cell-lookup run-matrix)))
"dashboard:runs-summary-updater")
)
- (mutex-unlock! update-mutex)))
+ #;(mutex-unlock! update-mutex)
+ ))
(runs-summary-control-panel (dashboard:runs-summary-control-panel tabdat))
)
(dboard:commondat-add-updater commondat runs-summary-updater tab-num: tab-num)
(dboard:tabdat-runs-tree-set! tabdat tb)
(iup:vbox
@@ -2460,11 +2468,11 @@
(iup:vbox
(iup:textbox #:size "120x15" #:fontsize "10" #:value "%"
#:expand "NO"
#:action (lambda (obj unk val)
(debug:catch-and-dump
- (lambda ()
+ (lambda ()57
(mark-for-update tabdat)
(update-search commondat tabdat "test-name" val))
"make-controls")))
(iup:hbox
(iup:button "Quit" #:action (lambda (obj)
@@ -2479,10 +2487,11 @@
(dboard:tabdat-allruns-by-id-set! tabdat (make-hash-table))
(dboard:tabdat-done-runs-set! tabdat '())
(dboard:tabdat-not-done-runs-set! tabdat '())
(dboard:tabdat-view-changed-set! tabdat #t)
(dboard:commondat-please-update-set! commondat #t)
+ (dboard:clear-run-id-update-hash)
(mark-for-update tabdat))
#:expand "NO" #:size "40x15")
(iup:button "Collapse" #:action (lambda (obj)
(debug:catch-and-dump
(lambda ()
@@ -3112,11 +3121,11 @@
(debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: "
((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir ", exn=" exn)
(current-seconds)) ;; something went wrong - just print an error and return current-seconds
(common:max (map (lambda (filen)
(file-modification-time filen))
- (glob (conc dbdir "/*.db*"))))))
+ (cons (conc dbdir "/main.db") (glob (conc dbdir "/?.db")))))))
(define (dashboard:monitor-changed? commondat tabdat)
(let* ((run-update-time (current-seconds))
(monitor-db-path (dboard:tabdat-monitor-db-path tabdat))
(monitor-modtime (if (and monitor-db-path (common:file-exists? monitor-db-path))
@@ -3137,11 +3146,11 @@
(hash-table-set! (dboard:tabdat-last-db-update tabdat) context newtime))
;;
(define (dashboard:database-changed? commondat tabdat #!key (context-key 'default))
(let* ((run-update-time (current-seconds))
- (dbdir (conc *toppath* "/.mtdb"`))
+ (dbdir (conc *toppath* "/.mtdb"))
(modtime (dashboard:get-youngest-run-db-mod-time dbdir))
(recalc (dashboard:recalc modtime
(dboard:commondat-please-update commondat)
(dboard:get-last-db-update tabdat context-key))))
(if recalc
@@ -3344,14 +3353,14 @@
(vch (dboard:tabdat-view-changed tabdat)))
(if (and cnv dwg vch)
(begin
(vg:drawing-xoff-set! dwg (dboard:tabdat-xadj tabdat))
(vg:drawing-yoff-set! dwg (dboard:tabdat-yadj tabdat))
- (mutex-lock! mtx)
+ ;; (mutex-lock! mtx)
(canvas-clear! cnv)
(vg:draw dwg tabdat)
- (mutex-unlock! mtx)
+ ;; (mutex-unlock! mtx)
(dboard:tabdat-view-changed-set! tabdat #f)))))
;; doesn't work.
;;
;;(define (gotoescape tabdat escape)
@@ -3631,17 +3640,17 @@
(graph-uly (- (calc-y 0) canvas-margin))
(sec-per-50pt (/ 50 timescale))
)
;; (print "timeoffset: " timeoffset " timescale: " timescale " run-duration: " (seconds->hr-min-sec run-duration) " width: " width " sec-per-50pt: " sec-per-50pt)
;; (print "timescale: " timescale " timeoffset: " timeoffset " sizex: " sizex " originx: " originx)
- (mutex-lock! mtx)
+ ;; (mutex-lock! mtx)
(vg:add-comp-to-lib runslib run-full-name runcomp)
;; Have to keep moving the instantiated box as it is anchored at the lower left
;; this should have worked for x in next statement? (maptime run-start)
;; add 60 to make room for the graph
(vg:instantiate drawing "runslib" run-full-name run-full-name 8 (- (calc-y curr-run-start-row) (+ 5 graph-height run-to-run-margin)))
- (mutex-unlock! mtx)
+ ;; (mutex-unlock! mtx)
;; (set! run-start-row (+ max-row 2))
;; (dboard:tabdat-start-row-set! tabdat (+ new-run-start-row 1))
;; get tests in list sorted by event time ascending
(let testsloop ((test-ids (car hierdat)) ;; loop on tests (NOTE: not items!)
(tests-tal (cdr hierdat))
@@ -3742,13 +3751,13 @@
(outln (vg:make-rect-obj -5 lly ulx uly
text: run-full-name
line-color: (vg:rgb->number 255 0 255 a: 128))))
; (vg:components-get-extents d1 c1)))
;; this is the box around the run
- (mutex-lock! mtx)
+ ;; (mutex-lock! mtx)
(vg:add-obj-to-comp runcomp outln)
- (mutex-unlock! mtx)
+ ;; (mutex-unlock! mtx)
;; this is where we have enough info to place the graph
(dboard:graph commondat tabdat tab-num -5 (+ uly 10) ulx (+ uly graph-height 3) run-start run-end timescale maptime run-full-name canvas-margin)
(dboard:tabdat-max-row-set! tabdat (+ (dboard:tabdat-max-row tabdat)(quotient (+ graph-height 40 3) row-height)))
;; (vg:instance-move drawing run-full-name 0 (dboard:tabdat-max-row tabdat))
))
@@ -3887,22 +3896,22 @@
;; tab-num: 2)
(iup:callback-set! *tim*
"ACTION_CB"
(lambda (time-obj)
(let ((update-is-running #f))
- (mutex-lock! (dboard:commondat-update-mutex commondat))
- (set! update-is-running (dboard:commondat-updating commondat))
- (if (not update-is-running)
- (dboard:commondat-updating-set! commondat #t))
- (mutex-unlock! (dboard:commondat-update-mutex commondat))
- (if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
+ ;; (mutex-lock! (dboard:commondat-update-mutex commondat))
+ (set! update-is-running (dboard:commondat-updating commondat))
+ (if (not update-is-running)
+ (dboard:commondat-updating-set! commondat #t))
+ ;; (mutex-unlock! (dboard:commondat-update-mutex commondat))
+ (if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
(begin
(dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)
- (mutex-lock! (dboard:commondat-update-mutex commondat))
+ ;; (mutex-lock! (dboard:commondat-update-mutex commondat))
(dboard:commondat-updating-set! commondat #f)
- (mutex-unlock! (dboard:commondat-update-mutex commondat)))
- ))
+ ;; (mutex-unlock! (dboard:commondat-update-mutex commondat))
+ )))
1))))
;; (debug:print 0 *default-log-port* "Starting updaters")
(let ((th1 (make-thread (lambda ()
(thread-sleep! 1)
(dboard:common-run-curr-updaters commondat 0) ;; force update of summary tab
@@ -3923,11 +3932,11 @@
(define (sync-db-to-tmp tabdat)
(let* ((db-file "./.mtdb/main.db"))
(if (and (not (file-write-access? db-file)) ( > (current-seconds) (+ last-copy-time 5)))
(begin
- (db:multi-db-sync (db:setup #f) 'old2new)
+ (db:multi-db-sync (db:setup) 'old2new)
(set! last-copy-time (current-seconds))
)
)
)
)
@@ -3944,11 +3953,14 @@
(exit 1))))
'("MT_RUN_AREA_HOME" "MT_MEGATEST" "MT_CMDINFO" "MT_TEST_RUN_DIR" "MT_LINKTREE" "MT_TESTSUITENAME"))
)
)
-(setenv "MT_RUN_AREA_HOME" (get-environment-variable "PWD"))
+;; This is NOT good
+;; (setenv "MT_RUN_AREA_HOME" (get-environment-variable "PWD"))
+;; This should be OK but it really should not be necessary
+(setenv "MT_RUN_AREA_HOME" (current-directory))
(if (not (null? remargs))
(if remargs
(begin
(debug:print 0 *default-log-port* "Unrecognised arguments: " (string-intersperse remargs " "))
Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -131,15 +131,15 @@
(debug:print-error 0 *default-log-port* " query " stmt " failed, params: " params ", error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn)
(print-call-chain (current-error-port))
default)))
(apply sqlite3:first-result db stmt params)))
-(define (db:setup do-sync)
+(define (db:setup)
(assert *toppath* "FATAL: db:setup called before launch:setup has been run.")
- (let* ((tmpdir (common:get-db-tmp-area)))
+ (let* ((tmpdir (common:make-tmpdir-name *toppath* "")))
(if (not *dbstruct-dbs*)
- (dbfile:setup do-sync *toppath* tmpdir)
+ (dbfile:setup (conc *toppath* "/.mtdb") tmpdir)
*dbstruct-dbs*)))
;; moved from dbfile
;;
;; ADD run-id SUPPORT
@@ -267,17 +267,10 @@
", arguments: " ((condition-property-accessor 'exn 'arguments) exn)
", location: " ((condition-property-accessor 'exn 'location) exn)
))
-;; NB// #f => return dbdir only
-;; (was planned to be; zeroth db with name=main.db)
-;;
-;; If run-id is #f return to create and retrieve the path where the db will live.
-;;
-(define db:dbfile-path common:get-db-tmp-area)
-
(define (db:set-sync db)
(let ((syncprag (configf:lookup *configdat* "setup" "sychronous")))
(sqlite3:execute db (conc "PRAGMA synchronous = " (or syncprag 0) ";"))))
@@ -467,11 +460,11 @@
(get-mtime shm-file))))
;; (define (db:all-db-sync dbstruct)
;; (let* ((dbdat (db:open-db dbstruct #f db:initialize-main-db))
;; (data-synced 0) ;; count of changed records
-;; (tmp-area (common:get-db-tmp-area))
+;; (tmp-area (common:make-tmpdir-name *toppath*))
;; (dbfiles (glob (conc tmp-area"/.mtdb/*.db")))
;; (sync-durations (make-hash-table))
;; (no-sync-db (db:open-no-sync-db)))
;; (for-each
;; (lambda (file) ;; tmp db file
@@ -528,23 +521,63 @@
;; (if dbdat (dbfile:add-dbdat dbstruct #f dbdat))
;; )
;; #t)
(define (db:kill-servers)
- (let* ((servers (server:choose-server *toppath* 'all-valid))) ;; (server:get-list *toppath*))
- (for-each
- (lambda (server)
- (handle-exceptions
- exn
- (begin
- (debug:print-info 0 *default-log-port* "Unable to get host and/or port from " server ", exn=" exn)
- #f)
- (match-let (((mod-time host port start-time server-id pid) server))
- (if (and host pid)
- (tasks:kill-server host pid)))))
- servers)
- (delete-file* (common:get-sync-lock-filepath))))
+ (let* ((tl (launch:setup)) ;; need this to initialize *toppath*
+ (servdir (conc *toppath* "/.servinfo"))
+ (servfiles (glob (conc servdir "/*:*.db")))
+ (fmtstr "~10a~22a~10a~25a~25a~8a\n")
+ (dbfiles (append (glob (conc *toppath* "/.mtdb/main.db")) (glob (conc *toppath* "/.mtdb/?.db"))(glob (conc *toppath* "/.mtdb/??.db"))))
+ (ttdat (make-tt areapath: *toppath*))
+ )
+ (format #t fmtstr "DB" "host:port" "PID" "age" "last mod" "state")
+ (for-each
+ (lambda (dbfile)
+ (let* (
+ (dbfname (conc (pathname-file dbfile) ".db"))
+ (sfiles (tt:find-server *toppath* dbfname))
+ )
+ (for-each
+ (lambda (sfile)
+ (let (
+ (sinfos (tt:get-server-info-sorted ttdat dbfname))
+ )
+ (for-each
+ (lambda (sinfo)
+ (let* (
+ (db (list-ref sinfo 5))
+ (pid (list-ref sinfo 4))
+ (host (list-ref sinfo 0))
+ (port (list-ref sinfo 1))
+ (server-id (list-ref sinfo 3))
+ (age (seconds->hr-min-sec (- (current-seconds) (list-ref sinfo 2))))
+ (last-mod (seconds->string (list-ref sinfo 2)))
+ (killed (system (conc "ssh " host " kill " pid " > /dev/null")))
+ (dummy2 (sleep 1))
+ (state (if (> (system (conc "ssh " host " ps " pid " > /dev/null")) 0) "dead" "alive"))
+ )
+ (format #t fmtstr db (conc host ":" port) pid age last-mod state)
+ (system (conc "rm " sfile))
+ )
+ )
+ sinfos
+ )
+ )
+ )
+ sfiles
+ )
+ )
+ )
+ dbfiles
+ )
+ ;; remove this db, because otherwise metadata contains records for old servers, and this causes a problem with db:no-sync-get-lock-with-id.
+ (if (file-exists? (conc *toppath* "/.mtdb/no-sync.db"))
+ (delete-file (conc *toppath* "/.mtdb/no-sync.db"))
+ )
+ )
+)
;; options:
;;
;; 'killservers - kills all servers
;; 'dejunk - removes junk records
@@ -556,34 +589,34 @@
;; run-ids: '(1 2 3 ...) or #f (for all)
;;
(define (db:multi-db-sync dbstruct . options)
(let* (;; (dbdat (db:open-db dbstruct #f dbfile:db-init-proc))
(data-synced 0) ;; count of changed records
- (tmp-area (common:get-db-tmp-area))
+ (tmp-area (common:make-tmpdir-name *toppath* ""))
(old2new (member 'old2new options))
(dejunk (member 'dejunk options))
(killservers (member 'killservers options))
(src-area (if old2new *toppath* tmp-area))
- (dest-area (if old2new tmp-area *toppath*))
+ (dest-area (if old2new tmp-area (conc *toppath* "/.mtdb")))
(dbfiles (if old2new (glob (conc *toppath* "/.mtdb/*.db"))
- (glob (conc tmp-area "/.mtdb/*.db"))))
+ (glob (conc tmp-area "/*.db"))))
(keys (db:get-keys dbstruct))
(sync-durations (make-hash-table)))
;; kill servers
- (if killservers (db:kill-servers))
+ ;; (if killservers (db:kill-servers))
(if (not dbfiles)
(debug:print-error 0 *default-log-port* "no dbfiles found in " (conc *toppath* "/.mtdb"))
(for-each
(lambda (srcfile)
(debug:print-info 3 *default-log-port* "file: " srcfile)
(let* ((fname (conc (pathname-file srcfile) ".db"))
(basename (pathname-file srcfile))
(run-id (if (string= basename "main") #f (string->number basename)))
- (destfile (conc dest-area "/.mtdb/" fname))
- (dest-directory (conc dest-area "/.mtdb/"))
+ (destfile (conc dest-area "/" fname))
+ (dest-directory dest-area)
(time1 (file-modification-time srcfile))
(time2 (if (file-exists? destfile)
(begin
(debug:print-info 2 *default-log-port* "destfile " destfile " exists")
(file-modification-time destfile))
@@ -603,33 +636,37 @@
#t)
((and changed *time-to-exit*) ;; last sync
#t)
(else
#f))))
- (if (or dejunk do-cp)
+
+ (if (or dejunk do-cp)
(let* ((start-time (current-milliseconds))
- ;; subdb is misnamed - should be dbdat (I think...)
- (subdb (dbfile:open-db dbstruct run-id dbfile:db-init-proc))
- ;; (or (dbfile:get-subdb dbstruct run-id)
- ;; (dbfile:init-subdb dbstruct run-id dbfile:db-init-proc)))
+ (subdb (or (dbfile:get-subdb dbstruct run-id) (dbfile:init-subdb dbstruct run-id dbfile:db-init-proc)))
+ (dbdat (or (dbfile:get-dbdat dbstruct run-id) (dbfile:open-db dbstruct run-id dbfile:db-init-proc)))
(mtdb (dbr:subdb-mtdbdat subdb))
;;
;; BUG: -mrw- I think this next line is wrong. run-id should be the path to .mtdb/.db
;;
(tmpdb (dbfile:open-db dbstruct run-id dbfile:db-init-proc)))
+ (if dejunk
+ (begin
+ (debug:print 0 *default-log-port* "Cleaning tmp DB")
+ (db:clean-up run-id tmpdb)
+ (debug:print 0 *default-log-port* "Cleaning nfs DB")
+ (db:clean-up run-id mtdb)
+ )
+ )
(debug:print-info 2 *default-log-port* "delta syncing file: " srcfile ", time diff: " (- time1 time2) " seconds")
(if old2new
(begin
- (if dejunk (db:clean-up run-id mtdb))
(db:sync-tables (db:sync-all-tables-list
- dbstruct
(db:get-keys dbstruct))
#f mtdb tmpdb))
(begin
- (if dejunk (db:clean-up run-id tmpdb))
- (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) #f tmpdb mtdb)))
+ (db:sync-tables (db:sync-all-tables-list (db:get-keys dbstruct)) #f tmpdb mtdb)))
(hash-table-set! sync-durations (conc srcfile ".db") (- (current-milliseconds) start-time)))
(debug:print-info 2 *default-log-port* "skipping delta sync. " srcfile " is up to date"))))
dbfiles))
data-synced))
@@ -641,11 +678,11 @@
(for-each
(lambda (subdb)
(let* ((mtdb (dbr:subdb-mtdb subdb))
(tmpdb (db:get-subdb dbstruct run-id))
(refndb (dbr:subdb-refndb subdb))
- (newres (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) last-update tmpdb refndb mtdb)))
+ (newres (db:sync-tables (db:sync-all-tables-list (db:get-keys dbstruct)) last-update tmpdb refndb mtdb)))
;; (stack-push! (dbr:subdb-dbstack subdb) tmpdb)
;; BUG: verify this is really needed
(dbfile:add-dbdat dbstruct run-id tmpdb)
(set! res (cons newres res))))
subdbs)
@@ -1149,16 +1186,19 @@
;; 2. Look at run records
;; a. If have tests that are not deleted, set state='unknown'
;; b. ....
;;
(define (db:clean-up run-id dbdat)
- (debug:print 2 *default-log-port* "db:clean-up")
-
-
(if run-id
- (db:clean-up-rundb dbdat)
- (db:clean-up-maindb dbdat)
+ (begin
+ (debug:print 0 *default-log-port* "Cleaning run DB " run-id)
+ (db:clean-up-rundb dbdat run-id)
+ )
+ (begin
+ (debug:print 0 *default-log-port* "Cleaning main DB ")
+ (db:clean-up-maindb dbdat)
+ )
)
)
;; Clean out old junk and vacuum the database
@@ -1170,38 +1210,42 @@
;; b. If test dir gone, delete the test record
;; 2. Look at run records
;; a. If have tests that are not deleted, set state='unknown'
;; b. ....
;;
-(define (db:clean-up-rundb dbdat)
+(define (db:clean-up-rundb dbdat run-id)
;; (debug:print 0 *default-log-port* "WARNING: db clean up not fully ported to v1.60, cleanup action will be on megatest.db")
(let* ((db (dbr:dbdat-dbh dbdat))
- (count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM tests);"))
+ (test-count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM tests);"))
+ (step-count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM test_steps);"))
(statements
(map (lambda (stmt)
(sqlite3:prepare db stmt))
(list
- ;; delete all tests that belong to runs that are 'deleted'
- ;; (conc "DELETE FROM tests WHERE run_id NOT IN (" (string-intersperse (map conc valid-runs) ",") ");")
- ;; delete all tests that are 'DELETED'
"DELETE FROM tests WHERE state='DELETED';"
+ "DELETE FROM test_steps WHERE status = 'DELETED';"
+ "DELETE FROM tests WHERE run_id IN (SELECT id FROM runs WHERE state = 'deleted');"
))))
- ;; (db:delay-if-busy dbdat)
(sqlite3:with-transaction
db
(lambda ()
(sqlite3:for-each-row (lambda (tot)
- (debug:print-info 0 *default-log-port* "Records count before clean: " tot))
- count-stmt)
+ (debug:print-info 0 *default-log-port* "Test records count before clean: " tot))
+ test-count-stmt)
+ (sqlite3:for-each-row (lambda (tot)
+ (debug:print-info 0 *default-log-port* "Test_step records count before clean: " tot))
+ step-count-stmt)
(map sqlite3:execute statements)
(sqlite3:for-each-row (lambda (tot)
- (debug:print-info 0 *default-log-port* "Records count after clean: " tot))
- count-stmt)))
+ (debug:print-info 0 *default-log-port* "Test records count after clean: " tot))
+ test-count-stmt)
+ (sqlite3:for-each-row (lambda (tot)
+ (debug:print-info 0 *default-log-port* "Test_step records count after clean: " tot))
+ step-count-stmt)))
(map sqlite3:finalize! statements)
- (sqlite3:finalize! count-stmt)
- ;; (db:find-and-mark-incomplete db)
- ;; (db:delay-if-busy dbdat)
+ (sqlite3:finalize! test-count-stmt)
+ (sqlite3:finalize! step-count-stmt)
(sqlite3:execute db "VACUUM;")))
;; Clean out old junk and vacuum the database
;;
;; Ultimately do something like this:
@@ -1235,15 +1279,15 @@
;; (db:delay-if-busy dbdat)
(sqlite3:with-transaction
db
(lambda ()
(sqlite3:for-each-row (lambda (tot)
- (debug:print-info 0 *default-log-port* "Records count before clean: " tot))
+ (debug:print-info 0 *default-log-port* "Run records count before clean: " tot))
count-stmt)
(map sqlite3:execute statements)
(sqlite3:for-each-row (lambda (tot)
- (debug:print-info 0 *default-log-port* "Records count after clean: " tot))
+ (debug:print-info 0 *default-log-port* "Run records count after clean: " tot))
count-stmt)))
(map sqlite3:finalize! statements)
(sqlite3:finalize! count-stmt)
;; (db:find-and-mark-incomplete db)
;; (db:delay-if-busy dbdat)
@@ -1254,11 +1298,11 @@
;; no-sync.db - small bits of data to be shared between servers
;;======================================================================
(define (db:get-dbsync-path)
(case (rmt:transport-mode)
- ((http)(common:get-db-tmp-area))
+ ((http)(common:make-tmpdir-name *toppath* ""))
((tcp) (conc *toppath*"/.mtdb"))
((nfs) (conc *toppath*"/.mtdb"))
(else "/tmp/dunno-this-gonna-exist")))
;; This is needed for api.scm
@@ -1419,62 +1463,85 @@
#f
(simple-run-id (car runs)))))
;; called with run-id=#f so will operate on main.db
;;
-(define (db:insert-run dbstruct target runname run-meta)
+(define (db:insert-run dbstruct run-id target runname run-meta)
(let* ((keys (db:get-keys dbstruct))
(runs (db:simple-get-runs dbstruct runname #f #f target #f))) ;; runpatt count offset target last-update
;; need to insert run based on target and runname
(let* ((targvals (string-split target "/"))
(keystr (string-intersperse keys ","))
(key?str (string-intersperse (make-list (length targvals) "?") ","))
- (qrystr (conc "INSERT INTO runs (runname,"keystr") VALUES (?,"key?str")"))
+ (qrystr (conc "INSERT INTO runs (id,runname,"keystr") VALUES (?,?,"key?str")"))
(get-var (lambda (db qrystr)
(let* ((res #f))
(sqlite3:for-each-row
(lambda row
(set res (car row)))
- db qrystr runname)
+ db qrystr run-id runname)
res))))
(if (null? runs)
- (db:create-initial-run-record dbstruct runname target))
- (let* ((run-id (db:get-run-id dbstruct runname target)))
- (db:with-db
+ (begin
+ (db:create-initial-run-record dbstruct run-id runname target)
+ )
+ )
+ (let* ()
+ ;;(debug:print 0 *default-log-port* "db:insert-run: Calling db:with-db to update the run record")
+ (debug:print 0 *default-log-port* "db:insert-run: runid = " run-id)
+#; (db:with-db
dbstruct
#f #t
(lambda (dbdat db)
+ (debug:print 0 *default-log-port* "In the lambda proc for " dbdat " " db)
(for-each
(lambda (keyval)
+ (debug:print 0 *default-log-port* "In the lambda proc for " keyval)
(let* ((fieldname (car keyval))
(getqry (conc "SELECT "fieldname" FROM runs WHERE id=?;"))
(setqry (conc "UPDATE runs SET "fieldname"=? WHERE id=?;"))
(val (cdr keyval))
(valnum (if (number? val)
val
(if (string? val)
(string->number val)
#f))))
+ (debug:print 0 *default-log-port* "fieldname " fieldname " val " val " valnum " valnum)
(if (not (member fieldname (cons "runname" keys))) ;; don't attempt to tweak these
(let* ((curr-val (get-var db getqry))
(have-it (or (equal? curr-val val)
(equal? curr-val valnum))))
+ (debug:print 0 *default-log-port* "have-it = " have-it)
(if (not have-it)
- (sqlite3:execute db setqry (or valnum val) run-id))))))
+ (begin
+ (debug:print 0 *default-log-port* "Do sqlite3:execute")
+ ;; (sqlite3:execute db setqry (or valnum val) run-id)
+ )
+ )
+ )
+ )
+ (debug:print 0 *default-log-port* "Done with update")
+ )
+ (debug:print 0 *default-log-port* "next keyval")
+ )
run-meta)))
run-id))))
-(define (db:create-initial-run-record dbstruct runname target)
+(define (db:create-initial-run-record dbstruct run-id runname target)
(let* ((keys (db:get-keys dbstruct))
(targvals (string-split target "/"))
(keystr (string-intersperse keys ","))
- (key?str (string-intersperse (make-list (length targvals) "?") ","))
- (qrystr (conc "INSERT INTO runs (runname,"keystr") VALUES (?,"key?str")")))
+ (key?str (string-intersperse (make-list (length targvals) "?") ",")) ;; a string with the same length as targvals, where each element is "?" and interspersed with commas.
+ (qrystr (conc "INSERT INTO runs (id,runname,"keystr") VALUES (?,?,"key?str")")))
+ (debug:print 0 *default-log-port* "db:create-initial-run-record")
+ (debug:print 0 *default-log-port* "qrystr = " qrystr)
+
(db:with-db
- dbstruct #f #t
+ dbstruct #f #t ;; run-id writable
(lambda (dbdat db)
- (apply sqlite3:execute db qrystr runname targvals)))))
+ (debug:print 0 *default-log-port* "lambda proc: dbdat: " dbdat " db: " db)
+ (apply sqlite3:execute db qrystr run-id runname targvals)))))
(define (db:insert-test dbstruct run-id test-rec)
(let* ((testname (alist-ref "testname" test-rec equal?))
(item-path (alist-ref "item_path" test-rec equal?))
(id (db:get-test-id dbstruct run-id testname item-path))
@@ -1484,11 +1551,11 @@
(conc (car dat)"=?"))
fieldvals)
",")" WHERE id=?;"))
(insqry (conc "INSERT INTO tests ("(string-intersperse (map (lambda (x) (car x)) fieldvals) ",")
") VALUES ("(string-intersperse (make-list (length fieldvals) "?") ",")");")))
- (debug:print 0 *default-log-port* "id: "id"\nset: "setqry"\ninsqry: "insqry)
+ ;; (debug:print 0 *default-log-port* "id: "id"\nset: "setqry"\ninsqry: "insqry)
(db:with-db
dbstruct
run-id #t
(lambda (dbdat db)
(if id
@@ -1580,11 +1647,11 @@
;; TODO: Switch this to use max(update_time) from each run db? Then if using a server there is no disk traffic (using cachedb db) ???
;;
;; NOTE: This DOESN'T (necessarily) get the real run ids, but the number of the .db!!
(define (db:get-changed-run-ids since-time)
- (let* ((dbdir (db:dbfile-path)) ;; (configf:lookup *configdat* "setup" "dbdir"))
+ (let* ((dbdir (common:make-tmpdir-name *toppath* "")) ;; (configf:lookup *configdat* "setup" "dbdir"))
(alldbs (glob (conc *toppath* "/.mtdb/[0-9]*.db*")))
(changed (filter (lambda (dbfile)
(> (file-modification-time dbfile) since-time))
alldbs)))
(delete-duplicates
@@ -2234,21 +2301,24 @@
qry
run-id
(or last-update 0))))))
(define (db:get-testinfo-state-status dbstruct run-id test-id)
- (let ((res #f))
- (db:with-db dbstruct run-id #f
- (lambda (dbdat db)
- (sqlite3:for-each-row
- (lambda (run-id testname item-path state status)
- ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment
- (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-")))
- db
- "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;"
- test-id run-id)))
- res))
+ (db:with-db
+ dbstruct run-id #f
+ (lambda (dbdat db)
+ (let* ((res #f)
+ (stmth (db:get-cache-stmth dbdat db "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;")))
+ (sqlite3:for-each-row
+ (lambda (run-id testname item-path state status)
+ ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment
+ (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-")))
+ ;; db
+ ;; "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;"
+ stmth
+ test-id run-id)
+ res))))
;; get a useful subset of the tests data (used in dashboard
;; use db:mintest-get-{id ,run_id,testname ...}
;;
(define (db:get-tests-for-run-mindata dbstruct run-id testpatt states statuses not-in)
@@ -2279,25 +2349,37 @@
dbstruct run-id #t
(lambda (dbdat db)
(sqlite3:execute db "UPDATE tests SET state='DELETED',status='n/a',comment='' WHERE id=?;" test-id))))
;;
-(define (db:delete-old-deleted-test-records dbstruct)
- (let ((targtime (- (current-seconds)
- (or (configf:lookup-number *configdat* "setup" "keep-deleted-records")
- (* 30 24 60 60))))) ;; one month in the past
+(define (db:delete-old-deleted-test-records dbstruct run-id)
+ (let* ((targtime (- (current-seconds)
+ (or (configf:lookup-number *configdat* "setup" "keep-deleted-records")
+ (* 7 24 60 60)))) ;; cleanup if over one week old
+ (mtdbfile (dbmod:run-id->full-dbfname dbstruct run-id))
+ (qry1 "DELETE FROM test_steps WHERE test_id IN (SELECT id FROM tests WHERE state='DELETED' AND event_time);")
+ (qry2 "DELETE FROM test_data WHERE test_id IN (SELECT id FROM tests WHERE state='DELETED' AND event_time);")
+ (qry3 "DELETE FROM tests WHERE state='DELETED' AND event_time;")
+ (delproc (lambda (db)
+ (sqlite3:with-transaction
+ db
+ (lambda ()
+ (sqlite3:execute db qry1 targtime)
+ (sqlite3:execute db qry2 targtime)
+ (sqlite3:execute db qry3 targtime))))))
+ ;; first the /tmp db
(db:with-db
dbstruct
- 0
+ run-id
#t
(lambda (dbdat db)
- (sqlite3:with-transaction
- db
- (lambda ()
- (sqlite3:execute db "DELETE FROM test_steps WHERE test_id IN (SELECT id FROM tests WHERE state='DELETED' AND event_time);" targtime)
- (sqlite3:execute db "DELETE FROM test_data WHERE test_id IN (SELECT id FROM tests WHERE state='DELETED' AND event_time);" targtime)
- (sqlite3:execute db "DELETE FROM tests WHERE state='DELETED' AND event_time;" targtime)))))))
+ (delproc db)))
+ (if (and (file-exists? mtdbfile)
+ (file-write-access? mtdbfile))
+ (let* ((db (sqlite3:open-database mtdbfile)))
+ (delproc db)
+ (sqlite3:finalize! db)))))
;; set tests with state currstate and status currstatus to newstate and newstatus
;; use currstate = #f and or currstatus = #f to apply to any state or status respectively
;; WARNING: SQL injection risk. NB// See new but not yet used "faster" version below
;;
@@ -2637,18 +2719,18 @@
(db:with-db
dbstruct
run-id
#f
(lambda (dbdat db)
- (let ((res (cons #f #f)))
-;; (stmth (db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=?;")))
- (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test
- (lambda (state status)
- (cons state status))
- db
- "SELECT state,status FROM tests WHERE id=? AND run_id=?;" ;; stmth try not compiling this one - yes, this fixed the bind issue
- test-id run-id)
+ (let ((res (cons #f #f))
+ (stmth (db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=? AND run_id=?;")))
+ (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test
+ (lambda (state status)
+ (cons state status))
+ ;; db
+ stmth ;;"SELECT state,status FROM tests WHERE id=? AND run_id=?;" ;; stmth try not compiling this one - yes, this fixed the bind issue
+ test-id run-id)
res))))
;; Use db:test-get* to access
;; Get test data using test_ids. NB// Only works within a single run!!
;;
@@ -3728,25 +3810,35 @@
tags)))
db
"SELECT testname,tags FROM test_meta")
(hash-table->alist res)))))
+;; testmeta doesn't change, we can cache it for up too an hour
+
+(define *db:testmeta-cache* (make-hash-table))
+(define *db:testmeta-last-update* 0)
+
;; read the record given a testname
(define (db:testmeta-get-record dbstruct testname)
- (let ((res #f))
- (db:with-db
- dbstruct
- #f
- #f
- (lambda (dbdat db)
- (sqlite3:for-each-row
- (lambda (id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup)
- (set! res (vector id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup)))
- db
- "SELECT id,testname,author,owner,description,reviewed,iterated,avg_runtime,avg_disk,tags,jobgroup FROM test_meta WHERE testname=?;"
- testname)
- res))))
+ (if (and (< (- (current-seconds) *db:testmeta-last-update*) 600)
+ (hash-table-exists? *db:testmeta-cache* testname))
+ (hash-table-ref *db:testmeta-cache* testname)
+ (let ((res #f))
+ (db:with-db
+ dbstruct
+ #f
+ #f
+ (lambda (dbdat db)
+ (sqlite3:for-each-row
+ (lambda (id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup)
+ (set! res (vector id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup)))
+ db
+ "SELECT id,testname,author,owner,description,reviewed,iterated,avg_runtime,avg_disk,tags,jobgroup FROM test_meta WHERE testname=?;"
+ testname)))
+ (hash-table-set! *db:testmeta-cache* testname res)
+ (set! *db:testmeta-last-update* (current-seconds))
+ res)))
;; create a new record for a given testname
(define (db:testmeta-add-record dbstruct testname)
(db:with-db dbstruct #f #t
(lambda (dbdat db)
@@ -4314,11 +4406,11 @@
))))
;; sync for filesystem local db writes
;;
(define (db:run-lock-and-sync no-sync-db)
- (let* ((tmp-area (common:get-db-tmp-area))
+ (let* ((tmp-area (common:make-tmpdir-name *toppath* ""))
(dbfiles (glob (conc tmp-area"/.mtdb/*.db")))
(sync-durations (make-hash-table)))
;; (debug:print-info 0 *default-log-port* "lock-and-sync, dbfiles: "dbfiles)
(for-each
(lambda (file)
@@ -4370,11 +4462,11 @@
(sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300))
(debug-mode (debug:debug-mode 1))
(last-time (current-seconds)) ;; last time through the sync loop
(no-sync-db (db:open-no-sync-db))
(sync-duration 0) ;; run time of the sync in milliseconds
- (tmp-area (common:get-db-tmp-area)))
+ (tmp-area (common:make-tmpdir-name *toppath* "")))
;; Sync moved to http-transport keep-running loop
(debug:print-info 2 *default-log-port* "Periodic copy-based sync thread started. syncer is copy-sync, tmp-area is " tmp-area)
(debug:print-info 3 *default-log-port* "watchdog starting. syncer is copy-sync pid="(current-process-id));; " this-wd-num="this-wd-num)
(if (and legacy-sync (not *time-to-exit*))
@@ -4478,11 +4570,11 @@
(for-each
(lambda (subdb)
(let* (;;(dbstruct (db:setup))
(mtdb (dbr:subdb-mtdb subdb))
(mtpath (db:dbdat-get-path mtdb))
- (tmp-area (common:get-db-tmp-area))
+ (tmp-area (common:make-tmpdir-name *toppath* ""))
(res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive
(set! sync-duration (- (current-milliseconds) sync-start))
(if (> res 0) ;; some records were transferred, keep the db alive
(begin
(mutex-lock! *heartbeat-mutex*)
@@ -4526,11 +4618,10 @@
;; (db:no-sync-close-db no-sync-db stmt-cache)
(if (common:low-noise-print 30)
(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) ))))
))
-
(define (std-exit-procedure)
;;(common:telemetry-log-close)
(on-exit (lambda () 0)) ;; why is this here?
;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
(let ((no-hurry (if *time-to-exit* ;; hurry up
@@ -4539,11 +4630,11 @@
(set! *time-to-exit* #t)
#t))))
(debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.")
(if (and no-hurry
(debug:debug-mode 18))
- (rmt:print-db-stats))
+ (dbmod:print-db-stats))
(let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds
(if *dbstruct-dbs* (db:close-all *dbstruct-dbs*)) ;; one second allocated
(if (list? *on-exit-procs*)
(for-each
(lambda (proc)
Index: dbfile.scm
==================================================================
--- dbfile.scm
+++ dbfile.scm
@@ -24,13 +24,16 @@
(declare (uses debugprint))
(declare (uses commonmod))
(module dbfile
*
-
- (import scheme
- chicken
+(import scheme)
+
+(cond-expand
+ (chicken-4
+
+ (import chicken
data-structures
extras
matchable
(prefix sqlite3 sqlite3:)
@@ -45,11 +48,54 @@
hostinfo
commonmod
debugprint
)
+ )
+ (chicken-5
+ (import (prefix sqlite3 sqlite3:)
+ ;; data-structures
+ ;; extras
+ ;; files
+ ;; posix
+ ;; posix-extras
+ chicken.base
+ chicken.condition
+ chicken.file
+ chicken.file.posix
+ chicken.format
+ chicken.io
+ chicken.pathname
+ chicken.port
+ chicken.process
+ chicken.process-context
+ chicken.process-context.posix
+ chicken.sort
+ chicken.string
+ chicken.time
+ chicken.time.posix
+
+ matchable
+ md5
+ message-digest
+ pathname-expand
+ regex
+ regex-case
+ srfi-1
+ srfi-18
+ srfi-69
+ typed-records
+ stack
+ system-information
+ commonmod
+ debugprint
+ )
+ (define file-write-access? file-writable?)
+ (define file-move move-file)
+ ))
+
;; parameters
;;
(define dbfile:testsuite-name (make-parameter #f))
(define keep-age-param (make-parameter 10)) ;; qif file age, if over move to attic
@@ -242,11 +288,12 @@
#f
)
)
(define (dbfile:make-tmpdir-name areapath tmpadj)
- (let* ((dname (conc "/tmp/"(current-user-name)"/" (string-translate areapath "/" ".") tmpadj)))
+ (let* ((area (pathname-file areapath))
+ (dname (conc "/tmp/"(current-user-name)"/megatest_localdb/" area "/" (string-translate areapath "/" ".") tmpadj "/.mtdb")))
(unless (directory-exists? dname)
(create-directory dname #t))
dname))
(define (dbfile:run-id->path apath run-id)
@@ -267,18 +314,18 @@
(define (dbfile:run-id->dbfname run-id)
(conc (dbfile:run-id->dbnum run-id)".db"))
;; the path in MTRAH with the filename
(define (dbfile:run-id->dbname run-id)
- (conc ".mtdb/"(dbfile:run-id->dbfname run-id)))
+ (conc (dbfile:run-id->dbfname run-id)))
;; Make the dbstruct, setup up auxillary db's and call for main db at least once
;;
;; called in http-transport and replicated in rmt.scm for *local* access.
;;
-(define (dbfile:setup do-sync areapath tmppath)
+(define (dbfile:setup areapath tmppath)
(cond
(*dbstruct-dbs*
(dbfile:print-err "WARNING: dbfile:setup called when *dbstruct-dbs* is already initialized")
*dbstruct-dbs*) ;; TODO: when multiple areas are supported, this optimization will be a hazard
(else
@@ -358,11 +405,12 @@
(if dbdat
dbdat
(let* ((tmppath (dbr:dbstruct-tmppath dbstruct))
(tmpdbpath (dbfile:run-id->path tmppath run-id))
(dbdat (dbfile:open-sqlite3-db tmpdbpath init-proc sync-mode: 0 journal-mode: "WAL")))
- ;; the following line short-circuits the "one db handle per thread" model
+
+ ;; the following line short-circuits the "one db handle per thread" model
;;
;; (dbfile:add-dbdat dbstruct run-id dbdat)
;;
dbdat))))))
@@ -445,19 +493,20 @@
(if journal-mode
(sqlite3:execute db (conc "PRAGMA journal_mode = "journal-mode";")))
(if (and init-proc (or force-init
(not db-exists)))
(init-proc db))
- db)))
+ db))
+ expire-time: 5)
(begin
(if (file-exists? fname )
(let ((db (sqlite3:open-database fname)))
;; pragmas synchronous not needed because this db is used read-only
;; (sqlite3:execute db (conc "PRAGMA synchronous = "mode";")
(sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 30000)) ;; read-only but still need timeout
db )
- (print "file doesn't exist: " fname))))
+ (print "cautious-open-database: file doesn't exist: " fname))))
(exn (io-error)
(dbfile:print-err exn "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again.")
(retry))
(exn (corrupt)
(dbfile:print-err exn "ERROR: database " fname " is corrupt. Repair it to proceed.")
@@ -487,11 +536,11 @@
;; NOTE: this is already protected by mutex *no-sync-db-mutex*
;;
(define (dbfile:raw-open-no-sync-db dbpath)
(if (not (file-exists? dbpath))
(create-directory dbpath #t))
- (debug:print-info 0 *default-log-port* "Opening "dbpath"/no-sync.db")
+ (debug:print-info 2 *default-log-port* "(dbfile:raw-open-no-sync-db: Opening "dbpath"/no-sync.db")
(let* ((dbname (conc dbpath "/no-sync.db"))
(db-exists (file-exists? dbname))
(init-proc (lambda (db)
(sqlite3:with-transaction
db
@@ -525,18 +574,19 @@
reason TEXT DEFAULT 'none',
CONSTRAINT no_sync_processes UNIQUE (host,pid));"
))))))
(on-tmp (equal? (car (string-split dbpath "/")) "tmp"))
(db (if on-tmp
- (dbfile:cautious-open-database dbname init-proc 0 "WAL" force-init: #t)
- (dbfile:cautious-open-database dbname init-proc 0 #f force-init: #t)
+ (dbfile:cautious-open-database dbname init-proc 1 "WAL" force-init: #t) ;; WAL MODE should use syncronous=1
+ ;; (dbfile:cautious-open-database dbname init-proc 0 #f force-init: #t)
+ (dbfile:cautious-open-database dbname init-proc 0 "MEMORY" force-init: #t) ;; Journal mode = memory is fastest?
;; (sqlite3:open-database dbname)
)))
- (if on-tmp ;; done in cautious-open-database
- (begin
- (sqlite3:execute db "PRAGMA synchronous = 0;")
- (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000))))
+ ;; (if on-tmp ;; done in cautious-open-database
+ ;; (begin
+ ;; (sqlite3:execute db "PRAGMA synchronous = 0;") ;; why was this here when is is handled by cautious-open-database?
+ (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)) ;; ))
db))
;; mtest processes registry calls
(define (dbfile:insert-or-update-process nsdb dat)
@@ -580,18 +630,20 @@
host port pid starttime endtime status purpose dbname mtversion))
(define (dbfile:set-process-status nsdb host pid newstatus)
(sqlite3:execute nsdb "UPDATE processes SET status=? WHERE host=? AND pid=?;" newstatus host pid))
+;; as sorted should be stable. can use to choose "winner"
+;;
(define (dbfile:get-process-options nsdb purpose dbname)
(sqlite3:fold-row
;; host port pid starttime status mtversion
(lambda (res . row)
(cons row res))
'()
nsdb
- "SELECT host,port,pid,starttime,endtime,status,mtversion FROM processes WHERE purpose=? AND dbname LIKE ? AND status='alive';"
+ "SELECT host,port,pid,starttime,endtime,status,mtversion FROM processes WHERE purpose=? AND dbname LIKE ? AND status IN ('running','alive') ORDER BY starttime ASC,host,port;"
purpose dbname))
(define (dbfile:get-process-info nsdb host pid)
(let ((res (sqlite3:fold-row
;; host port pid starttime status mtversion
@@ -602,17 +654,25 @@
"SELECT host,port,pid,starttime,endtime,status,purpose,dbname,mtversion FROM processes WHERE host=? AND pid=?;"
host pid)))
(if (null? res)
#f
(car res))))
+
+(define (dbfile:row->procinf row)
+ (match row
+ ((host port pid starttime endtime status mtversion)
+ (make-procinf host: host port: port pid: pid starttime: starttime endtime: endtime status: status mtversion: mtversion))
+ (else
+ (debug:print 0 *default-log-port* "ERROR: row "row" did not match host,port,pid,starttime,endtime,status,mtversion")
+ #f)))
(define (dbfile:set-process-done nsdb host pid reason)
- (sqlite3:execute nsdb "UPDATE processes SET status='ended',endtime=?,reason=? WHERE host=? AND pid=?;" (current-seconds) reason host pid)
+ (sqlite3:execute nsdb "UPDATE processes SET status='done',endtime=?,reason=? WHERE host=? AND pid=?;" (current-seconds) reason host pid)
(dbfile:cleanup-old-entries nsdb))
(define (dbfile:cleanup-old-entries nsdb)
- (sqlite3:execute nsdb "DELETE FROM process WHERE status='ended' AND endtime;" (- (current-seconds) (* 3600 48))))
+ (sqlite3:execute nsdb "DELETE FROM process WHERE status='done' AND endtime;" (- (current-seconds) (* 3600 48))))
;; other no-sync functions
(define (dbfile:with-no-sync-db dbpath proc)
(mutex-lock! *no-sync-db-mutex*)
@@ -675,20 +735,26 @@
;; fails returns (#f lock-creation-time identifier)
;; succeeds (returns (#t lock-creation-time identifier)
;; use (db:no-sync-del! db keyname) to release the lock
;;
(define (db:no-sync-get-lock-with-id db keyname identifier)
+ (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: db: " db " keyname: " keyname " identifier: " identifier)
(sqlite3:with-transaction
db
(lambda ()
(condition-case
(let* ((curr-val (db:no-sync-get/default db keyname #f)))
+ (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: curr-val: " curr-val)
(if curr-val
(match (db:extract-time-identifier curr-val) ;; result->timestamp, identifier
((timestamp . ident)
(cons (equal? ident identifier) timestamp))
- (else (cons #f 'malformed-lock))) ;; lock malformed
+ (else
+ (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: malformed lock")
+ (cons #f 'malformed-lock)
+ )
+ ) ;; lock malformed
(let ((curr-sec (current-seconds))
(lock-value (if identifier
(conc (current-seconds)"+"identifier)
(current-seconds))))
(sqlite3:execute db "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-value)
@@ -1572,7 +1638,15 @@
;; (db:hoh-set! stmt-cache db stmt newstmth)
(hash-table-set! stmt-cache stmt newstmth)
newstmth))))
(mutex-unlock! *get-cache-stmth-mutex*)
result))
+
+;; (define *mutex-stmth-call* (make-mutex))
+;;
+;; (define (db:with-mutex-for-stmth proc)
+;; (mutex-lock! *mutex-stmth-call*)
+;; (let* ((res (proc)))
+;; (mutex-unlock! *mutex-stmth-call*)
+;; res))
)
Index: dbmod.scm
==================================================================
--- dbmod.scm
+++ dbmod.scm
@@ -25,28 +25,48 @@
(declare (uses debugprint))
(module dbmod
*
-(import scheme
- chicken
- data-structures
- extras
- files
+(import scheme)
+
+(cond-expand
+ (chicken-4
+ (import chicken
+ data-structures
+ extras
+ files
+
+ posix
+
+ ))
+ (chicken-5
+ (import chicken.base
+ chicken.condition
+ chicken.file
+ chicken.pathname
+ chicken.process
+ chicken.sort
+ chicken.string
+ chicken.time
+
+ )
+ (define file-read-access? file-readable?)
+ (define file-copy copy-file)
+ ))
+(import format
(prefix sqlite3 sqlite3:)
matchable
- posix
typed-records
srfi-1
srfi-18
srfi-69
commonmod
dbfile
- debugprint
- )
+ debugprint)
;; NOTE: This returns only the name "1.db", "main.db", not the path
;;
(define (dbmod:run-id->dbfname run-id)
(conc (dbfile:run-id->dbnum run-id)".db"))
@@ -58,15 +78,12 @@
(not (file-exists? dbdir)))
(create-directory dbdir))
dbdir))
(define (dbmod:run-id->full-dbfname dbstruct run-id)
- (conc (dbmod:get-dbdir dbstruct
-
- run-id
-
- )"/"(dbmod:run-id->dbfname run-id)))
+ (conc (dbmod:get-dbdir dbstruct)
+ "/"(dbmod:run-id->dbfname run-id)))
;;======================================================================
;; Read-only cachedb cached direct from disk method
;;======================================================================
@@ -87,19 +104,19 @@
;; The cachedb one-db file per server method goes in here
;;======================================================================
;; NOTE: the r/w is now w/r, #t=db modified by query, #f=db NOT modified by query
(define (dbmod:with-db dbstruct run-id w/r proc params)
- (let* ((use-mutex (or (and w/r ;; use the mutex on queries that modify the db and for sync to disk
- (> *api-process-request-count* 5)) ;; when writes are happening throttle more
- (> *api-process-request-count* 50)))
+ (let* ((use-mutex w/r) ;; (or (and w/r ;; use the mutex on queries that modify the db and for sync to disk
+ ;; (> *api-process-request-count* 5)) ;; when writes are happening throttle more
+ ;; (> *api-process-request-count* 50)))
(dbdat (dbmod:open-db dbstruct run-id (dbfile:db-init-proc)))
(dbh (dbr:dbdat-dbh dbdat)) ;; this will be the cachedb handle
(dbfile (dbr:dbdat-dbfile dbdat)))
;; if nfs mode do a sync if delta > 2
- (let* ((last-update (dbr:dbstruct-last-update dbstruct))
- (sync-proc (dbr:dbstruct-sync-proc dbstruct))
+ #;(let* ((last-update (dbr:dbstruct-last-update dbstruct))
+ ;; (sync-proc (dbr:dbstruct-sync-proc dbstruct))
(curr-secs (current-seconds)))
(if (> (- curr-secs last-update) 5)
(begin
(sync-proc last-update)
@@ -119,11 +136,11 @@
(loop (- count 1)))
(begin
(debug:print-info 0 *default-log-port* "dbmod:with-db, database is busy, giving up.")
(exit 1))))
(exn ()
- (dbfile:print-err exn "ERROR: Unknown error with database for run-id "run-id", message: "
+ (dbfile:print-err exn "ERROR: dbmod:with-db: Unknown error with database for run-id "run-id", message: "
((condition-property-accessor 'exn 'message) exn))
(exit 2))))))
(if use-mutex (mutex-unlock! *db-with-db-mutex*))
res)))
@@ -198,11 +215,11 @@
(let* ((dbstruct (or dbstruct-in (make-dbr:dbstruct areapath: areapath)))
(dbfname (or dbfname-in (dbmod:run-id->dbfname run-id)))
(dbpath (dbmod:get-dbdir dbstruct)) ;; directory where all the .db files are kept
(dbfullname (conc dbpath"/"dbfname)) ;; (dbmod:run-id->full-dbfname dbstruct run-id))
(dbexists (file-exists? dbfullname))
- (tmpdir (dbfile:make-tmpdir-name areapath tmpadj))
+ (tmpdir (common:make-tmpdir-name areapath tmpadj))
(tmpdb (let* ((fname (conc tmpdir"/"dbfname)))
fname))
(cachedb (dbmod:open-cachedb-db init-proc
;; (if (eq? (dbfile:cache-method) 'cachedb)
;; #f
@@ -224,51 +241,22 @@
(dbr:dbstruct-dbtmpname-set! dbstruct tmpdb)
(dbr:dbstruct-dbfname-set! dbstruct dbfname)
(dbr:dbstruct-sync-proc-set! dbstruct
(lambda (last-update)
(if *sync-in-progress*
- (debug:print 3 *default-log-port* "WARNING: overlapping calls to sync to disk")
- (let* ((syncer-logfile (conc areapath"/logs/"dbfname"-syncer.log"))
- (sync-cmd (if (eq? syncdir 'todisk)
- (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "tmpdb" -to "dbfullname" -period 5 -timeout 10 > /dev/null 2&>1)&")
- (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "dbfullname" -to "tmpdb" -period 5 -timeout 10 > /dev/null 2&>1)&")))
- (synclock-file (conc dbfullname".lock"))
- (syncer-running-file (conc dbfullname"-sync-running"))
- (synclock-mod-time (if (file-exists? synclock-file)
- (handle-exceptions
- exn
- #f
- (file-modification-time synclock-file))
- #f))
- (thethread (lambda ()
- (thread-start!
- (make-thread
- (lambda ()
- (set! *sync-in-progress* #t)
- (debug:print-info "Running "sync-cmd)
- (if (file-exists? syncer-running-file)
- (debug:print-info 0 *default-log-port* "Syncer still running, skipping syncer start.")
- (system sync-cmd))
- (set! *sync-in-progress* #f)))))))
- (if ((if (eq? syncdir 'todisk) < >) ;; use less than for todisk, greater than for from disk
- (file-modification-time tmpdb)
- (file-modification-time dbfullname))
- (debug:print 4 *default-log-port* "Skipping sync, "tmpdb" older than "dbfullname)
- (if synclock-mod-time
- (if (> (- (current-seconds) synclock-mod-time) 20) ;; something wrong with sync, remove file
- (begin
- (handle-exceptions
- exn
- #f
- (begin
- (debug:print 0 *default-log-port* "Sync lock file " synclock-file "is older than 20 seconds (" synclock-mod-time " seconds). Removing it")
- (delete-file synclock-file)
- )
- )
- (thethread))
- (debug:print 0 *default-log-port* "Skipping sync, lockfile "synclock-file" found."))
- (thethread)))))))
+ (debug:print 0 *default-log-port* "WARNING: overlapping calls to sync to disk")
+ (begin
+ ;; turn off writes - send busy or block?
+ ;; call db2db internally
+ ;; turn writes back on
+ ;;
+ (set! *api-halt-writes* #t) ;; do we need a mutex?
+ ;; (dbmod:db-to-db-sync src-db dest-db last-update (dbfile:db-init-proc) keys)
+ (debug:print-info 2 *default-log-port* "Internal sync running from "tmpdb" to "dbfullname)
+ (dbmod:db-to-db-sync tmpdb dbfullname last-update (dbfile:db-init-proc) keys)
+ (set! *api-halt-writes* #f)
+ ))))
;; (dbmod:sync-tables tables #f db cachedb)
;;
(thread-sleep! 1) ;; let things settle before syncing in needed data
(dbmod:sync-gasket tables #f cachedb db dbfullname 'fromdest keys) ;; ) ;; load into cachedb
(dbr:dbstruct-last-update-set! dbstruct (+ (current-seconds) -10)) ;; should this be offset back in time by one second?
@@ -474,47 +462,63 @@
(set! has-last #t)))
dbh
(conc "SELECT name FROM pragma_table_info('"tablename"') as tblInfo;"))
has-last))
+(define (replace-question-marks-with-number str num)
+ (define (replace-helper str index result)
+ (if (>= index (string-length str))
+ result
+ (let ((char (string-ref str index)))
+ (if (char=? char #\?)
+ (replace-helper str (+ index 1) (string-append result (number->string num)))
+ (replace-helper str (+ index 1) (string-append result (string char)))))))
+
+ (replace-helper str 0 ""))
+
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; tbls is ( ("tablename" ( "field1" [#f|proc1] ) ( "field2" [#f|proc2] ) .... ) )
;;
;; direction = fromdest, todisk
;; mode = 'full, 'incr
;;
;; Idea: youngest in dest is last_update time
-;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
(define (dbmod:attach-sync tables dbh destdbfile direction #!key
(mode 'full)
(no-update '("keys")) ;; do
)
- (let* ((num-changes 0)
+ (debug:print-info 2 *default-log-port* "dbmod:attach-sync")
+ (let* ((num-changes 0)
(update-changed (lambda (num-changed table qryname)
(if (> num-changed 0)
(begin
(debug:print-info 0 *default-log-port* "Changed "num-changed" rows for table "table", qry "qryname)
(set! num-changes (+ num-changes num-changed)))))))
- (debug:print 0 *default-log-port* "Doing sync "direction" "destdbfile)
+ (debug:print 2 *default-log-port* "Doing sync "direction" "destdbfile)
(if (not (sqlite3:auto-committing? dbh))
(debug:print 0 *default-log-port* "Skipping sync due to transaction in flight.")
(let* ((table-names (map car tables))
(dest-exists (file-exists? destdbfile)))
(assert dest-exists "FATAL: sync called with non-existant file, "destdbfile)
;; attach the destdbfile
;; for each table
;; insert into dest. select * from src. where last_update>last_update
;; done
- (debug:print 0 *default-log-port* "Attaching "destdbfile" as auxdb")
+ (debug:print 2 *default-log-port* "Attaching "destdbfile" as auxdb")
(handle-exceptions
exn
(begin
(debug:print 0 "ATTACH failed, exiting. exn="(condition->list exn))
(exit 1))
(sqlite3:execute dbh (conc "ATTACH '"destdbfile"' AS auxdb;")))
(for-each
(lambda (table)
- (let* ((tbldat (alist-ref table tables equal?))
+ (let* ((dummy (debug:print 2 *default-log-port* "Doing table " table))
+ (tbldat (alist-ref table tables equal?))
(fields (map car tbldat))
(no-id-fields (filter (lambda (x)(not (equal? x "id"))) fields))
(fields-str (string-intersperse fields ","))
(no-id-fields-str (string-intersperse no-id-fields ","))
(dir (eq? direction 'todisk))
@@ -529,27 +533,38 @@
" SELECT * FROM "fromdb table";"))
(stmt2 (conc "INSERT OR IGNORE INTO "todb table
" SELECT * FROM "fromdb table" WHERE "fromdb table".id=?;"))
(stmt8 (conc "UPDATE "todb table" SET ("no-id-fields-str") = (SELECT "no-id-fields-str" FROM "fromdb table" WHERE "todb table".id="fromdb table".id"
(conc " AND "fromdb table".last_update > "todb table".last_update);")
- ");"))
- (stmt9 (conc "UPDATE "todb table" SET ("no-id-fields-str") = "
+ ");"))
+ (update-string (conc "UPDATE "todb table" SET "))
+ (split-update
+ (let ()
+ (for-each
+ (lambda (column)
+ (set! update-string (conc update-string column" = (SELECT "column" FROM "fromdb table" WHERE "fromdb table".id=?), "))
+ )
+ no-id-fields
+ )
+ ;; drop the last ", "
+ (conc (substring update-string 0 (-(string-length update-string) 2)) " WHERE "todb table".id=? ")
+ )
+ )
+
+
+ (stmt9 (conc "UPDATE "todb table" SET ("no-id-fields-str") = "
"(SELECT "no-id-fields-str" FROM "fromdb table" WHERE "fromdb table".id=?)"
" WHERE "todb table".id=?"))
(newrec (conc "SELECT id FROM "fromdb table" WHERE id NOT IN (SELECT id FROM "todb table");"))
- #;(changedrec (conc "SELECT id FROM "fromdb table" WHERE "fromdb table".last_update > "todb table".last_update AND "
- fromdb table".id="todb table".id;")) ;; main = fromdb
(changedrec (conc "SELECT "fromdb table".id FROM "fromdb table" join "todb table" on "fromdb table".id="todb table".id WHERE "fromdb table".last_update > "todb table".last_update;"))
- ;; SELECT main.tests.id FROM main.tests join auxdb.tests on main.tests.id=auxdb.tests.id WHERE main.tests.last_update > auxdb.tests.last_update;"
(start-ms (current-milliseconds))
(new-ids (sqlite3:fold-row (lambda (res id)(cons id res)) '() dbh newrec)))
- ;; (debug:print 0 *default-log-port* "Got "(length aux-ids)" in aux-ids and "(length main-ids)" in main-ids")
(update-changed (length new-ids) table "new records")
(mutex-lock! *db-transaction-mutex*)
(handle-exceptions
exn
- (debug:print 0 *default-log-port* "Transaction update of "table" failed.")
+ (debug:print 0 *default-log-port* "Transaction update of id fields in "table" failed.")
(sqlite3:with-transaction
dbh
(lambda ()
(for-each (lambda (id)
(sqlite3:execute dbh stmt2 id))
@@ -556,23 +571,40 @@
new-ids))))
(if (member "last_update" fields)
(handle-exceptions
exn
- (debug:print 0 *default-log-port* "Transaction update of "table" failed.")
+ (debug:print 0 *default-log-port* "Transaction update of non id fields in "table" failed.")
(sqlite3:with-transaction
dbh
(lambda ()
- (let* ((changed-ids (sqlite3:fold-row (lambda (res id)(cons id res)) '() dbh changedrec)))
+ (let* ((changed-ids (sqlite3:fold-row (lambda (res id)(cons id res)) '() dbh changedrec))
+ (sql-query "")
+ )
(update-changed (length changed-ids) table "changed records")
(for-each (lambda (id)
- (sqlite3:execute dbh stmt9 id id))
- changed-ids))))))
-
+ (let* ((update-with-ids (replace-question-marks-with-number split-update id))
+ )
+ (debug:print 2 *default-log-port* "about to do sqlite3:execute " dbh " " update-with-ids )
+ (handle-exceptions
+ exn
+ (debug:print 0 *default-log-port* "update from " fromdb table " to " todb table " failed: " ((condition-property-accessor 'exn 'message) exn))
+ (sqlite3:execute dbh update-with-ids)
+ )
+ (debug:print 2 *default-log-port* "after sqlite3:execute")
+ )
+ )
+ changed-ids
+ )
+ )
+ )
+ )
+ )
+ )
(mutex-unlock! *db-transaction-mutex*)
- (debug:print 0 *default-log-port* "Synced table "table
+ (debug:print 2 *default-log-port* "Synced table "table
" in "(- (current-milliseconds) start-ms)"ms")
))
table-names)
(sqlite3:execute dbh "DETACH auxdb;")))
@@ -627,11 +659,11 @@
(debug:print 0 *default-log-port* "stmt3="stmt3)
(if (sqlite3:auto-committing? dbh1)
(begin
(handle-exceptions
exn
- (debug:print 0 *default-log-port* "Transaction update of "table" failed.")
+ (debug:print 0 *default-log-port* "Transaction update of "table" failed. "(condition->list exn))
(sqlite3:with-transaction
dbh1
(lambda ()
(sqlite3:execute dbh1 stmt1) ;; get all new rows
@@ -856,6 +888,94 @@
(res (dbmod:sync-gasket tables last-update sdb ddb dest-db 'todisk keys)))
(sqlite3:finalize! sdb)
(sqlite3:finalize! ddb)
res)))
#f))
+
+;; ======================================================================
+;; dbstats
+;;======================================================================
+
+;; (define *dbstruct-dbs* #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this.
+;; db stats
+(define *db-stats* (make-hash-table)) ;; hash of vectors < count duration-total >
+(define *db-stats-mutex* (make-mutex))
+
+(define (dbmod:print-db-stats)
+ (let ((fmtstr "~40a~8-d~20-d~20,2-f")) ;; "~20,2-f"
+ (debug:print 0 *default-log-port* "DB Stats\n========")
+ (debug:print 0 *default-log-port* (format #f "~40a~8a~20a~10a" "Cmd" "Count" "TotTime" "Avg"))
+ (for-each (lambda (cmd)
+ (let* ((dat (hash-table-ref *db-stats* cmd))
+ (count (dbstat-cnt dat))
+ (tottime (dbstat-tottime dat)))
+ (debug:print 0 *default-log-port*
+ (format #f fmtstr cmd count tottime
+ (/ tottime count)))))
+ (sort (hash-table-keys *db-stats*)
+ (lambda (a b)
+ (> (dbstat-tottime (hash-table-ref *db-stats* a))
+ (dbstat-tottime (hash-table-ref *db-stats* b))))))))
+
+(defstruct dbstat
+ (cnt 0)
+ (tottime 0))
+
+(define (db:add-stats cmd run-id params delta)
+ (let* ((modified-cmd (if (eq? cmd 'general-call)
+ (string->symbol (conc "general-call-" (car params)))
+ cmd))
+ (rec (hash-table-ref/default *db-stats* modified-cmd #f)))
+ (if (not rec)
+ (let ((new-rec (make-dbstat)))
+ (hash-table-set! *db-stats* modified-cmd new-rec)
+ (set! rec new-rec)))
+ (dbstat-cnt-set! rec (+ (dbstat-cnt rec) 1))
+ (dbstat-tottime-set! rec (+ (dbstat-tottime rec) delta))))
+
+
+
)
+
+
+;; ATTIC
+
+ #;(let* ((syncer-logfile (conc areapath"/logs/"dbfname"-syncer.log"))
+ (sync-cmd (if (eq? syncdir 'todisk)
+ (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "tmpdb" -to "dbfullname" -period 5 -timeout 10 > /dev/null 2&>1)&")
+ (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "dbfullname" -to "tmpdb" -period 5 -timeout 10 > /dev/null 2&>1)&")))
+ (synclock-file (conc dbfullname".lock"))
+ (syncer-running-file (conc dbfullname"-sync-running"))
+ (synclock-mod-time (if (file-exists? synclock-file)
+ (handle-exceptions
+ exn
+ #f
+ (file-modification-time synclock-file))
+ #f))
+ (thethread (lambda ()
+ (thread-start!
+ (make-thread
+ (lambda ()
+ (set! *sync-in-progress* #t)
+ (debug:print-info "Running "sync-cmd)
+ (if (file-exists? syncer-running-file)
+ (debug:print-info 0 *default-log-port* "Syncer still running, skipping syncer start.")
+ (system sync-cmd))
+ (set! *sync-in-progress* #f)))))))
+ (if ((if (eq? syncdir 'todisk) < >) ;; use less than for todisk, greater than for from disk
+ (file-modification-time tmpdb)
+ (file-modification-time dbfullname))
+ (debug:print 4 *default-log-port* "Skipping sync, "tmpdb" older than "dbfullname)
+ (if synclock-mod-time
+ (if (> (- (current-seconds) synclock-mod-time) 20) ;; something wrong with sync, remove file
+ (begin
+ (handle-exceptions
+ exn
+ #f
+ (begin
+ (debug:print 0 *default-log-port* "Sync lock file " synclock-file "is older than 20 seconds (" synclock-mod-time " seconds). Removing it")
+ (delete-file synclock-file)
+ )
+ )
+ (thethread))
+ (debug:print 0 *default-log-port* "Skipping sync, lockfile "synclock-file" found."))
+ (thethread))))
Index: dcommon.scm
==================================================================
--- dcommon.scm
+++ dcommon.scm
@@ -59,10 +59,11 @@
update-mutex
updaters
updating
uidat ;; needs to move to tabdat at some time
hide-not-hide-tabs
+ target
)
(define (dboard:commondat-make)
(make-dboard:commondat
curr-tab-num: 0
@@ -70,19 +71,41 @@
please-update: #t
update-mutex: (make-mutex)
updaters: (make-hash-table)
updating: #f
hide-not-hide-tabs: #f
+ target: ""
))
;; RADT => Matrix defstruct addition
(defstruct dboard:graph-dat
((id #f) : string)
((color #f) : vector)
((flag #t) : boolean)
((cell #f) : number)
)
+
+;; RA => returns the tabdat stored at hashkey passed in commondat-tabdats table (e.g. 0 gives summary)
+;;
+(define (dboard:common-get-tabdat commondat #!key (tab-num #f))
+ (let* ((tnum (or tab-num
+ (dboard:commondat-curr-tab-num commondat)
+ 0)) ;; tab-num value is curr-tab-num value in passed commondat
+ (ht (dboard:commondat-tabdats commondat))
+ (res (hash-table-ref/default ht tnum #f)))
+ (or res
+ (let ((new-tabdat (dboard:tabdat-make-data)))
+ (hash-table-set! ht tnum new-tabdat)
+ new-tabdat))))
+
+;; RA => sets the tabdat passed to the hashkey at commondat:tabdats hash table
+;;
+(define (dboard:common-set-tabdat! commondat tabnum tabdat)
+ (hash-table-set!
+ (dboard:commondat-tabdats commondat)
+ tabnum
+ tabdat))
;; data for runs, tests etc. was used in run summary?
;;
(defstruct dboard:runsdat
;; new system
@@ -1138,14 +1161,28 @@
#:readonly "YES"
#:font "Courier New, -12"
)))
(dboard:tabdat-command-tb-set! data tb)
tb)
+
(iup:button "Execute" #:size "50x"
#:action (lambda (obj)
- ;; (let ((cmd (conc ;; "xterm -geometry 180x20 -e \""
- (common:run-a-command (iup:attribute (dboard:tabdat-command-tb data) "VALUE")))))))
+ (let ((cmd (iup:attribute (dboard:tabdat-command-tb data) "VALUE")))
+ (if (substring-index "no-runname-specified" cmd)
+ (debug:print 0 *default-log-port* "ERROR: no runname specified")
+ (begin
+ (if (substring-index "no-target-selected" cmd)
+ (debug:print 0 *default-log-port* "ERROR: no target selected")
+ (begin
+ (if (not (substring-index "-run" cmd))
+ (debug:print 0 *default-log-port* "ERROR: No target selected")
+ (common:run-a-command (iup:attribute (dboard:tabdat-command-tb data) "VALUE"))
+ )
+ )
+ )
+ )
+ )))))))
;; ";echo Press any key to continue;bash -c 'read -n 1 -s'\" &")))
;; (system cmd)))))))
(define (dcommon:command-action-selector commondat tabdat #!key (tab-num #f))
(iup:frame
@@ -1170,11 +1207,10 @@
(let* ((default-run-name (seconds->work-week/day (current-seconds)))
(tb (iup:textbox #:expand "HORIZONTAL"
#:action (lambda (obj val txt)
(debug:catch-and-dump
(lambda ()
- ;; (print "obj: " obj " val: " val " unk: " unk)
(dboard:tabdat-run-name-set! tabdat txt) ;; (iup:attribute obj "VALUE"))
(dashboard:update-run-command tabdat))
"command-runname-selector tb action"))
#:value (or default-run-name (dboard:tabdat-run-name tabdat))))
(lb (iup:listbox #:expand "HORIZONTAL"
@@ -1196,11 +1232,10 @@
(runs-dat (vector-ref runs-for-targ 1))
(run-names (cons default-run-name
(map (lambda (x)
(db:get-value-by-header x runs-header "runname"))
runs-dat))))
- ;; (print "DEBUGINFO: run-names=" run-names)
;; (iup:attribute-set! lb "REMOVEITEM" "ALL")
(iuplistbox-fill-list lb run-names selected-item: default-run-name))))))
;; (dboard:tabdat-updater-for-runs-set! tabdat refresh-runs-list)
(dboard:commondat-add-updater commondat refresh-runs-list tab-num: tab-num)
;; (refresh-runs-list)
Index: docs/manual/Makefile
==================================================================
--- docs/manual/Makefile
+++ docs/manual/Makefile
@@ -37,10 +37,13 @@
# dos2unix megatest_manual.html
megatest_manual.pdf : megatest_manual.txt *.txt *png *.dot
a2x -a toc -f pdf megatest_manual.txt
+%.pdf : %.dot
+ dot -Tpdf $*.dot -o$*.pdf
+
server.ps : server.dot
dot -Tps server.dot > server.ps
client.ps : client.dot
dot -Tps client.dot > client.ps
Index: docs/manual/megatest_manual.pdf
==================================================================
--- docs/manual/megatest_manual.pdf
+++ docs/manual/megatest_manual.pdf
cannot compute difference between binary files
Index: docs/manual/server.dot
==================================================================
--- docs/manual/server.dot
+++ docs/manual/server.dot
@@ -12,67 +12,68 @@
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Megatest. If not, see .
+
digraph G {
- subgraph cluster_1 {
- node [style=filled,shape=box];
-
- check_available_queue -> remove_entries_over_10s_old;
- remove_entries_over_10s_old -> set_available [label="num_avail < 3"];
- remove_entries_over_10s_old -> exit [label="num_avail > 2"];
-
- set_available -> delay_2s;
- delay_2s -> check_place_in_queue;
-
- check_place_in_queue -> "http:transport-launch" [label="at head"];
- check_place_in_queue -> exit [label="not at head"];
-
- "client:login" -> "server:shutdown" [label="login failed"];
- "server:shutdown" -> exit;
-
- subgraph cluster_2 {
- "http:transport-launch" -> "http:transport-run";
- "http:transport-launch" -> "http:transport-keep-running";
-
- "http:transport-keep-running" -> "tests running?";
- "tests running?" -> "client:login" [label=yes];
- "tests running?" -> "server:shutdown" [label=no];
- "client:login" -> delay_5s [label="login ok"];
- delay_5s -> "http:transport-keep-running";
- }
-
- // start_server -> "server_running?";
- // "server_running?" -> set_available [label="no"];
- // "server_running?" -> delay_2s [label="yes"];
- // delay_2s -> "still_running?";
- // "still_running?" -> ping_server [label=yes];
- // "still_running?" -> set_available [label=no];
- // ping_server -> exit [label=alive];
- // ping_server -> remove_server_record [label=dead];
- // remove_server_record -> set_available;
- // set_available -> avail_delay [label="delay 3s"];
- // avail_delay -> "first_in_queue?";
- //
- // "first_in_queue?" -> set_running [label=yes];
- // set_running -> get_next_port -> handle_requests;
- // "first_in_queue?" -> "dead_entry_in_queue?" [label=no];
- // "dead_entry_in_queue?" -> "server_running?" [label=no];
- // "dead_entry_in_queue?" -> "remove_dead_entries" [label=yes];
- // remove_dead_entries -> "server_running?";
- //
- // handle_requests -> start_shutdown [label="no traffic\nno running tests"];
- // handle_requests -> shutdown_request;
- // start_shutdown -> shutdown_delay;
- // shutdown_request -> shutdown_delay;
- // shutdown_delay -> exit;
-
- label = "server:launch";
- color=brown;
- }
-
-// client_start_server -> start_server;
-// handle_requests -> read_write;
-// read_write -> handle_requests;
-}
+ label = "Server Start Sequences";
+ color=brown;
+ rankdir="TB";
+
+ subgraph cluster_1 {
+ label="Find Prime Main Server";
+
+ node [style=filled,shape=box];
+
+ START;
+ HaveServ [label="Look at .servinfo\nfiles for prime main"];
+ AskPrime [label="Ask Prime for main"];
+ PingPrime [label="Ping Prime"];
+ AskPrime [label="Ask .servinfo prime for server"];
+ StartServ [label="Launch Server Process for main.db"];
+
+ START -> HaveServ;
+ HaveServ -> PingPrime;
+ PingPrime -> AskPrime [label="Got response"];
+ PingPrime -> StartServ [label="No reponse"];
+ HaveServ -> StartServ [label="No files"];
+ StartServ -> "Delay 2s" -> START;
+ AskPrime -> DONE;
+ }
+
+ subgraph cluster_2 {
+ label="Starting non-prime server"
+ node [style=filled,shape=box];
+ StartTCPServer [label="Start tcp server"];
+ FindPrimeMain [label="Find Prime Main Server"];
+ RegisterProcessViaPrime [label="Register process via prime server"];
+
+ StartTCPServer -> FindPrimeMain -> START;
+ DONE -> RegisterProcessViaPrime -> READY;
+ }
+
+ subgraph cluster_3 {
+ label="Start Prime Main"
+ node [style=filled,shape=box];
+ StartTCPServer_prime [label="Start tcp server"];
+ GetServInfoFiles [label="Get servinfo files"];
+ CreateServInfoFile [label="Create servinfo file"];
+ RegisterProcess [label="Register process in no-sync (direct access)"];
+ ValidateServInfoFiles [label="Validate servinfo files with ping\nremove any files which do not respond to ping"];
+
+ CheckHost [label="Verify that current host matches\nexisting servinfo files host"]
+ StartTCPServer_prime -> GetServInfoFiles;
+ GetServInfoFiles -> CreateServInfoFile [label="No servinfo\nfiles"];
+ GetServInfoFiles -> ValidateServInfoFiles;
+ ValidateServInfoFiles -> CreateServInfoFile [label="No valid files"];
+ CreateServInfoFile -> GetServInfoFiles [label="servinfo file created"];
+ KeepRunning [label="READY"];
+
+ ValidateServInfoFiles -> CheckHost;
+ CheckHost -> RegisterProcess [label="Have valid\nservinfo files and same host"];
+ RegisterProcess -> KeepRunning;
+ CheckHost -> EXIT [label="Not same host"];
+ }
+}
+
Index: docs/manual/server.png
==================================================================
--- docs/manual/server.png
+++ docs/manual/server.png
cannot compute difference between binary files
Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -236,11 +236,10 @@
(let loop ((minutes (calc-minutes))
(cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
(disk-free (get-df (current-directory)))
(last-sync (current-seconds)))
- ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
(let* ((over-time (> (current-seconds) (+ last-sync update-period)))
(new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
(delta (abs (- load cpu-load))))
(if (> delta 0.1) ;; don't bother updating with small changes
load
@@ -256,15 +255,16 @@
(do-sync (or new-cpu-load new-disk-free over-time))
(test-info (rmt:get-test-state-status-by-id run-id test-id))
(state (car test-info));; (db:test-get-state test-info))
(status (cdr test-info));; (db:test-get-status test-info))
+ (killreq (equal? state "KILLREQ"))
(kill-reason "no kill reason specified")
(kill-job? #f))
;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
(cond
- ((test-get-kill-request run-id test-id)
+ (killreq
(set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
(set! kill-job? #t))
((and runtlim (> (- (current-seconds) start-seconds) runtlim))
(set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
(set! kill-job? #t))
@@ -276,16 +276,11 @@
(debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
(if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty
(launch:handle-zombie-tests run-id))
(when do-sync
- ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
- ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
- ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds)))
- (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
- ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))
- )
+ (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
(if kill-job?
(begin
(debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
(mutex-lock! m)
@@ -331,20 +326,64 @@
)))
(mutex-unlock! m)
;; no point in sticking around. Exit now. But run end of run before exiting?
(launch:end-of-run-check run-id)
(exit)))
- (if (hash-table-ref/default misc-flags 'keep-going #f)
+ (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
(begin
- (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
- (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
- (loop (calc-minutes)
- (or new-cpu-load cpu-load)
- (or new-disk-free disk-free)
- (if do-sync (current-seconds) last-sync)))))))
- (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
-
+ (thread-sleep! 6) ;; was 3
+ (loop (calc-minutes)
+ (or new-cpu-load cpu-load)
+ (or new-disk-free disk-free)
+ (if do-sync (current-seconds) last-sync))))))
+ (tests:update-central-meta-info run-id test-id (commonmod:get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
+
+
+;; read testconfig and create .logpro and script files
+;; - use #f for tconfigreg to re-read the testconfigs from disk
+;;
+(define (launch:extract-scripts-logpro test-dir test-name item-path tconfigreg-in)
+ (let* ((tconfigreg (or tconfigreg-in
+ (tests:get-all)))
+ (tconfig-fname (conc test-dir "/.testconfig"))
+ (tconfig-tmpfile (conc tconfig-fname ".tmp"))
+ (tconfig (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs)))
+ (scripts (configf:get-section tconfig "scripts"))
+ (logpros (configf:get-section tconfig "logpro")))
+ ;; create .testconfig file
+ (configf:write-alist tconfig tconfig-tmpfile)
+ (file-move tconfig-tmpfile tconfig-fname #t)
+ (delete-file* ".final-status")
+
+ ;; extract scripts from testconfig and write them to files in test run dir
+ (for-each
+ (lambda (scriptdat)
+ (match scriptdat
+ ((name content)
+ (debug:print-info 2 *default-log-port* "Creating script "(current-directory)"/"name)
+ (with-output-to-file name
+ (lambda ()
+ (print content)))
+ (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu)))
+ (else
+ (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\""))))
+ scripts)
+
+ ;; extract logpro from testconfig and write them to files in test run dir
+ (for-each
+ (lambda (logprodat)
+ (match logprodat
+ ((name content)
+ (debug:print-info 2 *default-log-port* "Creating logpro file "(current-directory)"/"name".logpro")
+ (with-output-to-file (conc name".logpro")
+ (lambda ()
+ (print content)
+ ;; (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu))
+ )))
+ (else
+ (debug:print-info 0 "Invalid logpro definiton found in [logpro] section of testconfig. \"" logprodat "\""))))
+ logpros)))
(define (launch:execute encoded-cmd)
(let* ((cmdinfo (common:read-encoded-string encoded-cmd))
(tconfigreg #f))
(setenv "MT_CMDINFO" encoded-cmd)
@@ -616,12 +655,11 @@
(if mt-bindir-path (setenv "PATH" (conc tmppath":"mt-bindir-path))))
;;(bb-check-path msg: "launch:execute post block 4")
;; (change-directory top-path)
;; Can setup as client for server mode now
;; (client:setup)
-
-
+
;; environment overrides are done *before* the remaining critical envars.
(alist->env-vars env-ovrd)
;;(bb-check-path msg: "launch:execute post block 41")
(runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals)
;;(bb-check-path msg: "launch:execute post block 42")
@@ -647,37 +685,39 @@
(set! fullrunscript "xterm")
(if (and fullrunscript
(common:file-exists? fullrunscript)
(not (file-execute-access? fullrunscript)))
(system (conc "chmod ug+x " fullrunscript))))
-
- ;; We are about to actually kick off the test
- ;; so this is a good place to remove the records for
- ;; any previous runs
- ;; (db:test-remove-steps db run-id testname itemdat)
- ;; now is also a good time to write the .testconfig file
- (let* ((tconfig-fname (conc work-area "/.testconfig"))
- (tconfig-tmpfile (conc tconfig-fname ".tmp"))
- (tconfig (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs)))
- (scripts (configf:get-section tconfig "scripts")))
- ;; create .testconfig file
- (configf:write-alist tconfig tconfig-tmpfile)
- (file-move tconfig-tmpfile tconfig-fname #t)
- (delete-file* ".final-status")
-
- ;; extract scripts from testconfig and write them to files in test run dir
- (for-each
- (lambda (scriptdat)
- (match scriptdat
- ((name content)
- (with-output-to-file name
- (lambda ()
- (print content)
- (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu)))))
- (else
- (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\""))))
- scripts))
+ (launch:extract-scripts-logpro work-area test-name item-path tconfigreg)
+
+;;;;; ;; We are about to actually kick off the test
+;;;;; ;; so this is a good place to remove the records for
+;;;;; ;; any previous runs
+;;;;; ;; (db:test-remove-steps db run-id testname itemdat)
+;;;;; ;; now is also a good time to write the .testconfig file
+;;;;; (let* ((tconfig-fname (conc work-area "/.testconfig"))
+;;;;; (tconfig-tmpfile (conc tconfig-fname ".tmp"))
+;;;;; (tconfig (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs)))
+;;;;; (scripts (configf:get-section tconfig "scripts"))
+;;;;; (precmd (configf:lookup tconfig )
+;;;;; ;; create .testconfig file
+;;;;; (configf:write-alist tconfig tconfig-tmpfile)
+;;;;; (file-move tconfig-tmpfile tconfig-fname #t)
+;;;;; (delete-file* ".final-status")
+;;;;;
+;;;;; ;; extract scripts from testconfig and write them to files in test run dir
+;;;;; (for-each
+;;;;; (lambda (scriptdat)
+;;;;; (match scriptdat
+;;;;; ((name content)
+;;;;; (with-output-to-file name
+;;;;; (lambda ()
+;;;;; (print content)
+;;;;; (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu)))))
+;;;;; (else
+;;;;; (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\""))))
+;;;;; scripts))
;;
(let* ((m (make-mutex))
(kill-job? #f)
(exit-info (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. process was successfully run) rollup-status
@@ -694,11 +734,15 @@
(th2 (make-thread runit "run job"))
(tconfig (tests:get-testconfig test-name item-path tconfigreg #t))
(propagate-exit-code (configf:lookup *configdat* "setup" "propagate-exit-code"))
(propagate-status-list '("FAIL" "KILLED" "ABORT" "DEAD" "CHECK" "SKIP" "WAIVED"))
(test-status "not set")
- )
+ (precmd (configf:lookup tconfig "setup" "precmd"))
+ (postcmd (configf:lookup tconfig "setup" "postcmd")))
+ ;; first, if set, run the precmd
+ (if precmd ;; (file-exists? precmd)(file-execute-access? precmd))
+ (system precmd)) ;; up to test author to put nbfake if desired.
(set! job-thread th2)
(thread-start! th1)
(thread-start! th2)
(thread-join! th2)
(debug:print-info 0 *default-log-port* "Megatest execute of test " test-name ", item path " item-path " complete. Notifying the db ...")
@@ -762,10 +806,13 @@
(set! test-status (db:test-get-status (rmt:get-testinfo-state-status run-id test-id)))
;; If the propagate-exit-code option has been set in the megatest config, and the test status matches the list, set the exit code to 1.
+ (if postcmd
+ (system postcmd))
+
(if (and propagate-exit-code (string=? propagate-exit-code "yes") (member test-status propagate-status-list))
(begin
(debug:print 1 *default-log-port* "Setting exit status to 1 because of test status of " test-status)
(set! *globalexitstatus* 1)
)
Index: megatest-version.scm
==================================================================
--- megatest-version.scm
+++ megatest-version.scm
@@ -18,6 +18,6 @@
;; Always use two or four digit decimal
;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..
;; (declare (unit megatest-version))
-(define megatest-version 1.8017)
+(define megatest-version 1.8028)
Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -55,10 +55,12 @@
(declare (uses dbmod.import))
(declare (uses portlogger))
(declare (uses portlogger.import))
(declare (uses tcp-transportmod))
(declare (uses tcp-transportmod.import))
+(declare (uses apimod))
+(declare (uses apimod.import))
(declare (uses rmtmod))
(declare (uses rmtmod.import))
;; (declare (uses debugprint))
;; (declare (uses debugprint.import))
@@ -72,10 +74,11 @@
commonmod
dbfile
portlogger
tcp-transportmod
rmtmod
+ apimod
)
(define *db* #f) ;; this is only for the repl, do not use in general!!!!
(include "common_records.scm")
@@ -257,11 +260,13 @@
-debug N|N,M,O... : enable debug 0-N or N and M and O ...
-debug-noprop N|M,M,O...: enable debug but do not propagate to subprocesses via MT_DEBUG
-config fname : override the megatest.config file with fname
-append-config fname : append fname to the megatest.config file
-import-sexpr fname : import a sexpr file (use -list-runs % -dumpmode sexpr to create)
-
+ -remove-dbs all : remove Megatest DBs before importing sexpr. (Use only with -import-sexpr)
+ -regen-testfiles : regenerate scripts and logpro files from testconfig, run in test context
+
Utilities
-env2file fname : write the environment to fname.csh and fname.sh
-envcap a : save current variables labeled as context 'a' in file envdat.db
-envdelta a-b : output enviroment delta from context a to context b to -o fname
set the output mode with -dumpmode csh, bash or ini
@@ -378,10 +383,11 @@
"-envcap"
"-envdelta"
"-setvars"
"-set-state-status"
"-import-sexpr"
+ "-remove-dbs" ;; to be used only with -import-sexpr to remove megatest dbs first.
"-period" ;; sync period in seconds
"-timeout" ;; exit sync if timeout in seconds exceeded since last change
;; move runs stuff here
"-remove-keep"
@@ -466,10 +472,11 @@
"-local" ;; run some commands using local db access
"-generate-html"
"-generate-html-structure"
"-list-run-time"
"-list-test-time"
+ "-regen-testfiles"
;; misc queries
"-list-disks"
"-list-targets"
"-list-db-targets"
@@ -968,12 +975,14 @@
(tl (launch:setup))
(keys (keys:config-get-fields *configdat*)))
(case (rmt:transport-mode)
((tcp)
(let* ((timeout (server:expiration-timeout)))
- (debug:print 0 *default-log-port* "INFO: Running using tcp method with server timeout of "timeout)
+ (debug:print 0 *default-log-port* "INFO: megatest -server starting on " (get-host-name) " for " dbfname " using tcp method with timeout of "timeout)
(tt-server-timeout-param timeout)
+ (api:queue-processor)
+ (thread-start! (make-thread api:print-db-stats "print-db-stats"))
(if dbfname
(tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys)
(begin
(debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -db is required.")
(exit 1)))))
@@ -1044,16 +1053,16 @@
(if (args:get-arg "-kill-servers")
-
+
(let* ((tl (launch:setup)) ;; need this to initialize *toppath*
(servdir (tt:get-servinfo-dir *toppath*))
(servfiles (glob (conc servdir "/*:*.db")))
(fmtstr "~10a~22a~10a~25a~25a~8a\n")
- (dbfiles (append (glob (conc *toppath* "/.mtdb/main.db")) (glob (conc *toppath* "/.mtdb/?.db"))(glob (conc *toppath* "/.mtdb/??.db"))))
+ (dbfiles (if (file-exists? (conc *toppath* "/.mtdb/main.db")) (append (glob (conc *toppath* "/.mtdb/main.db")) (glob (conc *toppath* "/.mtdb/?.db"))(glob (conc *toppath* "/.mtdb/??.db"))) '()))
(ttdat (make-tt areapath: *toppath*))
)
(format #t fmtstr "DB" "host:port" "PID" "age" "last mod" "state")
(for-each
(lambda (dbfile)
@@ -1091,10 +1100,14 @@
sfiles
)
)
)
dbfiles
+ )
+ ;; remove this db, because otherwise metadata contains records for old servers, and this causes a problem with db:no-sync-get-lock-with-id.
+ (if (file-exists? (conc *toppath* "/.mtdb/no-sync.db"))
+ (delete-file (conc *toppath* "/.mtdb/no-sync.db"))
)
(set! *didsomething* #t)
(exit)
)
)
@@ -2115,10 +2128,25 @@
(paths (tests:test-get-paths-matching keys target (args:get-arg "-test-files"))))
(for-each (lambda (path)
(print path))
paths))))))
+;;======================================================================
+;; Utils for test areas
+;;======================================================================
+
+(if (args:get-arg "-regen-testfiles")
+ (if (getenv "MT_TEST_RUN_DIR")
+ (begin
+ (launch:setup)
+ (change-directory (getenv "MT_TEST_RUN_DIR"))
+ (let* ((testname (getenv "MT_TEST_NAME"))
+ (itempath (getenv "MT_ITEMPATH")))
+ (launch:extract-scripts-logpro (getenv "MT_TEST_RUN_DIR") testname itempath #f))
+ (set! *didsomething* #t))
+ (debug:print 0 *default-log-port* "ERROR: Must run -regen-testfiles in a test environment (i.e. test xterm from dashboard)")))
+
;;======================================================================
;; Archive tests
;;======================================================================
;; Archive tests matching target, runname, and testpatt
(if (equal? (args:get-arg "-archive") "replicate-db")
@@ -2132,13 +2160,13 @@
(exit 1)))
(if (common:file-exists? (conc *toppath* "/megatest.db"))
(begin
(debug:print-info 1 *default-log-port* "File " (conc *toppath* "/megatest.db") " already exists. Please remove it before trying to replicate db")
(exit 1)))
- (if (and (common:get-db-tmp-area) (> (length (directory (common:get-db-tmp-area) #f)) 0))
+ (if (and (common:make-tmpdir-name *toppath* "") (> (length (directory (common:make-tmpdir-name *toppath* "") #f)) 0))
(begin
- (debug:print-info 1 *default-log-port* (common:get-db-tmp-area) " not empty. Please remove it before trying to replicate db")
+ (debug:print-info 1 *default-log-port* (common:make-tmpdir-name *toppath* "") " not empty. Please remove it before trying to replicate db")
(exit 1)))
;; check if timestamp
(let* ((source (args:get-arg "-source"))
(src (if (not (equal? (substring source 0 1) "/"))
(conc (current-directory) "/" source)
@@ -2429,11 +2457,11 @@
(begin
(debug:print 0 *default-log-port* "Failed to setup, exiting")
(exit 1)))
;; keep this one local
;; (open-run-close patch-db #f)
- (let ((dbstructs (db:setup #f)))
+ (let ((dbstructs (db:setup)))
(common:cleanup-db dbstructs full: #t))
(set! *didsomething* #t)))
(if (args:get-arg "-cleanup-db")
(begin
@@ -2445,11 +2473,11 @@
;; (if (not (server:choose-server *toppath* 'home?))
;; (begin
;; (debug:print 0 *default-log-port* "Servers are not running on this host or no servers alive. Cannot run cleanup-db")
;; (exit 1)))
- (let ((dbstructs (db:setup #f)))
+ (let ((dbstructs (db:setup)))
(common:cleanup-db dbstructs))
(set! *didsomething* #t)))
(if (args:get-arg "-mark-incompletes")
(begin
@@ -2506,11 +2534,11 @@
(let* ((toppath (launch:setup))
(dbstructs (if (and toppath
;; NOTE: server:choose-server is starting a server
;; either add equivalent for tcp mode or ????
#;(server:choose-server toppath 'home?))
- (db:setup #t)
+ (db:setup)
#f))) ;; make-dbr:dbstruct path: toppath local: (args:get-arg "-local")) #f)))
(if *toppath*
(cond
((getenv "MT_RUNSCRIPT")
;; How to run megatest scripts
@@ -2597,27 +2625,44 @@
(if (args:get-arg "-import-megatest.db")
(begin
(launch:setup)
(db:multi-db-sync
- (db:setup #f)
+ (db:setup)
'killservers
'dejunk
'adj-testids
'old2new
)
(set! *didsomething* #t)))
(if (args:get-arg "-import-sexpr")
- (begin
- (launch:setup)
- (rmt:import-sexpr (args:get-arg "-import-sexpr"))
- (set! *didsomething* #t)))
+ (let*(
+ (toppath (launch:setup))
+ (tmppath (common:make-tmpdir-name toppath "")))
+ (if (file-exists? (conc toppath "/.mtdb"))
+ (if (args:get-arg "-remove-dbs")
+ (let* ((dbfiles (conc toppath "/.mtdb/* " tmppath "/*")))
+ (debug:print 0 *default-log-port* "Removing db files: " dbfiles)
+ (system (conc "rm -rvf " dbfiles))
+ )
+ (begin
+ (debug:print 0 *default-log-port* "ERROR: Cannot import sexpr with an existing DB present.")
+ (debug:print 0 *default-log-port* "Add '-remove-dbs all' to remove the current Megatest DBs.")
+ (set! *didsomething* #t)
+ (exit)
+ )
+ )
+ (debug:print 0 *default-log-port* "Did not find " (conc toppath "/.mtdb"))
+ )
+ (db:setup)
+ (rmt:import-sexpr (args:get-arg "-import-sexpr"))
+ (set! *didsomething* #t)))
(if (args:get-arg "-sync-to-megatest.db")
(let* ((duh (launch:setup))
- (dbstruct (db:setup #t))
+ (dbstruct (db:setup))
(tmpdbpth (dbr:dbstruct-tmppath dbstruct))
(lockfile (conc tmpdbpth ".lock"))
(locked (common:simple-file-lock lockfile))
(res (if locked
(db:multi-db-sync
@@ -2633,10 +2678,11 @@
(if (args:get-arg "-sync-to")
(let ((toppath (launch:setup)))
(tasks:sync-to-postgres *configdat* (args:get-arg "-sync-to"))
(set! *didsomething* #t)))
+
;; use with -from and -to
;;
(if (args:get-arg "-db2db")
(let* ((duh (launch:setup))
@@ -2650,18 +2696,19 @@
(sync-timeout (if sync-timeout-in (string->number sync-timeout-in) #f))
(lockfile (conc dest-db".sync-lock"))
(keys (db:get-keys #f))
(thesync (lambda (last-update)
(debug:print-info 0 *default-log-port* "Attempting to sync data from "src-db" to "dest-db"...")
+ (debug:print-info 0 *default-log-port* "PID = " (current-process-id))
(if (not (file-exists? dest-db))
(begin
(debug:print 0 *default-log-port* "Using copy to create "dest-db" from "src-db)
(file-copy src-db dest-db)
1)
(let ((res (dbmod:db-to-db-sync src-db dest-db last-update (dbfile:db-init-proc) keys)))
(if res
- (debug:print-info 0 *default-log-port* "Synced " res " records from "src-db" to "dest-db)
+ (debug:print-info 2 *default-log-port* "Synced " res " records from "src-db" to "dest-db)
(debug:print-info 0 *default-log-port* "No sync due to permissions or other issue."))
res))))
(start-time (current-seconds))
(synclock-mod-time (if (file-exists? lockfile)
(handle-exceptions
@@ -2673,11 +2720,17 @@
)
(if (and src-db dest-db)
(if (file-exists? src-db)
(if (and (file-exists? lockfile) (< age 20))
(debug:print 0 *default-log-port* "Lock "lockfile" exists, skipping sync...")
- (begin
+ (begin
+ (if (file-exists? lockfile)
+ (begin
+ (debug:print 0 *default-log-port* "Deleting old lock file " lockfile)
+ (delete-file lockfile)
+ )
+ )
(dbfile:with-simple-file-lock
lockfile
(lambda ()
(let loop ((last-changed (current-seconds))
(last-update 0))
@@ -2694,11 +2747,11 @@
(> sync-timeout (- now-time last-changed)))
(begin
(if sync-period (thread-sleep! sync-period))
(loop (if (> changes 0) now-time last-changed) now-time))))))))
(debug:print 0 *default-log-port* "Releasing lock file " lockfile)
- )
+ )
)
(debug:print 0 *default-log-port* "No sync due to unreadble or non-existant source file"src-db))
(debug:print 0 *default-log-port* "Usage for -db2db; -to and -from must be specified"))
(set! *didsomething* #t)))
Index: mt.scm
==================================================================
--- mt.scm
+++ mt.scm
@@ -15,11 +15,13 @@
;; You should have received a copy of the GNU General Public License
;; along with Megatest. If not, see .
;;
-(use sqlite3 srfi-1 posix regex regex-case srfi-69 dot-locking (srfi 18) posix-extras directory-utils call-with-environment-variables)
+(use sqlite3 srfi-1 posix regex regex-case srfi-69 dot-locking (srfi 18) posix-extras directory-utils
+ call-with-environment-variables)
+
(import (prefix sqlite3 sqlite3:))
(declare (unit mt))
(declare (uses debugprint))
(declare (uses db))
Index: portlogger.scm
==================================================================
--- portlogger.scm
+++ portlogger.scm
@@ -23,15 +23,47 @@
(declare (uses dbmod))
(module portlogger
*
-(import scheme chicken data-structures)
-(import srfi-1 posix srfi-69 hostinfo dot-locking z3
- (srfi 18) extras s11n)
+(import scheme)
+
+(cond-expand
+ (chicken-4
+ (import chicken data-structures)
+ (import posix
+ ;; hostinfo
+ ;; dot-locking
+ extras
+ )
+
+ (import (prefix sqlite3 sqlite3:))
+ (import debugprint dbmod)
+ )
+ (chicken-5
+ (import chicken.base
+ chicken.condition
+ chicken.file
+ chicken.pathname
+ chicken.process-context.posix
+ chicken.process
+ chicken.sort
+ chicken.string
+ chicken.time
+ chicken.random
+
+ system-information
+ )
+ (define file-write-access? file-writable?)
+ (define random pseudo-random-integer)
+ ))
+
+(import srfi-1 srfi-69 z3
+ (srfi 18) s11n)
(import (prefix sqlite3 sqlite3:))
(import debugprint dbmod)
+
;; lsof -i
(define (portlogger:open-db fname)
(let* ((avail (tasks:wait-on-journal fname 5 remove: #t)) ;; wait up to about 10 seconds for the journal to go away
(exists (file-exists? fname))
Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -67,83 +67,73 @@
(else #f)))
;;======================================================================
(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id
-
-;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
-;;
-(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected
- (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.")
-
- (if (not (eq? (rmt:transport-mode) 'nfs))
- (begin
- (if (> attemptnum 2)
- (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum))
-
- (cond
- ((> attemptnum 2) (thread-sleep! 0.05))
- ((> attemptnum 10) (thread-sleep! 0.5))
- ((> attemptnum 20) (thread-sleep! 1)))
-
- ;; I'm turning this off, it may make sense to move it
- ;; into http-transport-handler
- (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15)))
- (begin
- (debug:print 0 *default-log-port* "ERROR: can't connect to server, trying to start a server.")
- (case (rmt:transport-mode)
- ((http)
- (server:run *toppath*)
- (thread-sleep! 3))
- (else
- (thread-sleep! 1) ;; for tcp the server is started by routines in tcp-transportmod. For nfs there is no server
- ))))))
-
- ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
- ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
- ;; 3. do the query, if on homehost use local access
- ;;
- (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value
- (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
- (runremote (or area-dat
- *runremote*))
- (attemptnum (+ 1 attemptnum))
- (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*))
- (testsuite (common:get-testsuite-name))
- (mtexe (common:find-local-megatest)))
-
- (case (rmt:transport-mode)
- ((http)(http-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode))
- ((tcp) (tcp-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe))
- ((nfs) (nfs-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe))
- )))
-
-(define (nfs-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
- (let* ((keys (common:get-fields *configdat*))
- (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath tmpadj: "/dashboard")))
- (api:dispatch-request dbstruct cmd run-id params)))
-
-(define (tcp-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
- (if (not runremote)
- (let* ((newremote (make-and-init-remote areapath)))
- (set! *runremote* newremote)
- (set! runremote newremote)))
- (let* ((dbfname (conc (dbfile:run-id->dbnum run-id)".db"))) ;;(dbfile:run-id->path areapath run-id)))
- (tt:handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))
-
-(define (rmt:print-db-stats)
- (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f"
- (debug:print 18 *default-log-port* "DB Stats\n========")
- (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg"))
- (for-each (lambda (cmd)
- (let ((cmd-dat (hash-table-ref *db-stats* cmd)))
- (debug:print 18 *default-log-port* (format #f fmtstr cmd (vector-ref cmd-dat 0) (vector-ref cmd-dat 1) (/ (vector-ref cmd-dat 1)(vector-ref cmd-dat 0))))))
- (sort (hash-table-keys *db-stats*)
- (lambda (a b)
- (> (vector-ref (hash-table-ref *db-stats* a) 0)
- (vector-ref (hash-table-ref *db-stats* b) 0)))))))
-
+(define *ttdat* #f)
+;; how to make area-dat
+(define (rmt:set-ttdat areapath ttdat)
+ (if ttdat
+ ttdat
+ (if *ttdat*
+ *ttdat*
+ (begin
+ (debug:print-info 2 *default-log-port* "rmt:set-ttdat: Initialize new ttdat")
+ (let* ((newremote (make-and-init-remote areapath)))
+ (set! *ttdat* newremote)
+ newremote
+ )
+ )
+ )
+ )
+)
+
+;; NB// area-dat replaced by ttdat
+;;
+(define (rmt:send-receive cmd run-id params #!key (attemptnum 1)(ttdat #f))
+ (assert (or (not run-id) (number? run-id)) "FATAL: run-id is required to be a number or #f")
+ (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.")
+ (let* ((areapath *toppath*) ;; TODO - resolve from dbstruct to be compatible with multiple areas
+ (readonly-mode (rmtmod:calc-ro-mode ttdat *toppath*))
+ (testsuite (common:get-testsuite-name)))
+ (case (rmt:transport-mode)
+ ((tcp)
+ (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value
+ (attemptnum (+ 1 attemptnum))
+ (mtexe (common:find-local-megatest))
+ (dbfname (conc (dbfile:run-id->dbnum run-id)".db"))
+ (ttdat (rmt:set-ttdat areapath ttdat))
+ (conn (tt:get-conn ttdat dbfname))
+ (is-main (equal? dbfname "main.db")) ;; why not (not run-id) ?
+ (server-start-proc (if is-main
+ #f
+ (lambda ()
+ ;; (debug:print-info 0 *default-log-port* "starting server for dbfname: "dbfname)
+ (rmt:start-server ;; tt:server-process-run
+ areapath
+ testsuite ;; (dbfile:testsuite-name)
+ mtexe
+ run-id)))))
+ ;; here we look at ttdat, if dbfname is NOT main.db we check that a conn exists for it
+ ;; and if there is no conn we first send a request to the main.db server to start a
+ ;; server for the dbfname.
+ #;(if (and (not is-main)(not conn)) ;; no existing connection to non-main server, call in a start up request
+ (begin
+ (server-start-proc)
+ (thread-sleep! 1)))
+ (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc)))
+ ((nfs)
+ (nfs-transport-handler cmd run-id params attemptnum areapath readonly-mode testsuite))
+ (else
+ (debug:print-info 0 *default-log-port* "rmt:transport-mode is "(rmt:transport-mode))
+ (assert #f "FATAL: rmt:transport-mode set to invalid value.")))))
+
+(define (nfs-transport-handler cmd run-id params attemptnum areapath readonly-mode testsuite)
+ (let* ((keys (common:get-fields *configdat*))
+ (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath)))
+ (api:dispatch-request dbstruct cmd run-id params)))
+
(define (rmt:get-max-query-average run-id)
(mutex-lock! *db-stats-mutex*)
(let* ((runkey (conc "run-id=" run-id " "))
(cmds (filter (lambda (x)
(substring-index runkey x))
@@ -167,12 +157,12 @@
(mutex-unlock! *db-stats-mutex*)
res))
(define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5))
(let* ((qry-is-write (not (member cmd api:read-only-queries)))
- (db-file-path (db:dbfile-path)) ;; 0))
- (dbstructs-local (db:setup #t))
+ (db-file-path (common:make-tmpdir-name *toppath* "")) ;; 0))
+ (dbstructs-local (db:setup))
(read-only (not (file-write-access? db-file-path)))
(start (current-milliseconds))
(resdat (if (not (and read-only qry-is-write))
(let ((v (api:execute-requests dbstructs-local (vector (symbol->string cmd) params))))
;; (handle-exceptions ;; there has been a long history of receiving strange errors from values returned by the client when things go wrong..
@@ -204,11 +194,11 @@
;; (rmt:update-db-stats run-id cmd params duration)
;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it
(if qry-is-write
(let ((start-time (current-seconds)))
(mutex-lock! *db-multi-sync-mutex*)
-/ (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client)
+ (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client)
(mutex-unlock! *db-multi-sync-mutex*)))))
res))
;;======================================================================
;;
@@ -221,12 +211,12 @@
;;======================================================================
(define (rmt:kill-server run-id)
(rmt:send-receive 'kill-server run-id (list run-id)))
-(define (rmt:start-server run-id)
- (rmt:send-receive 'start-server 0 (list run-id)))
+(define (rmt:start-server areapath testsuite mtexe run-id) ;; run on main.db server
+ (rmt:send-receive 'start-server #f (list areapath testsuite mtexe run-id)))
;;======================================================================
;; M I S C
;;======================================================================
@@ -235,16 +225,16 @@
;; This login does no retries under the hood - it acts a bit like a ping.
;; Deprecated for nmsg-transport.
;;
;; (define (rmt:login-no-auto-client-setup runremote)
-;; (rmt:send-receive-no-auto-client-setup runremote 'login 0 (list *toppath* megatest-version (client:get-signature))))
+;; (rmt:send-receive-no-auto-client-setup runremote 'login #f (list *toppath* megatest-version (client:get-signature))))
;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host
(define (rmt:get-latest-host-load hostname)
- (rmt:send-receive 'get-latest-host-load 0 (list hostname)))
+ (rmt:send-receive 'get-latest-host-load #f (list hostname)))
(define (rmt:sdb-qry qry val run-id)
;; add caching if qry is 'getid or 'getstr
(rmt:send-receive 'sdb-qry run-id (list qry val)))
@@ -502,12 +492,12 @@
(rmt:send-receive 'delete-run #f (list run-id)))
(define (rmt:update-run-stats run-id stats)
(rmt:send-receive 'update-run-stats #f (list run-id stats)))
-(define (rmt:delete-old-deleted-test-records)
- (rmt:send-receive 'delete-old-deleted-test-records #f '()))
+(define (rmt:delete-old-deleted-test-records run-id)
+ (rmt:send-receive 'delete-old-deleted-test-records run-id (list run-id)))
(define (rmt:get-runs runpatt count offset keypatts)
(rmt:send-receive 'get-runs #f (list runpatt count offset keypatts)))
(define (rmt:simple-get-runs runpatt count offset target last-update)
Index: rmtmod.scm
==================================================================
--- rmtmod.scm
+++ rmtmod.scm
@@ -86,33 +86,41 @@
(define (rmt:import-run target run-dat)
(let* ((runname (car run-dat))
(all-dat (cdr run-dat))
(tests-data (alist-ref "data" all-dat equal?))
(run-meta (alist-ref "meta" all-dat equal?))
- (run-id (rmt:insert-run target runname run-meta)))
+ (run-id (string->number (alist-ref "id" run-meta equal?))))
+
+ (rmt:insert-run run-id target runname run-meta)
(for-each
(lambda (test-dat)
(let* ((test-id (car test-dat))
(test-rec (cdr test-dat)))
(rmt:insert-test run-id test-rec)))
tests-data)))
;; insert run if not there, return id either way
-(define (rmt:insert-run target runname run-meta)
+(define (rmt:insert-run run-id target runname run-meta)
;; look for id, return if found
(debug:print 0 *default-log-port* "Insert run: "target"/"runname)
(let* ((runs (rmtmod:send-receive 'simple-get-runs #f
;; runpatt count offset target last-update)
(list runname #f #f target #f))))
(if (null? runs)
- (rmtmod:send-receive 'insert-run #f (list target runname run-meta))
- (simple-run-id (car runs)))))
+ (begin
+ (debug:print 0 *default-log-port* "inserting run for runname " runname " target " target)
+ (rmtmod:send-receive 'insert-run #f (list run-id target runname run-meta))
+ )
+ (begin
+ (debug:print 0 *default-log-port* "Found run-id " (simple-run-id (car runs)) " for runname " runname " target " target)
+ (simple-run-id (car runs)
+ )
+ ))))
(define (rmt:insert-test run-id test-rec)
(let* ((testname (alist-ref "testname" test-rec equal?))
(item-path (alist-ref "item_path" test-rec equal?)))
- (debug:print 0 *default-log-port* " Insert test in run "run-id": "testname"/"item-path)
(rmtmod:send-receive 'insert-test run-id test-rec)))
;;======================================================================
;; T E S T S
;;======================================================================
@@ -192,21 +200,28 @@
(define (rmt:get-toplevels-and-incompletes run-id running-deadtime remotehoststart-deadtime)
(rmtmod:send-receive 'get-toplevels-and-incompletes run-id (list run-id running-deadtime remotehoststart-deadtime)))
+;; .final-status file is two lines:
+;; "state"
+;; "status"
+;;
(define (rmt:get-status-from-final-status-file run-dir)
(let ((infile (conc run-dir "/.final-status")))
- ;; first verify we are able to write the output file
+ ;; first verify we are able to read the output file
(if (not (file-read-access? infile))
(begin
(debug:print 2 *default-log-port* "ERROR: cannot read " infile)
(debug:print 2 *default-log-port* "ERROR: run-dir is " run-dir)
#f
)
- (with-input-from-file infile read-lines)
- )))
+ (let ((res (with-input-from-file infile read-lines)))
+ (if (null? res)
+ #f
+ res))))) ;; (string-split (car res))))))) <== I would have preferred a single line STATE STATUS without "'s
+ ;; (string-split (car res))))))) ;; DUNNO WHICH IS CORRECT
;; select end_time-now from
;; (select testname,item_path,event_time+run_duration as
;; end_time,strftime('%s','now') as now from tests where state in
;; ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));
Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -346,11 +346,11 @@
(args:get-arg "-one-pass"))
(exit 0))
(if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
- (let* ((num-running (rmt:get-count-tests-running run-id))
+ (let* ((num-running (rmt:get-count-tests-running-for-run-id run-id))
(num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
(job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup)))
(if (string? jobg-count)
(string->number jobg-count)
jobg-count))))
@@ -2064,11 +2064,11 @@
;;
;; There is now a single call to runs:update-all-test_meta and this
;; per-test call is not needed. Given the delicacy of the move to
;; v1.55 this code is being left in place for the time being.
;;
- (if (not (hash-table-ref/default *test-meta-updated* test-name #f))
+ (if (not (hash-table-exists? *test-meta-updated* test-name))
(begin
(hash-table-set! *test-meta-updated* test-name #t)
(runs:update-test_meta test-name test-conf)))
;; itemdat => ((ripeness "overripe") (temperature "cool") (season "summer"))
@@ -2434,10 +2434,12 @@
((kill-runs)
(tasks:kill-runner target run-name "%")
(debug:print 1 *default-log-port* "Killing tests for run: " runkey " " (db:get-value-by-header run header "runname"))
)
((remove-runs)
+ ;; use this location to cleanup old DELETED records? No. See below for same call
+ ;; (rmt:delete-old-deleted-test-records run-id)
;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10))
;; seek and kill in flight -runtests with % as testpatt here
;; (if (equal? testpatt "%")
(tasks:kill-runner target run-name testpatt)
;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt))
@@ -2724,11 +2726,11 @@
(debug:print 1 *default-log-port* "Removing target " target "run: " run-name)
(if (not keep-records)
(begin
(debug:print 1 *default-log-port* "Removing DB records for the run.")
(rmt:delete-run run-id)
- (rmt:delete-old-deleted-test-records))
+ (rmt:delete-old-deleted-test-records run-id))
)
(if (not (equal? linkspath "/does/not/exist/I"))
(begin
(debug:print 1 *default-log-port* "Recursively removing links dir " linkspath)
(runs:recursive-delete-with-error-msg linkspath)))
Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -728,11 +728,11 @@
;; #t
;; #f)))
;; timeout is hms string: 1h 5m 3s, default is 1 minute
;; This is currently broken. Just use the number of hours with no unit.
-;; Default is 60 seconds.
+;; Default is 600 seconds.
;;
(define (server:expiration-timeout)
(let* ((tmo (configf:lookup *configdat* "server" "timeout")))
(if (string? tmo)
(let* ((num (string->number tmo)))
Index: tasks.scm
==================================================================
--- tasks.scm
+++ tasks.scm
@@ -84,11 +84,11 @@
(tasks:open-db numretries (- numretries 1)))
(begin
(print-call-chain (current-error-port))
(debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
(debug:print 5 *default-log-port* " exn=" (condition->list exn))))
- (let* ((dbpath (db:dbfile-path )) ;; (tasks:get-task-db-path))
+ (let* ((dbpath (common:make-tmpdir-name *toppath* "")) ;; (tasks:get-task-db-path))
(dbfile (conc dbpath "/monitor.db"))
(avail (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away
(exists (common:file-exists? dbpath))
(write-access (file-write-access? dbpath))
(mdb (cond ;; what the hek is *toppath* doing here?
Index: tcp-transportmod.scm
==================================================================
--- tcp-transportmod.scm
+++ tcp-transportmod.scm
@@ -28,25 +28,52 @@
(use address-info tcp)
(module tcp-transportmod
*
- (import scheme
- (prefix sqlite3 sqlite3:)
- chicken
- data-structures
+(import scheme)
- address-info
- directory-utils
+(cond-expand
+ (chicken-4
+ (import (prefix sqlite3 sqlite3:)
+ chicken
extras
- files
hostinfo
+
+ ports
+ posix
+ files
+ data-structures
+ tcp
+ ))
+ (chicken-5
+ (import chicken.base
+ chicken.condition
+ chicken.file
+ chicken.pathname
+ chicken.process-context.posix
+ chicken.process
+ chicken.sort
+ chicken.string
+ chicken.time
+ chicken.tcp
+ chicken.random
+ chicken.file.posix
+ chicken.pretty-print
+ chicken.io
+ chicken.port
+ chicken.process-context
+
+ system-information)
+ (define unsetenv unset-environment-variable!)
+ ))
+
+ (import address-info
+ directory-utils
matchable
md5
message-digest
- ports
- posix
regex
regex-case
s11n
srfi-1
srfi-18
@@ -53,11 +80,10 @@
srfi-4
srfi-69
stack
typed-records
tcp-server
- tcp
debugprint
commonmod
dbfile
dbmod
@@ -110,11 +136,12 @@
;; parameters
;;
(define tt-server-timeout-param (make-parameter 600))
;; make ttdat visible
-(define *server-info* #f)
+;; (define *server-info* #f) ;; get this from commonmod
+(define *server-run* #t)
(define (tt:make-remote areapath)
(make-tt areapath: areapath))
;; 1 ... or #f
@@ -125,33 +152,46 @@
(and (or (number? run-id)
(not run-id))
(equal? (dbfile:run-id->dbfname run-id) dbfname)))
(tcp-buffer-size 2048)
-;; (max-connections 4096)
+;; (max-connections 4096)
+
+(define (tt:get-conn ttdat dbfname)
+ (hash-table-ref/default (tt-conns ttdat) dbfname #f))
;; do all the busy work of finding and setting up conn for
;; connecting to a server
;;
-(define (tt:client-connect-to-server ttdat dbfname run-id testsuite)
+(define (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)
(assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id)
- (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f))
- (server-start-proc (lambda ()
- (tt:server-process-run
- (tt-areapath ttdat)
- testsuite ;; (dbfile:testsuite-name)
- (common:find-local-megatest)
- run-id))))
+ (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id)
+ (let* ((conn (tt:get-conn ttdat dbfname))
+ (server-start-proc (or server-start-proc
+ (lambda ()
+ (assert (equal? dbfname "main.db") ;; only main.db is started here
+ "FATAL: called server-start-proc for db other than main.db")
+ (tt:server-process-run
+ (tt-areapath ttdat)
+ testsuite ;; (dbfile:testsuite-name)
+ (common:find-local-megatest)
+ run-id)))))
(if conn
(begin
- ; (debug:print-info 0 *default-log-port* "already connected to the server")
+ (debug:print-info 2 *default-log-port* "already connected to a server")
conn) ;; we are already connected to the server
- (let* ((sdat (tt:get-current-server-info ttdat dbfname)))
- (match sdat
+
+ ;; no conn
+ (let* ((sdats (tt:get-server-info-sorted ttdat dbfname))
+ (sdat (if (null? sdats)
+ #f
+ (car sdats))))
+ (debug:print-info 2 *default-log-port* "found sdat " sdat)
+ (match sdat
((host port start-time server-id pid dbfname2 servinffile)
(assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.")
- ;(debug:print-info 0 *default-log-port* "in match servinffile:" servinffile)
+ (debug:print-info 2 *default-log-port* "no conn - in match servinffile:" servinffile)
(let* ((host-port (conc host":"port))
(conn (make-tt-conn
host: host
port: port
host-port: host-port
@@ -162,36 +202,52 @@
pid: pid)))
;; verify we can talk to this server
(let* ((result (tt:timed-ping host port server-id))
(ping-res (car result))
(ping (cdr result)))
- (debug:print-info 0 *default-log-port* "ping time: " ping)
+ (debug:print-info 2 *default-log-port* "host " host " port " port " ping time: " ping " result " ping-res)
(case ping-res
((running)
+ (debug:print-info 2 *default-log-port* "Setting conn = " conn " in hash table")
(hash-table-set! (tt-conns ttdat) dbfname conn) ;;; is this ok to save before validating that the connection is good?
conn)
((starting)
(thread-sleep! 0.5)
- (tt:client-connect-to-server ttdat dbfname run-id testsuite))
+ (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect")
+ (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))
(else
(let* ((curr-secs (current-seconds)))
;; rm the (last server) would go here
(if (> (- curr-secs (tt-last-serv-start ttdat)) 10)
(begin
+ (debug:print-info 0 *default-log-port* "Unreachable server at "
+ host":"port" with servinfo file "servinffile", removing it")
+ (if (file-exists? servinffile)
+ (handle-exceptions
+ exn
+ #f
+ (delete-file servinffile)))
(tt-last-serv-start-set! ttdat curr-secs)
- (server-start-proc))) ;; start server if 30 sec since last attempt
+ (debug:print-info 0 *default-log-port* "Starting a new server on " (get-host-name))
+ (server-start-proc))) ;; start server if 10 sec since last attempt
(thread-sleep! 1)
- (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
+ (debug:print-info 0 *default-log-port* "Retrying connect")
+ (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)))))))
+
(else ;; no good server found, if haven't started server in > 5 secs, start another
- (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 5) ;; BUG - grow this number really do not want to swamp the machine with servers
+ (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers
(begin
- (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
+ (debug:print-info 0 *default-log-port* "Starting server for "dbfname " on " (get-host-name))
(server-start-proc)
- (tt-last-serv-start-set! ttdat (current-seconds))))
+ (tt-last-serv-start-set! ttdat (current-seconds))
+ (thread-sleep! 6)
+ ))
(thread-sleep! 1)
- (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
+ (debug:print-info 0 *default-log-port* "Connect to server from " (get-host-name) " for " dbfname)
+ (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)))))))
+;; returns ( result . ping_time )
(define (tt:timed-ping host port server-id)
(let* ((start-time (current-milliseconds))
(result (tt:ping host port server-id)))
(cons result (- (current-milliseconds) start-time))))
@@ -222,18 +278,18 @@
;; client side handler
;;
;;(tt:handler # get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
;;
-(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)
- ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.
- (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f)))
+(define (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc)
+ ;; connect-to-server will start a server if needed.
+ (let* ((areapath (tt-areapath ttdat))
+ (conn (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))) ;; looks up conn keyed by dbfname
(if conn
;; have connection, call the server
(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
;; res is (status errmsg result meta)
- ; (debug:print 0 *default-log-port* "conn:" conn " res: " res)
(match res
((status errmsg result meta)
(if (list? meta)
(let* ((delay-wait (alist-ref 'delay-wait meta)))
(if (and (number? delay-wait)
@@ -241,35 +297,36 @@
(begin
(debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds")
(thread-sleep! delay-wait)))))
(case status
((busy) ;; result will be how long the server wants you to delay
- (let* ((dly (if (number? result) result 0.1)))
- (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, will try again in "dly" seconds.")
+ (let* ((raw-dly (if (number? result) result 0.1))
+ (dly (+ raw-dly (/ attemptnum 10)))) ;; (* raw-dly (/ attemptnum 2))))
+ (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, cmd is "cmd", will try again in "dly" seconds. This is attempt "(- attemptnum 1))
(thread-sleep! dly)
- (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)))
+ (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)))
((loaded)
(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.")
(tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn))
- result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+ result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe))
(else
result)))
(else ;; did not receive properly formated result
- (if (not res) ;; tt:handler is telling us that communication failed
+ (if (not res) ;; tt:send-receive telling us that communication failed
(let* ((host (tt-conn-host conn))
(port (tt-conn-port conn))
;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
(pid (tt-conn-pid conn))
;;(servinf (tt-conn-servinf-file conn)))
(servinf (tt-servinf-file ttdat))) ;; (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname))) ;; TODO, use (server:get-servinfo-dir areapath)
- (hash-table-set! (tt-conns ttdat) dbfname #f)
+ (hash-table-set! (tt-conns ttdat) dbfname #f) ;; clear out the conn for this dbfname to force finding new server
(if (and servinf (file-exists? servinf))
(begin
(if (< attemptnum 10)
(begin
(thread-sleep! 0.5)
- (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+ (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))
(begin
(debug:print 0 *default-log-port* "INFO: no response from server "host":"port" for "dbfname)
(if (and (file-exists? servinf)
(> (- (current-seconds)(file-modification-time servinf)) 60))
(begin
@@ -276,33 +333,30 @@
(debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.")
(handle-exceptions
exn
#f
(delete-file* servinf))
- (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+ (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))
(begin
;; start server - addressed in client-connect-to-server
;; delay - addressed in client-connect-to-server
;; try again
(thread-sleep! 0.25) ;; dunno, I think this needs to be here
- (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+ (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))
))))
(begin ;; no server file, delay and try again
- (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf)
+ (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", no servinf file. Server exited? ")
(thread-sleep! 0.5)
- (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))))
+ (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))))
(begin ;; this case is where res is malformed. Probably should abort
(assert #f "FATAL: tt:handler received bad data "res)
;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.")
- ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)
+ ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe)
)))))
(begin
(thread-sleep! 1) ;; no conn yet set up, give it a rest and try again
- (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))))
-
-(define (tt:bid-for-servership run-id)
- #f)
+ (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc)))))
;; gets server info and appends path to server file
;; sorts by age, oldest first
;;
;; returns list of (host port startseconds server-id servinfofile)
@@ -325,24 +379,10 @@
(debug:print 2 *default-log-port* "SERVER #"count": "(string-intersperse (map conc sorted) ", ")))
(set! count (+ count 1)))
sorted)
sorted))
-(define (tt:get-current-server-info ttdat dbfname)
- (assert (tt-areapath ttdat) "FATAL: areapath not set in ttdat.")
- ;;
- ;; TODO - replace most of below with tt;get-server-info-sorted
- ;;
- (let* ((areapath (tt-areapath ttdat))
- (sfiles (tt:find-server areapath dbfname))
- (sdats (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read
- (sorted (sort sdats (lambda (a b)
- (< (list-ref a 2)(list-ref b 2))))))
- (if (null? sorted)
- #f ;; we'll want to wait until extra servers have exited
- (car sorted))))
-
(define (tt:send-receive ttdat conn cmd run-id params)
(let* ((host-port (tt-conn-host-port conn)) ;; (conc (tt-conn-host conn)":"(tt-conn-port conn)))
(host (tt-conn-host conn))
(port (tt-conn-port conn))
(dat (list cmd run-id params #f))) ;; no meta data yet
@@ -379,18 +419,18 @@
(- wait-delay adj))
0)))
(if (> new-wait 0)
(begin
(if (common:low-noise-print 10 "delay wait message")
- (debug:print-info 0 *default-log-port* "Server loaded, DelayWait: "new-wait))
+ (debug:print-info 0 *default-log-port* "Server on host " host " loaded, DelayWait: "new-wait))
(tt:backoff-wait-delay-set! bkoff new-wait)
(tt:backoff-last-adj-t-set! bkoff (current-seconds))
(thread-sleep! new-wait))
(hash-table-delete! *tt:backoff-smoothing* host-port))))))
(define (tt:send-receive-direct host port dat #!key (ping-mode #f)(tries-remaining 25))
- (assert (number? port) "FATAL: tt:send-receive-direct called with port not a number "port)
+ (assert (number? port) "FATAL: tt:send-receive-direct called with a port that is not a number "port)
(tt:backoff-decr-and-wait host port)
(let* ((retry (lambda ()
(tt:send-receive-direct host port dat tries-remaining: (- tries-remaining 1))))
(full-err-print (lambda (exn msg)
(if (condition? exn)
@@ -457,244 +497,211 @@
;; start the listener and start responding to requests
;;
;; NOTE: organise by dbfname, not run-id so we don't need
;; to pull in more modules
;;
-;; This is the routine called in megatest.scm to start a server
+;; This is the routine called in megatest.scm to start a server. NOTE: sequence is different for main.db vs. X.db
;;
;; Server viability is checked in keep-running. Blindly start and run here.
;;
(define (tt:start-server areapath run-id dbfname-in handler keys)
(assert areapath "FATAL: areapath not provided for tt:start-server")
- ;; is there already a server for this dbfile? Then exit.
(let* ((ttdat (make-tt areapath: areapath))
- (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id)))
- (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead
- (if (> (length servers) 4)
- (begin
- (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.")
- (exit))
- (let* ((dbstruct (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
- (tt-handler-set! ttdat (handler dbstruct))
- (let* ((tcp-thread (make-thread
- (lambda ()
- (tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data
- "tcp-server-thread"))
- (run-thread (make-thread
- (lambda ()
- (tt:keep-running ttdat dbfname dbstruct)))))
- (thread-start! tcp-thread)
- (thread-start! run-thread)
-
- (let* ((areapath (tt-areapath ttdat))
- (nosyncdbpath (conc areapath"/.mtdb")))
- ;; this didn't seem to work, is port not available yet?
- (let loop ((count 0))
- (if (tt-port ttdat)
- (begin
- (procinf-port-set! *procinf* (tt-port ttdat))
- (procinf-dbname-set! *procinf* dbfname)
- (dbfile:with-no-sync-db
- nosyncdbpath
- (lambda (nsdb)
- (dbfile:insert-or-update-process nsdb *procinf*))))
- (if (< count 5)
- (begin
- (thread-sleep! 0.5)
- (loop (+ count 1)))
- (debug:print 0 *default-log-port* "ERROR: (tt-port ttdat) no port set!"))))
-
- (thread-join! run-thread) ;; run thread will exit on timeout or other conditions
- ;; replace with call to (dbfile:set-process-done nsdb host pid reason)
- (procinf-status-set! *procinf* "done")
- (procinf-end-set! *procinf* (current-seconds))
- (dbfile:with-no-sync-db
- nosyncdbpath
- (lambda (nsdb)
- (dbfile:insert-or-update-process nsdb *procinf*)))
- (debug:print 0 *default-log-port* "Exiting now.")
- (exit)))))))
+ (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))))
+ (set! *server-info* ttdat)
+ (let* ((dbstruct (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
+ (tt-handler-set! ttdat (handler dbstruct))
+ (let* ((servinf-created #f)
+ (tcp-thread (make-thread
+ (lambda ()
+ ;; NOTE: tt-port and tt-host are set in connect-listener which is called under tt:start-tcp-server
+ (tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data
+ "tcp-server-thread"))
+ (run-thread (make-thread
+ (lambda ()
+ (tt:keep-running ttdat dbfname dbstruct)))))
+ (thread-start! tcp-thread)
+
+ (let* ((areapath (tt-areapath ttdat))
+ (nosyncdbpath (conc areapath"/.mtdb"))
+ (servers ;; (tt:find-server areapath dbfname)))
+ (tt:get-server-info-sorted ttdat dbfname)) ;; (host port startseconds server-id servinfofile)
+ (good-srvrs
+ ;; contact servers via ping, if no response remove the .servinfo file
+ (let loop ((servrs servers)
+ (prime-host #f)
+ (result '()))
+ (if (null? servrs)
+ (reverse result)
+ (let* ((servdat (car servrs)))
+ (match servdat
+ ((host port startseconds server-id servinfofile)
+ (let* ((ping-res (tt:timed-ping host port server-id))
+ (good-ping (match ping-res
+ ((result . ping-time)
+ (not result)) ;; we couldn't reach the server or it was not a megatest server
+ (else #f))) ;; the ping failed completely?
+ (same-host (or (not prime-host) ;; i.e. this is the first host
+ (equal? prime-host host)))
+ (keep-srv (and good-ping same-host)))
+ (if keep-srv
+ (loop (cdr servrs)
+ host
+ (cons servdat result))
+ (begin
+ (handle-exceptions
+ exn
+ (debug:print-info 0 *default-log-port* "Error removing server info file: "servinfofile", "
+ (condition->list exn))
+ (delete-file* servinfofile))
+ (loop (cdr servrs) prime-host result)))))
+ (else
+ ;; can't delete it as we don't have a filename. NOTE: Should really never get here.
+ (debug:print-info 0 *default-log-port* "ERROR: bad servinfo record \""servdat"\"")
+ (loop (cdr servrs) prime-host result)) ;; drop
+ )))))
+ (home-host (if (null? good-srvrs)
+ #f
+ (caar good-srvrs))))
+ ;; by here we have a trustworthy list of servers and we have removed the .servinfo file for any unresponsive servers
+ ;; and the list is in good-srvrs
+ (cond
+ ((not home-host) ;; no servers yet, go ahead and start
+ (debug:print-info 0 *default-log-port* "No servers yet, starting on "(get-host-name)))
+ ((> (length good-srvrs) 2) ;; don't need more, just exit
+ (debug:print-info 0 *default-log-port* "Have "(length good-srvrs)", no need for more, exiting.")
+ (exit))
+ ((not (equal? home-host (get-host-name))) ;; there is a home-host and we are not on it
+ (debug:print-info 0 *default-log-port* "Prime main server is on host "home-host", but we are on host "(get-host-name)", exiting.")
+ (exit))
+ (else
+ (debug:print-info 0 *default-log-port* "Starting on host "(get-host-name)", along with "(length good-srvrs)" other servers.")))
+
+ ;; this didn't seem to work, is port not available yet?
+ (let loop ((count 0))
+ (if (tt-port ttdat)
+ (begin
+ (procinf-port-set! *procinf* (tt-port ttdat))
+ (procinf-dbname-set! *procinf* dbfname)
+ (dbfile:with-no-sync-db
+ nosyncdbpath
+ (lambda (nsdb)
+ (dbfile:insert-or-update-process nsdb *procinf*))))
+ (if (< count 10)
+ (begin
+ (thread-sleep! 0.25)
+ (loop (+ count 1)))
+ (begin
+ (debug:print 0 *default-log-port* "ERROR: (tt-port ttdat) no port set! Exiting.")
+ (exit)))))
+
+ ;; create a servinfo file start keep-running
+ (debug:print 0 *default-log-port* "Creating servinfo file for " dbfname)
+ (tt:create-server-registration-file ttdat dbfname)
+ (procinf-status-set! *procinf* "running")
+ (tt-state-set! ttdat 'running)
+ (dbfile:with-no-sync-db
+ nosyncdbpath
+ (lambda (nsdb)
+ (dbfile:insert-or-update-process nsdb *procinf*)))
+ (thread-start! run-thread)
+
+ (thread-join! run-thread) ;; run thread will exit on timeout or other conditions
+
+ ;; (tcp-close (tt-socket ttdat)) ;; close up ports here
+
+ ;; replace with call to (dbfile:set-process-done nsdb host pid reason)
+ (procinf-status-set! *procinf* "done")
+ (procinf-end-set! *procinf* (current-seconds))
+ ;; either convert this to use set-process-done or get rid of set-process-done
+ (dbfile:with-no-sync-db
+ nosyncdbpath
+ (lambda (nsdb)
+ (dbfile:insert-or-update-process nsdb *procinf*)))
+ (debug:print 0 *default-log-port* "Exiting now.")
+ (exit))))))
(define (tt:keep-running ttdat dbfname dbstruct)
- ;; verfiy conn for ready
- ;; listener socket has been started by this stage
- ;; wait for a port before creating the registration file
- ;;
- (let* ((db-locked-in #f)
- (areapath (tt-areapath ttdat))
- (nosyncdbpath (conc areapath"/.mtdb"))
- (cleanup (lambda ()
- (if (tt-cleanup-proc ttdat)
- ((tt-cleanup-proc ttdat)))
- (dbfile:with-no-sync-db nosyncdbpath
- (lambda (db)
- (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct)))
- (debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname)
- (db:no-sync-del! db dbfname)
- #;(if dbtmpname
- (delete-file dbtmpname))))))))
- (set! *server-info* ttdat)
- (let loop ((count 0))
- (if (> count 240)
- (begin
- (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.")
- (exit 1))
- (if (not (tt-port ttdat)) ;; no connection yet
- (begin
- (thread-sleep! 0.25)
- (loop (+ count 1))))))
-
- (tt:create-server-registration-file ttdat dbfname)
- ;; now start watching the last-access, if it hasn't been touched
- ;; in over ten seconds we exit
- (thread-sleep! 0.05) ;; any real need for delay here?
+
+ ;; at this point the server is running and responding to calls, we just monitor
+ ;; for db calls and exit if there are none.
+
+ ;; if I am not in the first 3 servers, exit
+ (let* ((start-time (current-seconds)))
(let loop ()
- (let* ((servers (tt:get-server-info-sorted ttdat dbfname))
- (ok (cond
- ((null? servers) #f) ;; not ok
- ((equal? (list-ref (car servers) 6) ;; compare the servinfofile
- (tt-servinf-file ttdat))
- (let* ((res (if db-locked-in
- #t
- (let* ((lock-result ;; this is the primary lock - need to double verify that got it
- (dbfile:with-no-sync-db
- nosyncdbpath
- (lambda (db)
- (db:no-sync-lock-and-check db dbfname
- (tt-servinf-file ttdat)
- ;; (dbr:dbstruct-dbtmpname dbstruct)
- ))))
- (success (car lock-result)))
- (if success
- (begin
- (tt-state-set! ttdat 'running)
- (debug:print 0 *default-log-port* "Got server lock for " dbfname)
- (set! db-locked-in #t)
- #t)
- (begin
- (debug:print 0 *default-log-port* "Failed to get server lock for "dbfname)
- #f))))))
- (if (and res (common:low-noise-print 120 "top server message"))
- (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for "
- dbfname" on "(tt-host ttdat)":"(tt-port ttdat)))
- res))
- (else
- (debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
- (let* ((leadsrv (car servers)))
- (match leadsrv
- ((host port startseconds server-id pid dbfname servinfofile)
- (let* ((result (tt:timed-ping host port server-id))
- (res (car result))
- (ping (cdr result)))
- (debug:print-info 0 *default-log-port* "Ping to "host":"port", with server-id "server-id
- ", and file "servinfofile" returned "res)
- (if res
- #f ;; not the server, but all good, want to exit
- (if (and (file-exists? servinfofile)
- (> (- (current-seconds)(file-modification-time servinfofile)) 30))
- (begin
- ;; can't ping and file has been on disk 15 seconds, go ahead and try to remove it
- (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile)
- (handle-exceptions
- exn
- (debug:print-info 0 *default-log-port* "Error removing server info file: "servinfofile)
- (delete-file* servinfofile)
- )
- #t) ;; not the server but the server is not reachable
- (begin
- (debug:print 0 *default-log-port* "I'm not the server but could not ping "host":"port", will try again.")
- (thread-sleep! 1) ;; just because
- #t)))))
- (else ;; should never get here
- (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv)
- (assert #f "Bad server record "leadsrv))))))))
+ (let* ((servers (tt:get-server-info-sorted ttdat dbfname))
+ (home-host (if (null? servers)
+ #f
+ (caar servers)))
+ (my-index (list-index (lambda (x)
+ (equal? (list-ref x 6)
+ (tt-servinf-file ttdat)))
+ servers))
+ (ok (cond
+ ((not (number? my-index))
+ (debug:print 0 *default-log-port* "ERROR: bad server data in "servers", might be due to host misconfiguration such as bad IP address in /etc/hosts.")
+ #f)
+ ((not *server-run*)
+ (debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.")
+ #f)
+ ((null? servers)
+ (debug:print 0 *default-log-port* "WARNING: no servinfo files found, this cannot be.")
+ #f) ;; not ok
+ ((> my-index 2)
+ (debug:print 0 *default-log-port* "WARNING: there are more than two servers ahead of me, I'm not needed, exiting.")
+ #f) ;; not ok to not be in first three
+ ((eq? (tt-state ttdat) 'running) #t) ;; we are good to keep going
+ ((> (- (current-seconds) start-time) 30)
+ (debug:print 0 *default-log-port* "WARNING: over 30 seconds and not yet in runnning mode. Exiting.")
+ #f)
+ (else #t))))
(if ok
(tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access
(begin
(debug:print 0 *default-log-port* "Exiting immediately")
- (cleanup)
+ (tt:shutdown-server ttdat)
(exit)))
(let* ((last-update (dbr:dbstruct-last-update dbstruct))
(curr-secs (current-seconds)))
(if (and (eq? (tt-state ttdat) 'running)
- (> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db?
- (begin
- (set! (file-modification-time (tt-servinf-file ttdat)) (current-seconds))
+ (> (- curr-secs last-update) 5)) ;; every 5 seconds update the db?
+ (let* ((sinfo-file (tt-servinf-file ttdat)))
+ ;; (debug:print 0 *default-log-port* "INFO: touching "sinfo-file)
+ (set! (file-modification-time sinfo-file) (current-seconds))
((dbr:dbstruct-sync-proc dbstruct) last-update)
(dbr:dbstruct-last-update-set! dbstruct curr-secs))))
-
+
(if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param))
(begin
(thread-sleep! 5)
(loop)))))
- (cleanup)
+ (tt:shutdown-server ttdat)
(debug:print 0 *default-log-port* "INFO: Server timed out, exiting from tt:keep-running.")))
-
-;; ;; given an already set up uconn start the cmd-loop
-;; ;;
-;; (define (tt:cmd-loop ttdat)
-;; (let* ((serv-listener (-socket uconn))
-;; (listener (lambda ()
-;; (let loop ((state 'start))
-;; (let-values (((inp oup)(tcp-accept serv-listener)))
-;; ;; (mutex-lock! *send-mutex*) ;; DOESN'T SEEM TO HELP
-;; (let* ((rdat (deserialize inp)) ;; '(my-host-port qrykey cmd params)
-;; (resp (ulex-handler uconn rdat)))
-;; (serialize resp oup)
-;; (close-input-port inp)
-;; (close-output-port oup)
-;; ;; (mutex-unlock! *send-mutex*) ;; DOESN'T SEEM TO HELP
-;; )
-;; (loop state))))))
-;; ;; start N of them
-;; (let loop ((thnum 0)
-;; (threads '()))
-;; (if (< thnum 100)
-;; (let* ((th (make-thread listener (conc "listener" thnum))))
-;; (thread-start! th)
-;; (loop (+ thnum 1)
-;; (cons th threads)))
-;; (map thread-join! threads)))))
-;;
-;;
-;;
-;; (define (wait-and-close uconn)
-;; (thread-join! (udat-cmd-thread uconn))
-;; (tcp-close (udat-socket uconn)))
-;;
-;;
(define (tt:shutdown-server ttdat)
- (let* ((cleanproc (tt-cleanup-proc ttdat))
- (port (tt-port ttdat)))
+ (let* ((host (tt-host ttdat))
+ (port (tt-port ttdat))
+ (sinf (tt-servinf-file ttdat)))
(tt-state-set! ttdat 'shutdown)
(portlogger:open-run-close portlogger:set-port port "released")
- (if cleanproc (cleanproc))
- (tcp-close (tt-socket ttdat)) ;; close up ports here
+ (if (file-exists? sinf)
+ (delete-file* sinf))
))
-;; (define (wait-and-close uconn)
-;; (thread-join! (tt-cmd-thread uconn))
-;; (tcp-close (tt-socket uconn)))
-
;; return servid
;; side-effects:
;; ttdat-cleanup-proc is populated with function to remove the serverinfo file
(define (tt:create-server-registration-file ttdat dbfname)
(let* ((areapath (tt-areapath ttdat))
(servdir (tt:get-servinfo-dir areapath))
(host (tt-host ttdat))
(port (tt-port ttdat))
(servinf (conc servdir"/"host":"port"-"(current-process-id)":"dbfname))
- (serv-id (tt:mk-signature areapath))
- (clean-proc (lambda ()
- (delete-file* servinf)
- )))
+ (serv-id (tt:mk-signature areapath)))
(assert (and host port) "FATAL: tt:create-server-registration-file called with no conn, dbfname="dbfname)
- (tt-cleanup-proc-set! ttdat clean-proc)
(tt-servinf-file-set! ttdat servinf)
(with-output-to-file servinf
(lambda ()
(print "SERVER STARTED: "host":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)" dbfname: "dbfname)))
serv-id))
@@ -704,12 +711,28 @@
;; if more than one, wait one second and look again
;; future: ping oldest, if alive remove other : files
;;
(define (tt:find-server areapath dbfname)
(let* ((servdir (tt:get-servinfo-dir areapath))
- (sfiles (glob (conc servdir"/*:"dbfname))))
- sfiles))
+ (sfiles (glob (conc servdir"/*:"dbfname)))
+ (goodfiles '()))
+
+ ;; filter the files here by looking in processes table (if we are not main.db)
+ ;; and or look at the time stamp on the servinfo file, a running server will
+ ;; touch the file every minute (again, this will only apply for main.db)
+ (for-each (lambda (fname)
+ (let* ((age (- (current-seconds)(file-modification-time fname))))
+ (if (> age 200) ;; can't trust it if over 200 seconds old
+ (begin
+ (debug:print 0 *default-log-port* "WARNING: removing stale servinfo file "fname", it is "age" seconds old")
+ (handle-exceptions
+ exn
+ (debug:print 0 *default-log-port* "WARNING: error attempting to remove stale servinfo file "fname)
+ (delete-file fname))) ;;
+ (set! goodfiles (cons fname goodfiles)))))
+ sfiles)
+ goodfiles))
;; given a path to a server info file return: host port startseconds server-id pid dbfname logf
;; example of what it's looking for in the log file:
;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4
;;
@@ -719,12 +742,13 @@
(dbprep-found 0)
(bad-dat (list #f #f #f #f #f #f logf)))
(let ((fdat (handle-exceptions
exn
(begin
- ;; WARNING: this is potentially dangerous to blanket ignore the errors
- (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn="(condition->list exn))
+ ;; BUG, TODO: add err checking, for now blanket ignore the errors?
+ (debug:print-info 0 *default-log-port* "Unable to get server info from "logf
+ ", exn="(condition->list exn))
'()) ;; no idea what went wrong, call it a bad server, return empty list
(with-input-from-file logf read-lines))))
(if (null? fdat) ;; bad data, return bad-dat
bad-dat
(let loop ((inl (car fdat))
@@ -750,10 +774,17 @@
logf))
(else
(debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst)
bad-dat)))))))))
+(define *last-server-start* (make-hash-table))
+
+(define (tt:too-recent-server-start dbfname)
+ (let* ((last-run-time (hash-table-ref/default *last-server-start* dbfname #f)))
+ (and last-run-time
+ (< (- (current-seconds) last-run-time) 5))))
+
;; Given an area path, start a server process ### NOTE ### > file 2>&1
;; if the target-host is set
;; try running on that host
;; incidental: rotate logs in logs/ dir.
;;
@@ -760,51 +791,63 @@
(define (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area
(assert areapath "FATAL: tt:server-process-run called without areapath defined.")
(assert testsuite "FATAL: tt:server-process-run called without testsuite defined.")
(assert mtexe "FATAL: tt:server-process-run called without mtexe defined.")
;; mtest -server - -m testsuite:ext-tests -db 6.db
- (let* ((dbfname (dbmod:run-id->dbfname run-id))
- (load (get-normalized-cpu-load))
- (trying (length (tt:find-server areapath dbfname)))
- (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname))))
- (cond
- ((> load 2.0)
- (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server.")
- (thread-sleep! 1))
- ((> nrun 100)
- (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.")
- (thread-sleep! 1))
- ((> trying 4)
- (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
- (thread-sleep! 1))
- (else
- (if (not (file-exists? (conc areapath"/logs")))
- (create-directory (conc areapath"/logs") #t))
- (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))
- (cmdln (conc
- mtexe
- " -startdir "areapath
- " -server - ";; (or target-host "-")
- " -m testsuite:"testsuite
- " -db "dbfname ;; (dbmod:run-id->dbfname run-id)
- " " profile-mode
- (conc " >> " logfile " 2>&1 &"))))
- ;; we want the remote server to start in *toppath* so push there
- ;; (push-directory areapath) ;; use cd in the command line instead
- (debug:print 0 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath)
- ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
-
- (system cmdln)
- ;; ;; use below to go back to nbfake - nbfake does cause trouble ...
- ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
- ;; (setenv "NBFAKE_LOG" logfile)
- ;; (system (conc "cd "areapath" ; nbfake " cmdln))
- ;; (unsetenv "NBFAKE_QUIET")
- ;; (unsetenv "NBFAKE_LOG")
-
- ;;(pop-directory)
- )))))
+ (let* ((dbfname (dbmod:run-id->dbfname run-id)))
+ (if (tt:too-recent-server-start dbfname)
+ #f
+ (let* ((load (get-normalized-cpu-load))
+ (srvrs (tt:find-server areapath dbfname))
+ (trying (length srvrs))
+ (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname))))
+ (cond
+ ((> load 2.0)
+ (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes")
+ (thread-sleep! 1)
+ #f)
+ ((> nrun 100)
+ (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.")
+ (thread-sleep! 1)
+ #f)
+ ((> trying 2)
+ (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
+ (thread-sleep! 1)
+ #f)
+ (else
+ (if (not (file-exists? (conc areapath"/logs")))
+ (create-directory (conc areapath"/logs") #t))
+ (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))
+ (cmdln (conc
+ mtexe
+ " -startdir "areapath
+ " -server - ";; (or target-host "-")
+ " -m testsuite:"testsuite
+ " -db "dbfname ;; (dbmod:run-id->dbfname run-id)
+ " " profile-mode
+ #;(conc " >> " logfile " 2>&1 &"))))
+ ;; we want the remote server to start in *toppath* so push there
+ ;; (push-directory areapath) ;; use cd in the command line instead
+ (debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath)
+ ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
+
+ (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
+ (setenv "NBFAKE_LOG" logfile)
+ (system (conc "cd "areapath" ; nbfake " cmdln))
+ (unsetenv "NBFAKE_QUIET")
+ (unsetenv "NBFAKE_LOG")
+ ;; (system cmdln)
+ (hash-table-set! *last-server-start* dbfname (current-seconds))
+ ;; ;; use below to go back to nbfake - nbfake does cause trouble ...
+ ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
+ ;; (setenv "NBFAKE_LOG" logfile)
+ ;; (system (conc "cd "areapath" ; nbfake " cmdln))
+ ;; (unsetenv "NBFAKE_QUIET")
+ ;; (unsetenv "NBFAKE_LOG")
+
+ ;;(pop-directory)
+ #t)))))))
;;======================================================================
;; tcp connection stuff
;;======================================================================
@@ -868,18 +911,23 @@
;; (connect-listener uconn port)))
(define (setup-listener-portlogger uconn)
(let ((port (portlogger:open-run-close portlogger:find-port)))
(assert (tt? uconn) "FATAL: setup-listener called with wrong struct "uconn)
+ (debug:print 2 *default-log-port* "setup-listener-portlogger got port " port)
(handle-exceptions
exn
(if (< port 65535)
(begin
(portlogger:open-run-close portlogger:set-failed port)
(thread-sleep! 0.25)
(setup-listener-portlogger uconn))
- #f)
+ (begin
+ (debug:print 0 *default-log-port* "setup-listener-portlogger: could not get a port")
+ #f
+ )
+ )
(connect-listener uconn port))))
(define (connect-listener uconn port)
;; (tcp-listener-socket LISTENER)(socket-name so)
;; sockaddr-address, sockaddr-port, sockaddr->string
Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -1435,19 +1435,19 @@
(out-dir (db:test-get-rundir test-dat))
(status-file (conc out-dir "/.final-status"))
)
;; first verify we are able to write the output file
(if (not (file-write-access? out-dir))
- (debug:print 0 *default-log-port* "ERROR: cannot write .final-status to " out-dir)
- (let*
- ((outp (open-output-file status-file))
+ (debug:print 0 *default-log-port* "ERROR: cannot write .final-status to " out-dir)
+ (let* ((outp (open-output-file status-file))
(status (db:test-get-status test-dat))
- (state (db:test-get-state test-dat)))
- (fprintf outp "~S\n" state)
- (fprintf outp "~S\n" status)
- (close-output-port outp)))))
-
+ (state (db:test-get-state test-dat)))
+ (with-output-to-port outp
+ (lambda ()
+ (print state) ;; printf was putting in ", not sure why but that was a hassle in other contexts
+ (print status)))
+ (close-output-port outp)))))
;; summarize test in to a file test-summary.html in the test directory
;;
(define (tests:summarize-test run-id test-id)
(let* ((test-dat (rmt:get-test-info-by-id run-id test-id))
@@ -1964,11 +1964,12 @@
;; test steps
;;======================================================================
;; teststep-set-status! used to be here
-(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)
+;; NOT NEEDED
+#;(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)
(let* ((testdat (rmt:get-test-state-status-by-id run-id test-id)))
(and testdat
(equal? (car testdat) "KILLREQ"))))
(define (test:tdb-get-rundat-count tdb)
@@ -1993,11 +1994,11 @@
;; This one is for running with no db access (i.e. via rmt: internally)
(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries)
;; (define (tests:set-full-meta-info test-id run-id minutes work-area)
;; (let ((remtries 10))
- (let* ((cpuload (get-cpu-load))
+ (let* ((cpuload (commonmod:get-cpu-load))
(diskfree (get-df (current-directory)))
(uname (get-uname "-srvpio"))
(hostname (get-host-name)))
(tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)))
ADDED transport-mode.scm
Index: transport-mode.scm
==================================================================
--- /dev/null
+++ transport-mode.scm
@@ -0,0 +1,22 @@
+;;======================================================================
+;; set up transport, db cache and sync methods
+;;
+;; sync-method: 'original, 'attach or 'none
+;; cache-method: 'tmp 'none
+;; rmt:transport-mode: 'http, 'tcp, 'nfs
+;;
+;; NOTE: NOT ALL COMBINATIONS WORK
+;;
+;;======================================================================
+
+;; uncomment this block to test without tcp
+;; (dbfile:sync-method 'none)
+;; (dbfile:cache-method 'none)
+;; (rmt:transport-mode 'nfs)
+
+;; uncomment this block to test with tcp
+(dbfile:sync-method 'attach) ;; attach) ;; original
+(dbfile:cache-method 'tmp)
+(rmt:transport-mode 'tcp)
+
+
Index: utils/mt_xterm
==================================================================
--- utils/mt_xterm
+++ utils/mt_xterm
@@ -20,18 +20,16 @@
MT_TMPDISPLAY=$DISPLAY
MT_TMPUSER=$USER
MT_HOME=$HOME
tmpfile=`mktemp`
-
-grep -v "export USER=" megatest.sh | grep -v "export HOME=" > $tmpfile
-source $tmpfile
-rm $tmpfile
-
-# if [ -e megatest.sh ];then
-#source megatest.sh
-#fi
+if [[ -e megatest.sh ]]; then
+ grep -v "export USER=" megatest.sh | grep -v "export HOME=" > $tmpfile
+ source $tmpfile
+ rm $tmpfile
+fi
+
export DISPLAY=$MT_TMPDISPLAY
export USER=$USER
export HOME=$MT_HOME
if [ x"$MT_XTERM_CMD" == "x" ];then