Index: TODO
==================================================================
--- TODO
+++ TODO
@@ -16,10 +16,21 @@
# along with Megatest. If not, see .
TODO
====
+23WW46 - v1.80 branch
+. Use file semaphore to kill tests, eliminate db load of the KILLREQ query
+. Merge this change to revolution branch
+23WW45 - the revolution branch
+. Add "fast" db start option (no handshaking over NFS)
+. Add server-ro to server types (just "server" is fine for read/write).
+. [DONE] Create pause-server and resume-server calls
+. Create rsync or cp sync to MTRAH function
+. Change rmt:send-receive to divert calls to read-only server when possible
+. [DONE] Change start server to call main.db server for 1..N.db servers, block until server is read for use.
+
23WW21
. Dashboard needs its own cache db in /tmp
23WW07
. Remove use of *dbstruct-dbs*
Index: api.scm
==================================================================
--- api.scm
+++ api.scm
@@ -35,11 +35,12 @@
(use srfi-69
srfi-18
posix
matchable
- s11n)
+ s11n
+ typed-records)
;; allow these queries through without starting a server
;;
(define api:read-only-queries
'(get-key-val-pairs
@@ -152,78 +153,10 @@
tasks-set-state-given-param-key
))
(define *db-write-mutexes* (make-hash-table))
(define *server-signature* #f)
-;; ;; These are called by the server on recipt of /api calls
-;; ;; - keep it simple, only return the actual result of the call, i.e. no meta info here
-;; ;;
-;; ;; - returns #( flag result )
-;; ;;
-;; (define (api:execute-requests dbstruct dat)
-;; (if (> *api-process-request-count* 50)
-;; (begin
-;; (if (common:low-noise-print 30 "too many threads")
-;; (debug:print 0 *default-log-port* "WARNING: "*api-process-request-count*" threads, potential overload, adding 0.5 sec delay."))
-;; ;; (thread-sleep! 0.5) ;; take a nap - no, the napping is moved to the clients via tt:backoff-incr
-;; ))
-;; (cond
-;; ((not (vector? dat)) ;; it is an error to not receive a vector
-;; (vector #f (vector #f "remote must be called with a vector")))
-;; (else
-;; (let* ((cmd-in (vector-ref dat 0))
-;; (cmd (if (symbol? cmd-in)
-;; cmd-in
-;; (string->symbol cmd-in)))
-;; (params (vector-ref dat 1))
-;; (run-id (if (null? params)
-;; 0
-;; (car params)))
-;; (write-mutex (if (hash-table-exists? *db-write-mutexes* run-id)
-;; (hash-table-ref *db-write-mutexes* run-id)
-;; (let* ((newmutex (make-mutex)))
-;; (hash-table-set! *db-write-mutexes* run-id newmutex)
-;; newmutex)))
-;; (start-t (current-milliseconds))
-;; (readonly-mode (dbr:dbstruct-read-only dbstruct))
-;; (readonly-command (member cmd api:read-only-queries))
-;; (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))))
-;; (if (not readonly-command)
-;; (mutex-lock! write-mutex))
-;; (let* ((tmppath (dbr:dbstruct-tmppath dbstruct))
-;; (clean-run-id (cond
-;; ((number? run-id) run-id)
-;; ((equal? run-id #f) "main")
-;; (else "other")))
-;; (crumbfile (dbfile:wait-for-qif tmppath clean-run-id (cons cmd params)))
-;; (res
-;; (if writecmd-in-readonly-mode
-;; (conc "attempt to run write command "cmd" on a read-only database")
-;; (api:dispatch-request dbstruct cmd run-id params))))
-;; (delete-file* crumbfile)
-;; (if (not readonly-command)
-;; (mutex-unlock! write-mutex))
-;;
-;; ;; save all stats
-;; (let ((delta-t (- (current-milliseconds)
-;; start-t))
-;; (modified-cmd (if (eq? cmd 'general-call)
-;; (string->symbol (conc "general-call-" (car params)))
-;; cmd)))
-;; (hash-table-set! *db-api-call-time* modified-cmd
-;; (cons delta-t (hash-table-ref/default *db-api-call-time* modified-cmd '()))))
-;; (if writecmd-in-readonly-mode
-;; (begin
-;; #;(common:telemetry-log (conc "api-out:"(->string cmd))
-;; payload: `((params . ,params)
-;; (ok-res . #t)))
-;; (vector #f res))
-;; (begin
-;; #;(common:telemetry-log (conc "api-out:"(->string cmd))
-;; payload: `((params . ,params)
-;; (ok-res . #f)))
-;; (vector #t res))))))))
(define *api-threads* '())
(define (api:register-thread th-in)
(set! *api-threads* (cons (cons th-in (current-seconds)) *api-threads*)))
@@ -237,11 +170,25 @@
(not (member (thread-state (car thdat)) '(terminated dead))))
*api-threads*)))
(define (api:get-count-threads-alive)
(length *api-threads*))
-
+
+(define *api:last-stats-print* 0)
+(define *api-print-db-stats-mutex* (make-mutex))
+(define (api:print-db-stats)
+ (debug:print-info 0 *default-log-port* "Started periodic db stats printer")
+ (let loop ()
+ (mutex-lock! *api-print-db-stats-mutex*)
+ (if (> (- (current-seconds) *api:last-stats-print*) 15)
+ (begin
+ (rmt:print-db-stats)
+ (set! *api:last-stats-print* (current-seconds))))
+ (mutex-unlock! *api-print-db-stats-mutex*)
+ (thread-sleep! 5)
+ (loop)))
+
;; indat is (cmd run-id params meta)
;;
;; WARNING: Do not print anything in the lambda of this function as it
;; reads/writes to current in/out port
@@ -250,85 +197,105 @@
(assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.")
(if (not *server-signature*)
(set! *server-signature* (tt:mk-signature *toppath*)))
(lambda (indat)
(api:register-thread (current-thread))
- (let* (;; (indat (deserialize))
- (newcount (+ *api-process-request-count* 1))
- (numthreads (api:get-count-threads-alive))
- (delay-wait (if (> newcount 10)
- (- newcount 10)
- 0))
- (normal-proc (lambda (cmd run-id params)
- (case cmd
- ((ping) *server-signature*)
- (else
- (api:dispatch-request dbstruct cmd run-id params))))))
- (set! *api-process-request-count* newcount)
- (set! *db-last-access* (current-seconds))
- (if (not (eq? newcount numthreads))
- (begin
- (api:remove-dead-or-terminated)
- (let ((threads-now (api:get-count-threads-alive)))
- (debug:print 0 *default-log-port* "WARNING: newcount="newcount", numthreads="numthreads", remaining="threads-now)
- (set! newcount threads-now))))
- (match indat
- ((cmd run-id params meta)
- (let* ((db-ok (let* ((dbfname (dbmod:run-id->dbfname run-id))
- (ok (equal? dbfname (dbr:dbstruct-dbfname dbstruct))))
- (case cmd
- ((ping) #t) ;; we are fine
- (else
- (if (not ok)(debug:print 0 *default-log-port* "ERROR: "cmd", run-id "run-id", not correct for dbfname "(dbr:dbstruct-dbfname dbstruct)))
- (assert ok "FATAL: database file and run-id not aligned.")))))
- (ttdat *server-info*)
- (server-state (tt-state ttdat))
- (status (cond
- ((> newcount 3) 'busy)
- ;; ((> newcount 5) 'loaded) ;; this gets transmitted to the client which calls tt:backoff-incr to slow stuff down.
- (else 'ok)))
- (errmsg (case status
- ((busy) (conc "Server overloaded, "newcount" threads in flight"))
- ((loaded) (conc "Server loaded, "newcount" threads in flight"))
- (else #f)))
- (result (case status
- ((busy)
- (if (eq? cmd 'ping)
- (normal-proc cmd run-id params)
- ;; newcount must be greater than 5 for busy
- (* 1 (- newcount 3)) ;; was 15
- )) ;; (- newcount 29)) ;; call back in as many seconds
- ((loaded)
-;; (if (eq? (rmt:transport-mode) 'tcp)
-;; (thread-sleep! 0.5))
- (normal-proc cmd run-id params))
- (else
- (normal-proc cmd run-id params))))
- (meta (case cmd
- ((ping) `((sstate . ,server-state)))
- (else `((wait . ,delay-wait)))))
- (payload (list status errmsg result meta)))
- (set! *api-process-request-count* (- *api-process-request-count* 1))
- ;; (serialize payload)
- (api:unregister-thread (current-thread))
- payload))
- (else
- (assert #f "FATAL: failed to deserialize indat "indat))))))
-
+ (let* ((result
+ (let* ((numthreads (api:get-count-threads-alive))
+ (delay-wait (if (> numthreads 10)
+ (- numthreads 10)
+ 0))
+ (normal-proc (lambda (cmd run-id params)
+ (case cmd
+ ((ping) *server-signature*)
+ (else
+ (api:dispatch-request dbstruct cmd run-id params))))))
+ (set! *api-process-request-count* numthreads)
+ (set! *db-last-access* (current-seconds))
+;; (if (not (eq? numthreads numthreads))
+;; (begin
+;; (api:remove-dead-or-terminated)
+;; (let ((threads-now (api:get-count-threads-alive)))
+;; (debug:print 0 *default-log-port* "WARNING: numthreads="numthreads", numthreads="numthreads", remaining="threads-now)
+;; (set! numthreads threads-now))))
+ (match indat
+ ((cmd run-id params meta)
+ (let* ((start-t (current-milliseconds))
+ (db-ok (let* ((dbfname (dbmod:run-id->dbfname run-id))
+ (ok (equal? dbfname (dbr:dbstruct-dbfname dbstruct))))
+ (case cmd
+ ((ping) #t) ;; we are fine
+ (else
+ (assert ok "FATAL: database file and run-id not aligned.")))))
+ (ttdat *server-info*)
+ (server-state (tt-state ttdat))
+ (maxthreads 20) ;; make this a parameter?
+ (status (cond
+ ((and (> numthreads maxthreads)
+ (> (random 100) 70)) ;; allow a 30% probability to go through so we can figure out what is going wrong in main.db server.
+ 'busy)
+ ;; ((> numthreads 5) 'loaded) ;; this gets transmitted to the client which calls tt:backoff-incr to slow stuff down.
+ (else 'ok)))
+ (errmsg (case status
+ ((busy) (conc "Server overloaded, "numthreads" threads in flight"))
+ ((loaded) (conc "Server loaded, "numthreads" threads in flight"))
+ (else #f)))
+ (result (case status
+ ((busy)
+ (if (eq? cmd 'ping)
+ (normal-proc cmd run-id params)
+ ;; numthreads must be greater than 5 for busy
+ (* 0.1 (- numthreads maxthreads)) ;; was 15
+ )) ;; (- numthreads 29)) ;; call back in as many seconds
+ ((loaded)
+ ;; (if (eq? (rmt:transport-mode) 'tcp)
+ ;; (thread-sleep! 0.5))
+ (normal-proc cmd run-id params))
+ (else
+ (normal-proc cmd run-id params))))
+ (meta (case cmd
+ ((ping) `((sstate . ,server-state)))
+ (else `((wait . ,delay-wait)))))
+ (payload (list status errmsg result meta)))
+ ;; (cmd run-id params meta)
+ (db:add-stats cmd run-id params (- (current-milliseconds) start-t))
+ payload))
+ (else
+ (assert #f "FATAL: failed to deserialize indat "indat))))))
+ ;; (set! *api-process-request-count* (- *api-process-request-count* 1))
+ ;; (serialize payload)
+
+ (api:unregister-thread (current-thread))
+ result)))
+
+
+
+(define *api-halt-writes* #f)
(define (api:dispatch-request dbstruct cmd run-id params)
(if (not *no-sync-db*)
(db:open-no-sync-db))
+ (let* ((start-time (current-milliseconds)))
+ (if (member cmd api:write-queries)
+ (let loop ()
+ (if *api-halt-writes*
+ (begin
+ (thread-sleep! 0.2)
+ (if (< (- (current-milliseconds) start-time)
+ 5000) ;; hope it don't take more than five seconds to sync
+ (loop-time)
+ #;(debug:print 0 *default-log-port* "ERROR: writes halted for more than 5 seconds, sync might be taking too long"))))))
+ (db:add-stats 'api-write-blocking-for-sync run-id params (- (current-milliseconds) start-time)))
(case cmd
;;===============================================
;; READ/WRITE QUERIES
;;===============================================
((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl
;; SERVERS
- ((start-server) (apply server:kind-run params))
+ ((start-server) (apply tt:server-process-run params))
((kill-server) (set! *server-run* #f))
;; TESTS
;;((test-set-state-status-by-id) (apply mt:test-set-state-status-by-id dbstruct params))
@@ -513,43 +480,5 @@
((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params))
(else
(debug:print 0 *default-log-port* "ERROR: bad api call " cmd)
(conc "ERROR: BAD api call " cmd))))
-;; http-server send-response
-;; api:process-request
-;; db:*
-;;
-;; NB// Runs on the server as part of the server loop
-;;
-(define (api:process-request dbstruct $) ;; the $ is the request vars proc
- (debug:print 4 *default-log-port* "server-id:" *server-id*)
- (let* ((cmd ($ 'cmd))
- (paramsj ($ 'params))
- (key ($ 'key))
- (params (db:string->obj paramsj transport: 'http))) ;; incoming data from the POST (or is it a GET?)
- (debug:print 4 *default-log-port* "cmd:" cmd " with params " params "key " key)
- (if (equal? key *server-id*)
- (begin
- (set! *api-process-request-count* (+ *api-process-request-count* 1))
- (let* ((resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result )
- (success (vector-ref resdat 0))
- (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?)
- (debug:print 4 *default-log-port* "res:" res)
- (if (not success)
- (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params))
- (if (> *api-process-request-count* *max-api-process-requests*)
- (set! *max-api-process-requests* *api-process-request-count*))
- (set! *api-process-request-count* (- *api-process-request-count* 1))
- ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds
- ;; (rmt:dat->json-str
- ;; (if (or (string? res)
- ;; (list? res)
- ;; (number? res)
- ;; (boolean? res))
- ;; res
- ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res)))))
- (db:obj->string res transport: 'http)))
- (begin
- (debug:print 0 *default-log-port* "Server refused to process request. Server id mismatch. recived " key " expected: " *server-id* ".\nOther arguments recived: cmd=" cmd " params = " params)
- (db:obj->string (conc "Server refused to process request server-id mismatch: " key ", " *server-id*) transport: 'http)))))
-
Index: archive.scm
==================================================================
--- archive.scm
+++ archive.scm
@@ -359,11 +359,11 @@
(archive-dir (if archive-info (cdr archive-info) #f))
(archive-id (if archive-info (car archive-info) -1))
(home-host (server:choose-server *toppath* 'homehost))
(archive-time (seconds->std-time-str (current-seconds)))
(archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time))
- (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db"))
+ (tmp-db-path (conc (dbfile:make-tmpdir-name *toppath* "") "/megatest.db"))
(dbfile (conc archive-staging-db "/megatest.db")))
(create-directory archive-staging-db #t)
(let-values (((pid-val exit-status exit-code) (run-n-wait rsync-exe params: (list "-v" (conc (car home-host) ":"tmp-db-path) archive-staging-db) print-cmd: print-prefix)))
(if (eq? exit-code 0)
(case archiver
Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -21,10 +21,11 @@
(declare (unit common))
(declare (uses commonmod))
(declare (uses rmtmod))
(declare (uses debugprint))
(declare (uses mtargs))
+
(use srfi-1 data-structures posix regex-case (prefix base64 base64:)
format dot-locking csv-xml z3 udp ;; sql-de-lite
hostinfo md5 message-digest typed-records directory-utils stack
matchable regex posix (srfi 18) extras ;; tcp
@@ -153,14 +154,10 @@
(define *time-zero* (current-seconds)) ;; for the watchdog
(define *on-exit-procs* '()) ;; add procs to this list to be executed on exit
(define *default-area-tag* "local")
;; DATABASE
-;; (define *dbstruct-dbs* #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this.
-;; db stats
-(define *db-stats* (make-hash-table)) ;; hash of vectors < count duration-total >
-(define *db-stats-mutex* (make-mutex))
;; db access
(define *db-last-access* (current-seconds)) ;; last db access, used in server
;; (define *db-write-access* #t)
;; db sync
;; (define *db-last-sync* 0) ;; last time the sync to megatest.db happened
@@ -181,11 +178,10 @@
;; (define *max-cache-size* 0)
(define *logged-in-clients* (make-hash-table))
(define *server-id* #f)
(define *server-info* #f) ;; good candidate for easily convert to non-global
(define *time-to-exit* #f)
-(define *server-run* #t)
(define *run-id* #f)
(define *server-kind-run* (make-hash-table))
(define *home-host* #f)
;; (define *total-non-write-delay* 0)
(define *heartbeat-mutex* (make-mutex))
@@ -247,11 +243,11 @@
(define *common:this-exe-fullpath* (common:get-this-exe-fullpath))
(define *common:this-exe-dir* (pathname-directory *common:this-exe-fullpath*))
(define *common:this-exe-name* (pathname-strip-directory *common:this-exe-fullpath*))
(define (common:get-sync-lock-filepath)
- (let* ((tmp-area (common:get-db-tmp-area))
+ (let* ((tmp-area (common:make-tmpdir-name *toppath* ""))
(lockfile (conc tmp-area "/megatest.db.lock")))
lockfile))
(define *common:logpro-exit-code->status-sym-alist*
'( ( 0 . pass )
@@ -1533,11 +1529,11 @@
;;
(define (common:lazy-modification-time fpath)
(handle-exceptions
exn
(begin
- (debug:print 0 *default-log-port* "Failed to get modifcation time for " fpath ", treating it as zero. exn=" exn)
+ (debug:print 2 *default-log-port* "Failed to get modification time for " fpath ", treating it as zero. exn=" exn)
0)
(if (file-exists? fpath)
(file-modification-time fpath)
0)))
@@ -2280,11 +2276,11 @@
(define (common:check-db-dir-space)
(let* ((required (string->number
;; default is 1GB (or actually a billion bytes) This is the number of 1 kB blocks.
(or (configf:lookup *configdat* "setup" "dbdir-space-required")
"1000000")))
- (dbdir (common:get-db-tmp-area)) ;; (db:get-dbdir))
+ (dbdir (common:make-tmpdir-name *toppath* "")) ;; (db:get-dbdir))
(tdbspace (common:check-space-in-dir dbdir required))
(mdbspace (common:check-space-in-dir *toppath* required)))
(sort (list tdbspace mdbspace) (lambda (a b)
(< (cadr a)(cadr b))))))
Index: commonmod.scm
==================================================================
--- commonmod.scm
+++ commonmod.scm
@@ -160,10 +160,17 @@
'()))) ;; should it return empty list or #f to indicate not set?
(define (get-section cfgdat section)
(hash-table-ref/default cfgdat section '()))
+
+(define (common:make-tmpdir-name areapath tmpadj)
+ (let* ((area (pathname-file areapath))
+ (dname (conc "/tmp/"(current-user-name)"/megatest_localdb/" area "/" (string-translate areapath "/" ".") tmpadj "/.mtdb")))
+ (unless (directory-exists? dname)
+ (create-directory dname #t))
+ dname))
;; dot-locking egg seems not to work, using this for now
;; if lock is older than expire-time then remove it and try again
;; to get the lock
;;
Index: dashboard-tests.scm
==================================================================
--- dashboard-tests.scm
+++ dashboard-tests.scm
@@ -463,11 +463,11 @@
;;======================================================================
;;
;;======================================================================
(define (dashboard-tests:examine-test run-id test-id) ;; run-id run-key origtest)
- (let* ((db-path (db:dbfile-path)) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/db/" run-id ".db"))
+ (let* ((db-path (common:make-tmpdir-name *toppath* "")) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/db/" run-id ".db"))
(dbstruct #f) ;; NOT USED
(testdat (rmt:get-test-info-by-id run-id test-id)) ;; (db:get-test-info-by-id dbstruct run-id test-id))
(db-mod-time 0) ;; (file-modification-time db-path))
(last-update 0) ;; (current-seconds))
(request-update #t))
Index: dashboard.scm
==================================================================
--- dashboard.scm
+++ dashboard.scm
@@ -402,12 +402,12 @@
(dboard:setup-tabdat dat)
(dboard:setup-num-rows dat)
dat))
(define (dboard:setup-tabdat tabdat)
- (dboard:tabdat-dbdir-set! tabdat (db:dbfile-path)) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/.db"))
- (dboard:tabdat-dbfpath-set! tabdat (db:dbfile-path))
+ (dboard:tabdat-dbdir-set! tabdat (common:make-tmpdir-name *toppath* "")) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/.db"))
+ (dboard:tabdat-dbfpath-set! tabdat (common:make-tmpdir-name *toppath* ""))
(dboard:tabdat-monitor-db-path-set! tabdat (conc (dboard:tabdat-dbdir tabdat) "/monitor.db"))
;; HACK ALERT: this is a hack, please fix.
(dboard:tabdat-ro-set! tabdat (not (file-read-access? (dboard:tabdat-dbfpath tabdat))))
@@ -928,16 +928,16 @@
(hash-table-set! (dboard:tabdat-allruns-by-id tabdat) run-id run-struct))
(if (or (null? tal)
(> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update
(begin
(when (> elapsed-time 2)
- (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")
+ (debug:print 2 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")
(let* ((old-val (iup:attribute *tim* "TIME"))
(new-val (number->string (inexact->exact (floor (* 2 (string->number old-val)))))))
(if (< (string->number new-val) 5000)
(begin
- (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
+ (debug:print 2 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
(iup:attribute-set! *tim* "TIME" new-val)))))
(dboard:tabdat-allruns-set! tabdat new-res)
maxtests)
(if (> (dboard:rundat-run-data-offset run-struct) 0)
(loop run tal new-res newmaxtests) ;; not done getting data for this run
@@ -3112,11 +3112,11 @@
(debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: "
((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir ", exn=" exn)
(current-seconds)) ;; something went wrong - just print an error and return current-seconds
(common:max (map (lambda (filen)
(file-modification-time filen))
- (glob (conc dbdir "/*.db*"))))))
+ (cons (conc dbdir "/main.db") (glob (conc dbdir "/?.db")))))))
(define (dashboard:monitor-changed? commondat tabdat)
(let* ((run-update-time (current-seconds))
(monitor-db-path (dboard:tabdat-monitor-db-path tabdat))
(monitor-modtime (if (and monitor-db-path (common:file-exists? monitor-db-path))
Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -133,11 +133,11 @@
default)))
(apply sqlite3:first-result db stmt params)))
(define (db:setup do-sync)
(assert *toppath* "FATAL: db:setup called before launch:setup has been run.")
- (let* ((tmpdir (common:get-db-tmp-area)))
+ (let* ((tmpdir (common:make-tmpdir-name *toppath* "")))
(if (not *dbstruct-dbs*)
(dbfile:setup do-sync *toppath* tmpdir)
*dbstruct-dbs*)))
;; moved from dbfile
@@ -267,17 +267,10 @@
", arguments: " ((condition-property-accessor 'exn 'arguments) exn)
", location: " ((condition-property-accessor 'exn 'location) exn)
))
-;; NB// #f => return dbdir only
-;; (was planned to be; zeroth db with name=main.db)
-;;
-;; If run-id is #f return to create and retrieve the path where the db will live.
-;;
-(define db:dbfile-path common:get-db-tmp-area)
-
(define (db:set-sync db)
(let ((syncprag (configf:lookup *configdat* "setup" "sychronous")))
(sqlite3:execute db (conc "PRAGMA synchronous = " (or syncprag 0) ";"))))
@@ -467,11 +460,11 @@
(get-mtime shm-file))))
;; (define (db:all-db-sync dbstruct)
;; (let* ((dbdat (db:open-db dbstruct #f db:initialize-main-db))
;; (data-synced 0) ;; count of changed records
-;; (tmp-area (common:get-db-tmp-area))
+;; (tmp-area (common:make-tmpdir-name *toppath*))
;; (dbfiles (glob (conc tmp-area"/.mtdb/*.db")))
;; (sync-durations (make-hash-table))
;; (no-sync-db (db:open-no-sync-db)))
;; (for-each
;; (lambda (file) ;; tmp db file
@@ -556,11 +549,11 @@
;; run-ids: '(1 2 3 ...) or #f (for all)
;;
(define (db:multi-db-sync dbstruct . options)
(let* (;; (dbdat (db:open-db dbstruct #f dbfile:db-init-proc))
(data-synced 0) ;; count of changed records
- (tmp-area (common:get-db-tmp-area))
+ (tmp-area (common:make-tmpdir-name *toppath* ""))
(old2new (member 'old2new options))
(dejunk (member 'dejunk options))
(killservers (member 'killservers options))
(src-area (if old2new *toppath* tmp-area))
(dest-area (if old2new tmp-area *toppath*))
@@ -1254,11 +1247,11 @@
;; no-sync.db - small bits of data to be shared between servers
;;======================================================================
(define (db:get-dbsync-path)
(case (rmt:transport-mode)
- ((http)(common:get-db-tmp-area))
+ ((http)(common:make-tmpdir-name *toppath* ""))
((tcp) (conc *toppath*"/.mtdb"))
((nfs) (conc *toppath*"/.mtdb"))
(else "/tmp/dunno-this-gonna-exist")))
;; This is needed for api.scm
@@ -1580,11 +1573,11 @@
;; TODO: Switch this to use max(update_time) from each run db? Then if using a server there is no disk traffic (using cachedb db) ???
;;
;; NOTE: This DOESN'T (necessarily) get the real run ids, but the number of the .db!!
(define (db:get-changed-run-ids since-time)
- (let* ((dbdir (db:dbfile-path)) ;; (configf:lookup *configdat* "setup" "dbdir"))
+ (let* ((dbdir (common:make-tmpdir-name *toppath* "")) ;; (configf:lookup *configdat* "setup" "dbdir"))
(alldbs (glob (conc *toppath* "/.mtdb/[0-9]*.db*")))
(changed (filter (lambda (dbfile)
(> (file-modification-time dbfile) since-time))
alldbs)))
(delete-duplicates
@@ -2234,21 +2227,24 @@
qry
run-id
(or last-update 0))))))
(define (db:get-testinfo-state-status dbstruct run-id test-id)
- (let ((res #f))
- (db:with-db dbstruct run-id #f
- (lambda (dbdat db)
- (sqlite3:for-each-row
- (lambda (run-id testname item-path state status)
- ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment
- (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-")))
- db
- "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;"
- test-id run-id)))
- res))
+ (db:with-db
+ dbstruct run-id #f
+ (lambda (dbdat db)
+ (let* ((res #f)
+ (stmth (db:get-cache-stmth dbdat db "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;")))
+ (sqlite3:for-each-row
+ (lambda (run-id testname item-path state status)
+ ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment
+ (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-")))
+ ;; db
+ ;; "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;"
+ stmth
+ test-id run-id)
+ res))))
;; get a useful subset of the tests data (used in dashboard
;; use db:mintest-get-{id ,run_id,testname ...}
;;
(define (db:get-tests-for-run-mindata dbstruct run-id testpatt states statuses not-in)
@@ -2637,18 +2633,20 @@
(db:with-db
dbstruct
run-id
#f
(lambda (dbdat db)
- (let ((res (cons #f #f)))
-;; (stmth (db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=?;")))
- (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test
- (lambda (state status)
- (cons state status))
- db
- "SELECT state,status FROM tests WHERE id=? AND run_id=?;" ;; stmth try not compiling this one - yes, this fixed the bind issue
- test-id run-id)
+ (let ((res (cons #f #f))
+ (stmth (db:get-cache-stmth dbdat db "SELECT state,status FROM tests WHERE id=? AND run_id=?;")))
+ (db:with-mutex-for-stmth
+ (lambda()
+ (sqlite3:for-each-row ;; attemptnum added to hold pid of top process (not Megatest) controlling a test
+ (lambda (state status)
+ (cons state status))
+ ;; db
+ stmth ;;"SELECT state,status FROM tests WHERE id=? AND run_id=?;" ;; stmth try not compiling this one - yes, this fixed the bind issue
+ test-id run-id)))
res))))
;; Use db:test-get* to access
;; Get test data using test_ids. NB// Only works within a single run!!
;;
@@ -3728,25 +3726,35 @@
tags)))
db
"SELECT testname,tags FROM test_meta")
(hash-table->alist res)))))
+;; testmeta doesn't change, we can cache it for up too an hour
+
+(define *db:testmeta-cache* (make-hash-table))
+(define *db:testmeta-last-update* 0)
+
;; read the record given a testname
(define (db:testmeta-get-record dbstruct testname)
- (let ((res #f))
- (db:with-db
- dbstruct
- #f
- #f
- (lambda (dbdat db)
- (sqlite3:for-each-row
- (lambda (id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup)
- (set! res (vector id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup)))
- db
- "SELECT id,testname,author,owner,description,reviewed,iterated,avg_runtime,avg_disk,tags,jobgroup FROM test_meta WHERE testname=?;"
- testname)
- res))))
+ (if (and (< (- (current-seconds) *db:testmeta-last-update*) 600)
+ (hash-table-exists? *db:testmeta-cache* testname))
+ (hash-table-ref *db:testmeta-cache* testname)
+ (let ((res #f))
+ (db:with-db
+ dbstruct
+ #f
+ #f
+ (lambda (dbdat db)
+ (sqlite3:for-each-row
+ (lambda (id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup)
+ (set! res (vector id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup)))
+ db
+ "SELECT id,testname,author,owner,description,reviewed,iterated,avg_runtime,avg_disk,tags,jobgroup FROM test_meta WHERE testname=?;"
+ testname)))
+ (hash-table-set! *db:testmeta-cache* testname res)
+ (set! *db:testmeta-last-update* (current-seconds))
+ res)))
;; create a new record for a given testname
(define (db:testmeta-add-record dbstruct testname)
(db:with-db dbstruct #f #t
(lambda (dbdat db)
@@ -4314,11 +4322,11 @@
))))
;; sync for filesystem local db writes
;;
(define (db:run-lock-and-sync no-sync-db)
- (let* ((tmp-area (common:get-db-tmp-area))
+ (let* ((tmp-area (common:make-tmpdir-name *toppath* ""))
(dbfiles (glob (conc tmp-area"/.mtdb/*.db")))
(sync-durations (make-hash-table)))
;; (debug:print-info 0 *default-log-port* "lock-and-sync, dbfiles: "dbfiles)
(for-each
(lambda (file)
@@ -4370,11 +4378,11 @@
(sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300))
(debug-mode (debug:debug-mode 1))
(last-time (current-seconds)) ;; last time through the sync loop
(no-sync-db (db:open-no-sync-db))
(sync-duration 0) ;; run time of the sync in milliseconds
- (tmp-area (common:get-db-tmp-area)))
+ (tmp-area (common:make-tmpdir-name *toppath* "")))
;; Sync moved to http-transport keep-running loop
(debug:print-info 2 *default-log-port* "Periodic copy-based sync thread started. syncer is copy-sync, tmp-area is " tmp-area)
(debug:print-info 3 *default-log-port* "watchdog starting. syncer is copy-sync pid="(current-process-id));; " this-wd-num="this-wd-num)
(if (and legacy-sync (not *time-to-exit*))
@@ -4478,11 +4486,11 @@
(for-each
(lambda (subdb)
(let* (;;(dbstruct (db:setup))
(mtdb (dbr:subdb-mtdb subdb))
(mtpath (db:dbdat-get-path mtdb))
- (tmp-area (common:get-db-tmp-area))
+ (tmp-area (common:make-tmpdir-name *toppath* ""))
(res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive
(set! sync-duration (- (current-milliseconds) sync-start))
(if (> res 0) ;; some records were transferred, keep the db alive
(begin
(mutex-lock! *heartbeat-mutex*)
@@ -4525,11 +4533,10 @@
;; ;; time to exit, close the no-sync db here
;; (db:no-sync-close-db no-sync-db stmt-cache)
(if (common:low-noise-print 30)
(debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) ))))
))
-
(define (std-exit-procedure)
;;(common:telemetry-log-close)
(on-exit (lambda () 0)) ;; why is this here?
;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
Index: dbfile.scm
==================================================================
--- dbfile.scm
+++ dbfile.scm
@@ -242,11 +242,12 @@
#f
)
)
(define (dbfile:make-tmpdir-name areapath tmpadj)
- (let* ((dname (conc "/tmp/"(current-user-name)"/" (string-translate areapath "/" ".") tmpadj)))
+ (let* ((area (pathname-file areapath))
+ (dname (conc "/tmp/"(current-user-name)"/megatest_localdb/" area "/" (string-translate areapath "/" ".") tmpadj "/.mtdb")))
(unless (directory-exists? dname)
(create-directory dname #t))
dname))
(define (dbfile:run-id->path apath run-id)
@@ -487,11 +488,11 @@
;; NOTE: this is already protected by mutex *no-sync-db-mutex*
;;
(define (dbfile:raw-open-no-sync-db dbpath)
(if (not (file-exists? dbpath))
(create-directory dbpath #t))
- (debug:print-info 0 *default-log-port* "Opening "dbpath"/no-sync.db")
+ (debug:print-info 2 *default-log-port* "(dbfile:raw-open-no-sync-db: Opening "dbpath"/no-sync.db")
(let* ((dbname (conc dbpath "/no-sync.db"))
(db-exists (file-exists? dbname))
(init-proc (lambda (db)
(sqlite3:with-transaction
db
@@ -525,17 +526,18 @@
reason TEXT DEFAULT 'none',
CONSTRAINT no_sync_processes UNIQUE (host,pid));"
))))))
(on-tmp (equal? (car (string-split dbpath "/")) "tmp"))
(db (if on-tmp
- (dbfile:cautious-open-database dbname init-proc 0 "WAL" force-init: #t)
- (dbfile:cautious-open-database dbname init-proc 0 #f force-init: #t)
+ (dbfile:cautious-open-database dbname init-proc 1 "WAL" force-init: #t) ;; WAL MODE should use syncronous=1
+ ;; (dbfile:cautious-open-database dbname init-proc 0 #f force-init: #t)
+ (dbfile:cautious-open-database dbname init-proc 0 "MEMORY" force-init: #t)
;; (sqlite3:open-database dbname)
)))
(if on-tmp ;; done in cautious-open-database
(begin
- (sqlite3:execute db "PRAGMA synchronous = 0;")
+ ;; (sqlite3:execute db "PRAGMA synchronous = 0;") ;; why was this here when is is handled by cautious-open-database?
(sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000))))
db))
;; mtest processes registry calls
@@ -675,20 +677,26 @@
;; fails returns (#f lock-creation-time identifier)
;; succeeds (returns (#t lock-creation-time identifier)
;; use (db:no-sync-del! db keyname) to release the lock
;;
(define (db:no-sync-get-lock-with-id db keyname identifier)
+ (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: db: " db " keyname: " keyname " identifier: " identifier)
(sqlite3:with-transaction
db
(lambda ()
(condition-case
(let* ((curr-val (db:no-sync-get/default db keyname #f)))
+ (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: curr-val: " curr-val)
(if curr-val
(match (db:extract-time-identifier curr-val) ;; result->timestamp, identifier
((timestamp . ident)
(cons (equal? ident identifier) timestamp))
- (else (cons #f 'malformed-lock))) ;; lock malformed
+ (else
+ (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: malformed lock")
+ (cons #f 'malformed-lock)
+ )
+ ) ;; lock malformed
(let ((curr-sec (current-seconds))
(lock-value (if identifier
(conc (current-seconds)"+"identifier)
(current-seconds))))
(sqlite3:execute db "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-value)
@@ -1572,7 +1580,15 @@
;; (db:hoh-set! stmt-cache db stmt newstmth)
(hash-table-set! stmt-cache stmt newstmth)
newstmth))))
(mutex-unlock! *get-cache-stmth-mutex*)
result))
+
+(define *mutex-stmth-call* (make-mutex))
+
+(define (db:with-mutex-for-stmth proc)
+ (mutex-lock! *mutex-stmth-call*)
+ (let* ((res (proc)))
+ (mutex-unlock! *mutex-stmth-call*)
+ res))
)
Index: dbmod.scm
==================================================================
--- dbmod.scm
+++ dbmod.scm
@@ -31,10 +31,11 @@
chicken
data-structures
extras
files
+ format
(prefix sqlite3 sqlite3:)
matchable
posix
typed-records
srfi-1
@@ -87,13 +88,13 @@
;; The cachedb one-db file per server method goes in here
;;======================================================================
;; NOTE: the r/w is now w/r, #t=db modified by query, #f=db NOT modified by query
(define (dbmod:with-db dbstruct run-id w/r proc params)
- (let* ((use-mutex (or (and w/r ;; use the mutex on queries that modify the db and for sync to disk
- (> *api-process-request-count* 5)) ;; when writes are happening throttle more
- (> *api-process-request-count* 50)))
+ (let* ((use-mutex w/r) ;; (or (and w/r ;; use the mutex on queries that modify the db and for sync to disk
+ ;; (> *api-process-request-count* 5)) ;; when writes are happening throttle more
+ ;; (> *api-process-request-count* 50)))
(dbdat (dbmod:open-db dbstruct run-id (dbfile:db-init-proc)))
(dbh (dbr:dbdat-dbh dbdat)) ;; this will be the cachedb handle
(dbfile (dbr:dbdat-dbfile dbdat)))
;; if nfs mode do a sync if delta > 2
(let* ((last-update (dbr:dbstruct-last-update dbstruct))
@@ -198,11 +199,11 @@
(let* ((dbstruct (or dbstruct-in (make-dbr:dbstruct areapath: areapath)))
(dbfname (or dbfname-in (dbmod:run-id->dbfname run-id)))
(dbpath (dbmod:get-dbdir dbstruct)) ;; directory where all the .db files are kept
(dbfullname (conc dbpath"/"dbfname)) ;; (dbmod:run-id->full-dbfname dbstruct run-id))
(dbexists (file-exists? dbfullname))
- (tmpdir (dbfile:make-tmpdir-name areapath tmpadj))
+ (tmpdir (common:make-tmpdir-name areapath tmpadj))
(tmpdb (let* ((fname (conc tmpdir"/"dbfname)))
fname))
(cachedb (dbmod:open-cachedb-db init-proc
;; (if (eq? (dbfile:cache-method) 'cachedb)
;; #f
@@ -225,50 +226,21 @@
(dbr:dbstruct-dbfname-set! dbstruct dbfname)
(dbr:dbstruct-sync-proc-set! dbstruct
(lambda (last-update)
(if *sync-in-progress*
(debug:print 3 *default-log-port* "WARNING: overlapping calls to sync to disk")
- (let* ((syncer-logfile (conc areapath"/logs/"dbfname"-syncer.log"))
- (sync-cmd (if (eq? syncdir 'todisk)
- (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "tmpdb" -to "dbfullname" -period 5 -timeout 10 > /dev/null 2&>1)&")
- (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "dbfullname" -to "tmpdb" -period 5 -timeout 10 > /dev/null 2&>1)&")))
- (synclock-file (conc dbfullname".lock"))
- (syncer-running-file (conc dbfullname"-sync-running"))
- (synclock-mod-time (if (file-exists? synclock-file)
- (handle-exceptions
- exn
- #f
- (file-modification-time synclock-file))
- #f))
- (thethread (lambda ()
- (thread-start!
- (make-thread
- (lambda ()
- (set! *sync-in-progress* #t)
- (debug:print-info "Running "sync-cmd)
- (if (file-exists? syncer-running-file)
- (debug:print-info 0 *default-log-port* "Syncer still running, skipping syncer start.")
- (system sync-cmd))
- (set! *sync-in-progress* #f)))))))
- (if ((if (eq? syncdir 'todisk) < >) ;; use less than for todisk, greater than for from disk
- (file-modification-time tmpdb)
- (file-modification-time dbfullname))
- (debug:print 4 *default-log-port* "Skipping sync, "tmpdb" older than "dbfullname)
- (if synclock-mod-time
- (if (> (- (current-seconds) synclock-mod-time) 20) ;; something wrong with sync, remove file
- (begin
- (handle-exceptions
- exn
- #f
- (begin
- (debug:print 0 *default-log-port* "Sync lock file " synclock-file "is older than 20 seconds (" synclock-mod-time " seconds). Removing it")
- (delete-file synclock-file)
- )
- )
- (thethread))
- (debug:print 0 *default-log-port* "Skipping sync, lockfile "synclock-file" found."))
- (thethread)))))))
+ (begin
+ ;; turn off writes - send busy or block?
+ ;; call db2db internally
+ ;; turn writes back on
+ ;;
+ (set! *api-halt-writes* #t) ;; do we need a mutex?
+ ;; (dbmod:db-to-db-sync src-db dest-db last-update (dbfile:db-init-proc) keys)
+ (debug:print-info 2 *default-log-port* "Internal sync running from "tmpdb" to "dbfullname)
+ (dbmod:db-to-db-sync tmpdb dbfullname last-update (dbfile:db-init-proc) keys)
+ (set! *api-halt-writes* #f)
+ ))))
;; (dbmod:sync-tables tables #f db cachedb)
;;
(thread-sleep! 1) ;; let things settle before syncing in needed data
(dbmod:sync-gasket tables #f cachedb db dbfullname 'fromdest keys) ;; ) ;; load into cachedb
(dbr:dbstruct-last-update-set! dbstruct (+ (current-seconds) -10)) ;; should this be offset back in time by one second?
@@ -856,6 +828,94 @@
(res (dbmod:sync-gasket tables last-update sdb ddb dest-db 'todisk keys)))
(sqlite3:finalize! sdb)
(sqlite3:finalize! ddb)
res)))
#f))
+
+;; ======================================================================
+;; dbstats
+;;======================================================================
+
+;; (define *dbstruct-dbs* #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this.
+;; db stats
+(define *db-stats* (make-hash-table)) ;; hash of vectors < count duration-total >
+(define *db-stats-mutex* (make-mutex))
+
+(define (rmt:print-db-stats)
+ (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f"
+ (debug:print 0 *default-log-port* "DB Stats\n========")
+ (debug:print 0 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg"))
+ (for-each (lambda (cmd)
+ (let* ((dat (hash-table-ref *db-stats* cmd))
+ (count (dbstat-cnt dat))
+ (tottime (dbstat-tottime dat)))
+ (debug:print 0 *default-log-port*
+ (format #f fmtstr cmd count tottime
+ (/ tottime count)))))
+ (sort (hash-table-keys *db-stats*)
+ (lambda (a b)
+ (> (dbstat-tottime (hash-table-ref *db-stats* a))
+ (dbstat-tottime (hash-table-ref *db-stats* b))))))))
+
+(defstruct dbstat
+ (cnt 0)
+ (tottime 0))
+
+(define (db:add-stats cmd run-id params delta)
+ (let* ((modified-cmd (if (eq? cmd 'general-call)
+ (string->symbol (conc "general-call-" (car params)))
+ cmd))
+ (rec (hash-table-ref/default *db-stats* modified-cmd #f)))
+ (if (not rec)
+ (let ((new-rec (make-dbstat)))
+ (hash-table-set! *db-stats* modified-cmd new-rec)
+ (set! rec new-rec)))
+ (dbstat-cnt-set! rec (+ (dbstat-cnt rec) 1))
+ (dbstat-tottime-set! rec (+ (dbstat-tottime rec) delta))))
+
+
+
)
+
+
+;; ATTIC
+
+ #;(let* ((syncer-logfile (conc areapath"/logs/"dbfname"-syncer.log"))
+ (sync-cmd (if (eq? syncdir 'todisk)
+ (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "tmpdb" -to "dbfullname" -period 5 -timeout 10 > /dev/null 2&>1)&")
+ (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "dbfullname" -to "tmpdb" -period 5 -timeout 10 > /dev/null 2&>1)&")))
+ (synclock-file (conc dbfullname".lock"))
+ (syncer-running-file (conc dbfullname"-sync-running"))
+ (synclock-mod-time (if (file-exists? synclock-file)
+ (handle-exceptions
+ exn
+ #f
+ (file-modification-time synclock-file))
+ #f))
+ (thethread (lambda ()
+ (thread-start!
+ (make-thread
+ (lambda ()
+ (set! *sync-in-progress* #t)
+ (debug:print-info "Running "sync-cmd)
+ (if (file-exists? syncer-running-file)
+ (debug:print-info 0 *default-log-port* "Syncer still running, skipping syncer start.")
+ (system sync-cmd))
+ (set! *sync-in-progress* #f)))))))
+ (if ((if (eq? syncdir 'todisk) < >) ;; use less than for todisk, greater than for from disk
+ (file-modification-time tmpdb)
+ (file-modification-time dbfullname))
+ (debug:print 4 *default-log-port* "Skipping sync, "tmpdb" older than "dbfullname)
+ (if synclock-mod-time
+ (if (> (- (current-seconds) synclock-mod-time) 20) ;; something wrong with sync, remove file
+ (begin
+ (handle-exceptions
+ exn
+ #f
+ (begin
+ (debug:print 0 *default-log-port* "Sync lock file " synclock-file "is older than 20 seconds (" synclock-mod-time " seconds). Removing it")
+ (delete-file synclock-file)
+ )
+ )
+ (thethread))
+ (debug:print 0 *default-log-port* "Skipping sync, lockfile "synclock-file" found."))
+ (thethread))))
Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -236,11 +236,10 @@
(let loop ((minutes (calc-minutes))
(cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
(disk-free (get-df (current-directory)))
(last-sync (current-seconds)))
- ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
(let* ((over-time (> (current-seconds) (+ last-sync update-period)))
(new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
(delta (abs (- load cpu-load))))
(if (> delta 0.1) ;; don't bother updating with small changes
load
@@ -256,15 +255,16 @@
(do-sync (or new-cpu-load new-disk-free over-time))
(test-info (rmt:get-test-state-status-by-id run-id test-id))
(state (car test-info));; (db:test-get-state test-info))
(status (cdr test-info));; (db:test-get-status test-info))
+ (killreq (equal? state "KILLREQ"))
(kill-reason "no kill reason specified")
(kill-job? #f))
;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
(cond
- ((test-get-kill-request run-id test-id)
+ (killreq
(set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
(set! kill-job? #t))
((and runtlim (> (- (current-seconds) start-seconds) runtlim))
(set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
(set! kill-job? #t))
@@ -276,16 +276,11 @@
(debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
(if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty
(launch:handle-zombie-tests run-id))
(when do-sync
- ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
- ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
- ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds)))
- (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
- ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds)))
- )
+ (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
(if kill-job?
(begin
(debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
(mutex-lock! m)
@@ -331,18 +326,17 @@
)))
(mutex-unlock! m)
;; no point in sticking around. Exit now. But run end of run before exiting?
(launch:end-of-run-check run-id)
(exit)))
- (if (hash-table-ref/default misc-flags 'keep-going #f)
+ (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
(begin
- (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses
- (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
- (loop (calc-minutes)
- (or new-cpu-load cpu-load)
- (or new-disk-free disk-free)
- (if do-sync (current-seconds) last-sync)))))))
+ (thread-sleep! 6) ;; was 3
+ (loop (calc-minutes)
+ (or new-cpu-load cpu-load)
+ (or new-disk-free disk-free)
+ (if do-sync (current-seconds) last-sync))))))
(tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
(define (launch:execute encoded-cmd)
(let* ((cmdinfo (common:read-encoded-string encoded-cmd))
Index: megatest-version.scm
==================================================================
--- megatest-version.scm
+++ megatest-version.scm
@@ -18,6 +18,6 @@
;; Always use two or four digit decimal
;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..
;; (declare (unit megatest-version))
-(define megatest-version 1.8017)
+(define megatest-version 1.8018)
Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -968,12 +968,13 @@
(tl (launch:setup))
(keys (keys:config-get-fields *configdat*)))
(case (rmt:transport-mode)
((tcp)
(let* ((timeout (server:expiration-timeout)))
- (debug:print 0 *default-log-port* "INFO: Running using tcp method with server timeout of "timeout)
+ (debug:print 0 *default-log-port* "INFO: Starting server for " dbfname " using tcp method with server timeout of "timeout)
(tt-server-timeout-param timeout)
+ (thread-start! (make-thread api:print-db-stats "print-db-stats"))
(if dbfname
(tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys)
(begin
(debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -db is required.")
(exit 1)))))
@@ -1091,10 +1092,14 @@
sfiles
)
)
)
dbfiles
+ )
+ ;; remove this db, because otherwise metadata contains records for old servers, and this causes a problem with db:no-sync-get-lock-with-id.
+ (if (file-exists? (conc *toppath* "/.mtdb/no-sync.db"))
+ (delete-file (conc *toppath* "/.mtdb/no-sync.db"))
)
(set! *didsomething* #t)
(exit)
)
)
@@ -2132,13 +2137,13 @@
(exit 1)))
(if (common:file-exists? (conc *toppath* "/megatest.db"))
(begin
(debug:print-info 1 *default-log-port* "File " (conc *toppath* "/megatest.db") " already exists. Please remove it before trying to replicate db")
(exit 1)))
- (if (and (common:get-db-tmp-area) (> (length (directory (common:get-db-tmp-area) #f)) 0))
+ (if (and (common:make-tmpdir-name *toppath* "") (> (length (directory (common:make-tmpdir-name *toppath* "") #f)) 0))
(begin
- (debug:print-info 1 *default-log-port* (common:get-db-tmp-area) " not empty. Please remove it before trying to replicate db")
+ (debug:print-info 1 *default-log-port* (common:make-tmpdir-name *toppath* "") " not empty. Please remove it before trying to replicate db")
(exit 1)))
;; check if timestamp
(let* ((source (args:get-arg "-source"))
(src (if (not (equal? (substring source 0 1) "/"))
(conc (current-directory) "/" source)
Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -68,82 +68,56 @@
;;======================================================================
(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id
-;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
-;;
-(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected
+;; how to make area-dat
+(define (rmt:set-ttdat areapath ttdat)
+ (if ttdat
+ ttdat
+ (let* ((newremote (make-and-init-remote areapath)))
+ (set! *ttdat* newremote)
+ newremote)))
+
+;; NB// area-dat replaced by ttdat
+;;
+(define (rmt:send-receive cmd run-id params #!key (attemptnum 1)(ttdat #f))
(assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.")
-
- (if (not (eq? (rmt:transport-mode) 'nfs))
- (begin
- (if (> attemptnum 2)
- (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum))
-
- (cond
- ((> attemptnum 2) (thread-sleep! 0.05))
- ((> attemptnum 10) (thread-sleep! 0.5))
- ((> attemptnum 20) (thread-sleep! 1)))
-
- ;; I'm turning this off, it may make sense to move it
- ;; into http-transport-handler
- (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15)))
- (begin
- (debug:print 0 *default-log-port* "ERROR: can't connect to server, trying to start a server.")
- (case (rmt:transport-mode)
- ((http)
- (server:run *toppath*)
- (thread-sleep! 3))
- (else
- (thread-sleep! 1) ;; for tcp the server is started by routines in tcp-transportmod. For nfs there is no server
- ))))))
-
- ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
- ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
- ;; 3. do the query, if on homehost use local access
- ;;
(let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value
(areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
- (runremote (or area-dat
- *runremote*))
(attemptnum (+ 1 attemptnum))
- (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*))
+ (readonly-mode (rmtmod:calc-ro-mode ttdat *toppath*))
(testsuite (common:get-testsuite-name))
- (mtexe (common:find-local-megatest)))
-
- (case (rmt:transport-mode)
- ((http)(http-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode))
- ((tcp) (tcp-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe))
- ((nfs) (nfs-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe))
- )))
-
-(define (nfs-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
- (let* ((keys (common:get-fields *configdat*))
- (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath tmpadj: "/dashboard")))
- (api:dispatch-request dbstruct cmd run-id params)))
-
-(define (tcp-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
- (if (not runremote)
- (let* ((newremote (make-and-init-remote areapath)))
- (set! *runremote* newremote)
- (set! runremote newremote)))
- (let* ((dbfname (conc (dbfile:run-id->dbnum run-id)".db"))) ;;(dbfile:run-id->path areapath run-id)))
- (tt:handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))
-
-(define (rmt:print-db-stats)
- (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f"
- (debug:print 18 *default-log-port* "DB Stats\n========")
- (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg"))
- (for-each (lambda (cmd)
- (let ((cmd-dat (hash-table-ref *db-stats* cmd)))
- (debug:print 18 *default-log-port* (format #f fmtstr cmd (vector-ref cmd-dat 0) (vector-ref cmd-dat 1) (/ (vector-ref cmd-dat 1)(vector-ref cmd-dat 0))))))
- (sort (hash-table-keys *db-stats*)
- (lambda (a b)
- (> (vector-ref (hash-table-ref *db-stats* a) 0)
- (vector-ref (hash-table-ref *db-stats* b) 0)))))))
-
+ (mtexe (common:find-local-megatest))
+ (dbfname (conc (dbfile:run-id->dbnum run-id)".db"))
+ (ttdat (rmt:set-ttdat areapath ttdat))
+ (conn (tt:get-conn ttdat dbfname))
+ (is-main (equal? dbfname "main.db")) ;; why not (not run-id) ?
+ (server-start-proc (if is-main
+ #f
+ (lambda ()
+ ;; (debug:print-info 0 *default-log-port* "starting server for dbfname: "dbfname)
+ (rmt:start-server ;; tt:server-process-run
+ areapath
+ testsuite ;; (dbfile:testsuite-name)
+ mtexe
+ run-id)))))
+ ;; here we look at ttdat, if dbfname is NOT main.db we check that a conn exists for it
+ ;; and if there is no conn we first send a request to the main.db server to start a
+ ;; server for the dbfname.
+ #;(if (and (not is-main)(not conn)) ;; no existing connection to non-main server, call in a start up request
+ (begin
+ (server-start-proc)
+ (thread-sleep! 1)))
+ (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc)))
+
+;; KEEP THIS HERE FOR A WHILE, WE MAY WANT TO RESURECT IT
+;; (define (nfs-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
+;; (let* ((keys (common:get-fields *configdat*))
+;; (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath tmpadj: "/dashboard")))
+;; (api:dispatch-request dbstruct cmd run-id params)))
+
(define (rmt:get-max-query-average run-id)
(mutex-lock! *db-stats-mutex*)
(let* ((runkey (conc "run-id=" run-id " "))
(cmds (filter (lambda (x)
(substring-index runkey x))
@@ -167,11 +141,11 @@
(mutex-unlock! *db-stats-mutex*)
res))
(define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5))
(let* ((qry-is-write (not (member cmd api:read-only-queries)))
- (db-file-path (db:dbfile-path)) ;; 0))
+ (db-file-path (common:make-tmpdir-name *toppath* "")) ;; 0))
(dbstructs-local (db:setup #t))
(read-only (not (file-write-access? db-file-path)))
(start (current-milliseconds))
(resdat (if (not (and read-only qry-is-write))
(let ((v (api:execute-requests dbstructs-local (vector (symbol->string cmd) params))))
@@ -204,11 +178,11 @@
;; (rmt:update-db-stats run-id cmd params duration)
;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it
(if qry-is-write
(let ((start-time (current-seconds)))
(mutex-lock! *db-multi-sync-mutex*)
-/ (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client)
+ (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client)
(mutex-unlock! *db-multi-sync-mutex*)))))
res))
;;======================================================================
;;
@@ -221,12 +195,12 @@
;;======================================================================
(define (rmt:kill-server run-id)
(rmt:send-receive 'kill-server run-id (list run-id)))
-(define (rmt:start-server run-id)
- (rmt:send-receive 'start-server 0 (list run-id)))
+(define (rmt:start-server areapath testsuite mtexe run-id) ;; run on main.db server
+ (rmt:send-receive 'start-server #f (list areapath testsuite mtexe run-id)))
;;======================================================================
;; M I S C
;;======================================================================
@@ -235,16 +209,16 @@
;; This login does no retries under the hood - it acts a bit like a ping.
;; Deprecated for nmsg-transport.
;;
;; (define (rmt:login-no-auto-client-setup runremote)
-;; (rmt:send-receive-no-auto-client-setup runremote 'login 0 (list *toppath* megatest-version (client:get-signature))))
+;; (rmt:send-receive-no-auto-client-setup runremote 'login #f (list *toppath* megatest-version (client:get-signature))))
;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host
(define (rmt:get-latest-host-load hostname)
- (rmt:send-receive 'get-latest-host-load 0 (list hostname)))
+ (rmt:send-receive 'get-latest-host-load #f (list hostname)))
(define (rmt:sdb-qry qry val run-id)
;; add caching if qry is 'getid or 'getstr
(rmt:send-receive 'sdb-qry run-id (list qry val)))
Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -2064,11 +2064,11 @@
;;
;; There is now a single call to runs:update-all-test_meta and this
;; per-test call is not needed. Given the delicacy of the move to
;; v1.55 this code is being left in place for the time being.
;;
- (if (not (hash-table-ref/default *test-meta-updated* test-name #f))
+ (if (not (hash-table-exists? *test-meta-updated* test-name))
(begin
(hash-table-set! *test-meta-updated* test-name #t)
(runs:update-test_meta test-name test-conf)))
;; itemdat => ((ripeness "overripe") (temperature "cool") (season "summer"))
Index: tasks.scm
==================================================================
--- tasks.scm
+++ tasks.scm
@@ -84,11 +84,11 @@
(tasks:open-db numretries (- numretries 1)))
(begin
(print-call-chain (current-error-port))
(debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
(debug:print 5 *default-log-port* " exn=" (condition->list exn))))
- (let* ((dbpath (db:dbfile-path )) ;; (tasks:get-task-db-path))
+ (let* ((dbpath (common:make-tmpdir-name *toppath* "")) ;; (tasks:get-task-db-path))
(dbfile (conc dbpath "/monitor.db"))
(avail (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away
(exists (common:file-exists? dbpath))
(write-access (file-write-access? dbpath))
(mdb (cond ;; what the hek is *toppath* doing here?
Index: tcp-transportmod.scm
==================================================================
--- tcp-transportmod.scm
+++ tcp-transportmod.scm
@@ -111,10 +111,11 @@
;;
(define tt-server-timeout-param (make-parameter 600))
;; make ttdat visible
(define *server-info* #f)
+(define *server-run* #t)
(define (tt:make-remote areapath)
(make-tt areapath: areapath))
;; 1 ... or #f
@@ -125,33 +126,40 @@
(and (or (number? run-id)
(not run-id))
(equal? (dbfile:run-id->dbfname run-id) dbfname)))
(tcp-buffer-size 2048)
-;; (max-connections 4096)
+;; (max-connections 4096)
+
+(define (tt:get-conn ttdat dbfname)
+ (hash-table-ref/default (tt-conns ttdat) dbfname #f))
;; do all the busy work of finding and setting up conn for
;; connecting to a server
;;
-(define (tt:client-connect-to-server ttdat dbfname run-id testsuite)
+(define (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)
(assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id)
- (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f))
- (server-start-proc (lambda ()
- (tt:server-process-run
- (tt-areapath ttdat)
- testsuite ;; (dbfile:testsuite-name)
- (common:find-local-megatest)
- run-id))))
+ (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id)
+ (let* ((conn (tt:get-conn ttdat dbfname))
+ (server-start-proc (or server-start-proc
+ (lambda ()
+ (assert (equal? dbfname "main.db") ;; only main.db is started here
+ "FATAL: called server-start-proc for db other than main.db")
+ (tt:server-process-run
+ (tt-areapath ttdat)
+ testsuite ;; (dbfile:testsuite-name)
+ (common:find-local-megatest)
+ run-id)))))
(if conn
(begin
- ; (debug:print-info 0 *default-log-port* "already connected to the server")
+ (debug:print-info 2 *default-log-port* "already connected to a server")
conn) ;; we are already connected to the server
(let* ((sdat (tt:get-current-server-info ttdat dbfname)))
(match sdat
((host port start-time server-id pid dbfname2 servinffile)
(assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.")
- ;(debug:print-info 0 *default-log-port* "in match servinffile:" servinffile)
+ (debug:print-info 2 *default-log-port* "no conn - in match servinffile:" servinffile)
(let* ((host-port (conc host":"port))
(conn (make-tt-conn
host: host
port: port
host-port: host-port
@@ -162,35 +170,41 @@
pid: pid)))
;; verify we can talk to this server
(let* ((result (tt:timed-ping host port server-id))
(ping-res (car result))
(ping (cdr result)))
- (debug:print-info 0 *default-log-port* "ping time: " ping)
+ (debug:print-info 2 *default-log-port* "host " host " port " port " ping time: " ping " result " ping-res)
(case ping-res
((running)
+ (debug:print-info 2 *default-log-port* "Setting conn = " conn " in hash table")
(hash-table-set! (tt-conns ttdat) dbfname conn) ;;; is this ok to save before validating that the connection is good?
conn)
((starting)
(thread-sleep! 0.5)
- (tt:client-connect-to-server ttdat dbfname run-id testsuite))
+ (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect")
+ (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))
(else
(let* ((curr-secs (current-seconds)))
;; rm the (last server) would go here
(if (> (- curr-secs (tt-last-serv-start ttdat)) 10)
(begin
(tt-last-serv-start-set! ttdat curr-secs)
- (server-start-proc))) ;; start server if 30 sec since last attempt
+ (server-start-proc))) ;; start server if 10 sec since last attempt
(thread-sleep! 1)
- (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
+ (debug:print-info 2 *default-log-port* "server ping result was neither running nor starting. Retrying connect")
+ (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)))))))
(else ;; no good server found, if haven't started server in > 5 secs, start another
- (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 5) ;; BUG - grow this number really do not want to swamp the machine with servers
+ (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers
(begin
- (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
+ (debug:print-info 0 *default-log-port* "Starting server for "dbfname)
(server-start-proc)
- (tt-last-serv-start-set! ttdat (current-seconds))))
+ (tt-last-serv-start-set! ttdat (current-seconds))
+ (thread-sleep! 3)
+ ))
(thread-sleep! 1)
- (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
+ (debug:print-info 0 *default-log-port* "Connect to server for " dbfname)
+ (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)))))))
(define (tt:timed-ping host port server-id)
(let* ((start-time (current-milliseconds))
(result (tt:ping host port server-id)))
(cons result (- (current-milliseconds) start-time))))
@@ -222,18 +236,18 @@
;; client side handler
;;
;;(tt:handler # get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
;;
-(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)
- ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.
- (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f)))
+(define (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc)
+ ;; connect-to-server will start a server if needed.
+ (let* ((areapath (tt-areapath ttdat))
+ (conn (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))) ;; looks up conn keyed by dbfname
(if conn
;; have connection, call the server
(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
;; res is (status errmsg result meta)
- ; (debug:print 0 *default-log-port* "conn:" conn " res: " res)
(match res
((status errmsg result meta)
(if (list? meta)
(let* ((delay-wait (alist-ref 'delay-wait meta)))
(if (and (number? delay-wait)
@@ -241,35 +255,36 @@
(begin
(debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds")
(thread-sleep! delay-wait)))))
(case status
((busy) ;; result will be how long the server wants you to delay
- (let* ((dly (if (number? result) result 0.1)))
- (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, will try again in "dly" seconds.")
+ (let* ((raw-dly (if (number? result) result 0.1))
+ (dly (+ raw-dly (/ attemptnum 10)))) ;; (* raw-dly (/ attemptnum 2))))
+ (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, cmd is "cmd", will try again in "dly" seconds. This is attempt "(- attemptnum 1))
(thread-sleep! dly)
- (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)))
+ (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)))
((loaded)
(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.")
(tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn))
- result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+ result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe))
(else
result)))
(else ;; did not receive properly formated result
- (if (not res) ;; tt:handler is telling us that communication failed
+ (if (not res) ;; tt:send-receive telling us that communication failed
(let* ((host (tt-conn-host conn))
(port (tt-conn-port conn))
;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
(pid (tt-conn-pid conn))
;;(servinf (tt-conn-servinf-file conn)))
(servinf (tt-servinf-file ttdat))) ;; (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname))) ;; TODO, use (server:get-servinfo-dir areapath)
- (hash-table-set! (tt-conns ttdat) dbfname #f)
+ (hash-table-set! (tt-conns ttdat) dbfname #f) ;; clear out the conn for this dbfname to force finding new server
(if (and servinf (file-exists? servinf))
(begin
(if (< attemptnum 10)
(begin
(thread-sleep! 0.5)
- (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+ (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))
(begin
(debug:print 0 *default-log-port* "INFO: no response from server "host":"port" for "dbfname)
(if (and (file-exists? servinf)
(> (- (current-seconds)(file-modification-time servinf)) 60))
(begin
@@ -276,30 +291,30 @@
(debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.")
(handle-exceptions
exn
#f
(delete-file* servinf))
- (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+ (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))
(begin
;; start server - addressed in client-connect-to-server
;; delay - addressed in client-connect-to-server
;; try again
(thread-sleep! 0.25) ;; dunno, I think this needs to be here
- (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
+ (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))
))))
(begin ;; no server file, delay and try again
- (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf)
+ (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", no servinf file. Server exited? ")
(thread-sleep! 0.5)
- (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))))
+ (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))))
(begin ;; this case is where res is malformed. Probably should abort
(assert #f "FATAL: tt:handler received bad data "res)
;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.")
- ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)
+ ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe)
)))))
(begin
(thread-sleep! 1) ;; no conn yet set up, give it a rest and try again
- (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))))
+ (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc)))))
(define (tt:bid-for-servership run-id)
#f)
;; gets server info and appends path to server file
@@ -464,14 +479,16 @@
;; Server viability is checked in keep-running. Blindly start and run here.
;;
(define (tt:start-server areapath run-id dbfname-in handler keys)
(assert areapath "FATAL: areapath not provided for tt:start-server")
;; is there already a server for this dbfile? Then exit.
+ (debug:print 2 *default-log-port* "tt:start-server: " dbfname-in)
(let* ((ttdat (make-tt areapath: areapath))
(dbfname (or dbfname-in (dbmod:run-id->dbfname run-id)))
(servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead
- (if (> (length servers) 4)
+ (debug:print 0 *default-log-port* "Found " (length servers) " already running for " dbfname)
+ (if (> (length servers) 0)
(begin
(debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.")
(exit))
(let* ((dbstruct (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys)))
(tt-handler-set! ttdat (handler dbstruct))
@@ -522,18 +539,17 @@
(let* ((db-locked-in #f)
(areapath (tt-areapath ttdat))
(nosyncdbpath (conc areapath"/.mtdb"))
(cleanup (lambda ()
(if (tt-cleanup-proc ttdat)
- ((tt-cleanup-proc ttdat)))
+ ((tt-cleanup-proc ttdat))) ;; removes .servinfo file
(dbfile:with-no-sync-db nosyncdbpath
(lambda (db)
(let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct)))
- (debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname)
+ ;; (debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname)
(db:no-sync-del! db dbfname)
- #;(if dbtmpname
- (delete-file dbtmpname))))))))
+ ))))))
(set! *server-info* ttdat)
(let loop ((count 0))
(if (> count 240)
(begin
(debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.")
@@ -548,10 +564,13 @@
;; in over ten seconds we exit
(thread-sleep! 0.05) ;; any real need for delay here?
(let loop ()
(let* ((servers (tt:get-server-info-sorted ttdat dbfname))
(ok (cond
+ ((not *server-run*)
+ (debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.")
+ #f)
((null? servers) #f) ;; not ok
((equal? (list-ref (car servers) 6) ;; compare the servinfofile
(tt-servinf-file ttdat))
(let* ((res (if db-locked-in
#t
@@ -576,10 +595,11 @@
(if (and res (common:low-noise-print 120 "top server message"))
(debug:print-info 0 *default-log-port* "Keep running, I'm the top server for "
dbfname" on "(tt-host ttdat)":"(tt-port ttdat)))
res))
(else
+ ;; wrong servinfo file
(debug:print-info 0 *default-log-port* "I'm not the lead server: "servers)
(let* ((leadsrv (car servers)))
(match leadsrv
((host port startseconds server-id pid dbfname servinfofile)
(let* ((result (tt:timed-ping host port server-id))
@@ -616,12 +636,13 @@
(let* ((last-update (dbr:dbstruct-last-update dbstruct))
(curr-secs (current-seconds)))
(if (and (eq? (tt-state ttdat) 'running)
(> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db?
- (begin
- (set! (file-modification-time (tt-servinf-file ttdat)) (current-seconds))
+ (let* ((sinfo-file (tt-servinf-file ttdat)))
+ ;; (debug:print 0 *default-log-port* "INFO: touching "sinfo-file)
+ (set! (file-modification-time sinfo-file) (current-seconds))
((dbr:dbstruct-sync-proc dbstruct) last-update)
(dbr:dbstruct-last-update-set! dbstruct curr-secs))))
(if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param))
(begin
@@ -704,12 +725,28 @@
;; if more than one, wait one second and look again
;; future: ping oldest, if alive remove other : files
;;
(define (tt:find-server areapath dbfname)
(let* ((servdir (tt:get-servinfo-dir areapath))
- (sfiles (glob (conc servdir"/*:"dbfname))))
- sfiles))
+ (sfiles (glob (conc servdir"/*:"dbfname)))
+ (goodfiles '()))
+
+ ;; filter the files here by looking in processes table (if we are not main.db)
+ ;; and or look at the time stamp on the servinfo file, a running server will
+ ;; touch the file every minute (again, this will only apply for main.db)
+ (for-each (lambda (fname)
+ (let* ((age (- (current-seconds)(file-modification-time fname))))
+ (if (> age 200) ;; can't trust it if over twenty seconds old
+ (begin
+ (debug:print 0 *default-log-port* "WARNING: removing stale servinfo file "fname)
+ (handle-exceptions
+ exn
+ (debug:print 0 *default-log-port* "WARNING: error attempting to remove stale servinfo file "fname)
+ (delete-file fname))) ;;
+ (set! goodfiles (cons fname goodfiles)))))
+ sfiles)
+ goodfiles))
;; given a path to a server info file return: host port startseconds server-id pid dbfname logf
;; example of what it's looking for in the log file:
;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4
;;
@@ -719,12 +756,13 @@
(dbprep-found 0)
(bad-dat (list #f #f #f #f #f #f logf)))
(let ((fdat (handle-exceptions
exn
(begin
- ;; WARNING: this is potentially dangerous to blanket ignore the errors
- (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn="(condition->list exn))
+ ;; BUG, TODO: add err checking, for now blanket ignore the errors?
+ (debug:print-info 0 *default-log-port* "Unable to get server info from "logf
+ ", exn="(condition->list exn))
'()) ;; no idea what went wrong, call it a bad server, return empty list
(with-input-from-file logf read-lines))))
(if (null? fdat) ;; bad data, return bad-dat
bad-dat
(let loop ((inl (car fdat))
@@ -750,10 +788,17 @@
logf))
(else
(debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst)
bad-dat)))))))))
+(define *last-server-start* (make-hash-table))
+
+(define (tt:too-recent-server-start dbfname)
+ (let* ((last-run-time (hash-table-ref/default *last-server-start* dbfname #f)))
+ (and last-run-time
+ (< (- (current-seconds) last-run-time) 5))))
+
;; Given an area path, start a server process ### NOTE ### > file 2>&1
;; if the target-host is set
;; try running on that host
;; incidental: rotate logs in logs/ dir.
;;
@@ -760,51 +805,58 @@
(define (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area
(assert areapath "FATAL: tt:server-process-run called without areapath defined.")
(assert testsuite "FATAL: tt:server-process-run called without testsuite defined.")
(assert mtexe "FATAL: tt:server-process-run called without mtexe defined.")
;; mtest -server - -m testsuite:ext-tests -db 6.db
- (let* ((dbfname (dbmod:run-id->dbfname run-id))
- (load (get-normalized-cpu-load))
- (trying (length (tt:find-server areapath dbfname)))
- (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname))))
- (cond
- ((> load 2.0)
- (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server.")
- (thread-sleep! 1))
- ((> nrun 100)
- (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.")
- (thread-sleep! 1))
- ((> trying 4)
- (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
- (thread-sleep! 1))
- (else
- (if (not (file-exists? (conc areapath"/logs")))
- (create-directory (conc areapath"/logs") #t))
- (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))
- (cmdln (conc
- mtexe
- " -startdir "areapath
- " -server - ";; (or target-host "-")
- " -m testsuite:"testsuite
- " -db "dbfname ;; (dbmod:run-id->dbfname run-id)
- " " profile-mode
- (conc " >> " logfile " 2>&1 &"))))
- ;; we want the remote server to start in *toppath* so push there
- ;; (push-directory areapath) ;; use cd in the command line instead
- (debug:print 0 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath)
- ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
-
- (system cmdln)
- ;; ;; use below to go back to nbfake - nbfake does cause trouble ...
- ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
- ;; (setenv "NBFAKE_LOG" logfile)
- ;; (system (conc "cd "areapath" ; nbfake " cmdln))
- ;; (unsetenv "NBFAKE_QUIET")
- ;; (unsetenv "NBFAKE_LOG")
-
- ;;(pop-directory)
- )))))
+ (let* ((dbfname (dbmod:run-id->dbfname run-id)))
+ (if (tt:too-recent-server-start dbfname)
+ #f
+ (let* ((load (get-normalized-cpu-load))
+ (srvrs (tt:find-server areapath dbfname))
+ (trying (length srvrs))
+ (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname))))
+ (cond
+ ((> load 2.0)
+ (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes")
+ (thread-sleep! 1)
+ #f)
+ ((> nrun 100)
+ (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.")
+ (thread-sleep! 1)
+ #f)
+ ((> trying 2)
+ (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
+ (thread-sleep! 1)
+ #f)
+ (else
+ (if (not (file-exists? (conc areapath"/logs")))
+ (create-directory (conc areapath"/logs") #t))
+ (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))
+ (cmdln (conc
+ mtexe
+ " -startdir "areapath
+ " -server - ";; (or target-host "-")
+ " -m testsuite:"testsuite
+ " -db "dbfname ;; (dbmod:run-id->dbfname run-id)
+ " " profile-mode
+ (conc " >> " logfile " 2>&1 &"))))
+ ;; we want the remote server to start in *toppath* so push there
+ ;; (push-directory areapath) ;; use cd in the command line instead
+ (debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath)
+ ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
+
+ (system cmdln)
+ (hash-table-set! *last-server-start* dbfname (current-seconds))
+ ;; ;; use below to go back to nbfake - nbfake does cause trouble ...
+ ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
+ ;; (setenv "NBFAKE_LOG" logfile)
+ ;; (system (conc "cd "areapath" ; nbfake " cmdln))
+ ;; (unsetenv "NBFAKE_QUIET")
+ ;; (unsetenv "NBFAKE_LOG")
+
+ ;;(pop-directory)
+ #t)))))))
;;======================================================================
;; tcp connection stuff
;;======================================================================
Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -1964,11 +1964,12 @@
;; test steps
;;======================================================================
;; teststep-set-status! used to be here
-(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)
+;; NOT NEEDED
+#;(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)
(let* ((testdat (rmt:get-test-state-status-by-id run-id test-id)))
(and testdat
(equal? (car testdat) "KILLREQ"))))
(define (test:tdb-get-rundat-count tdb)
Index: utils/mt_xterm
==================================================================
--- utils/mt_xterm
+++ utils/mt_xterm
@@ -20,18 +20,16 @@
MT_TMPDISPLAY=$DISPLAY
MT_TMPUSER=$USER
MT_HOME=$HOME
tmpfile=`mktemp`
-
-grep -v "export USER=" megatest.sh | grep -v "export HOME=" > $tmpfile
-source $tmpfile
-rm $tmpfile
-
-# if [ -e megatest.sh ];then
-#source megatest.sh
-#fi
+if [[ -e megatest.sh ]]; then
+ grep -v "export USER=" megatest.sh | grep -v "export HOME=" > $tmpfile
+ source $tmpfile
+ rm $tmpfile
+fi
+
export DISPLAY=$MT_TMPDISPLAY
export USER=$USER
export HOME=$MT_HOME
if [ x"$MT_XTERM_CMD" == "x" ];then