Index: TODO
==================================================================
--- TODO
+++ TODO
@@ -16,10 +16,18 @@
# along with Megatest. If not, see .
TODO
====
+23WW45 - the revolution branch
+. Add "fast" db start option (no handshaking over NFS)
+. Add server-ro to server types (just "server" is fine for read/write).
+. Create pause-server and resume-server calls
+. Create rsync or cp sync to MTRAH function
+. Change rmt:send-receive to divert calls to read-only server when possible
+. Change start server to call main.db server for 1..N.db servers, block until server is read for use.
+
23WW21
. Dashboard needs its own cache db in /tmp
23WW07
. Remove use of *dbstruct-dbs*
Index: archive.scm
==================================================================
--- archive.scm
+++ archive.scm
@@ -359,11 +359,11 @@
(archive-dir (if archive-info (cdr archive-info) #f))
(archive-id (if archive-info (car archive-info) -1))
(home-host (server:choose-server *toppath* 'homehost))
(archive-time (seconds->std-time-str (current-seconds)))
(archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time))
- (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db"))
+ (tmp-db-path (conc (dbfile:make-tmpdir-name *toppath* "") "/megatest.db"))
(dbfile (conc archive-staging-db "/megatest.db")))
(create-directory archive-staging-db #t)
(let-values (((pid-val exit-status exit-code) (run-n-wait rsync-exe params: (list "-v" (conc (car home-host) ":"tmp-db-path) archive-staging-db) print-cmd: print-prefix)))
(if (eq? exit-code 0)
(case archiver
Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -247,11 +247,11 @@
(define *common:this-exe-fullpath* (common:get-this-exe-fullpath))
(define *common:this-exe-dir* (pathname-directory *common:this-exe-fullpath*))
(define *common:this-exe-name* (pathname-strip-directory *common:this-exe-fullpath*))
(define (common:get-sync-lock-filepath)
- (let* ((tmp-area (common:get-db-tmp-area))
+ (let* ((tmp-area (dbfile:make-tmpdir-name *toppath* ""))
(lockfile (conc tmp-area "/megatest.db.lock")))
lockfile))
(define *common:logpro-exit-code->status-sym-alist*
'( ( 0 . pass )
@@ -2280,11 +2280,11 @@
(define (common:check-db-dir-space)
(let* ((required (string->number
;; default is 1GB (or actually a billion bytes) This is the number of 1 kB blocks.
(or (configf:lookup *configdat* "setup" "dbdir-space-required")
"1000000")))
- (dbdir (common:get-db-tmp-area)) ;; (db:get-dbdir))
+ (dbdir (dbfile:make-tmpdir-name *toppath* "")) ;; (db:get-dbdir))
(tdbspace (common:check-space-in-dir dbdir required))
(mdbspace (common:check-space-in-dir *toppath* required)))
(sort (list tdbspace mdbspace) (lambda (a b)
(< (cadr a)(cadr b))))))
Index: dashboard.scm
==================================================================
--- dashboard.scm
+++ dashboard.scm
@@ -402,12 +402,12 @@
(dboard:setup-tabdat dat)
(dboard:setup-num-rows dat)
dat))
(define (dboard:setup-tabdat tabdat)
- (dboard:tabdat-dbdir-set! tabdat (db:dbfile-path)) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/.db"))
- (dboard:tabdat-dbfpath-set! tabdat (db:dbfile-path))
+ (dboard:tabdat-dbdir-set! tabdat (dbfile:make-tmpdir-name *toppath* "")) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/.db"))
+ (dboard:tabdat-dbfpath-set! tabdat (dbfile:make-tmpdir-name *toppath* ""))
(dboard:tabdat-monitor-db-path-set! tabdat (conc (dboard:tabdat-dbdir tabdat) "/monitor.db"))
;; HACK ALERT: this is a hack, please fix.
(dboard:tabdat-ro-set! tabdat (not (file-read-access? (dboard:tabdat-dbfpath tabdat))))
Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -133,11 +133,11 @@
default)))
(apply sqlite3:first-result db stmt params)))
(define (db:setup do-sync)
(assert *toppath* "FATAL: db:setup called before launch:setup has been run.")
- (let* ((tmpdir (common:get-db-tmp-area)))
+ (let* ((tmpdir (dbfile:make-tmpdir-name *toppath* "")))
(if (not *dbstruct-dbs*)
(dbfile:setup do-sync *toppath* tmpdir)
*dbstruct-dbs*)))
;; moved from dbfile
@@ -267,17 +267,10 @@
", arguments: " ((condition-property-accessor 'exn 'arguments) exn)
", location: " ((condition-property-accessor 'exn 'location) exn)
))
-;; NB// #f => return dbdir only
-;; (was planned to be; zeroth db with name=main.db)
-;;
-;; If run-id is #f return to create and retrieve the path where the db will live.
-;;
-(define db:dbfile-path common:get-db-tmp-area)
-
(define (db:set-sync db)
(let ((syncprag (configf:lookup *configdat* "setup" "sychronous")))
(sqlite3:execute db (conc "PRAGMA synchronous = " (or syncprag 0) ";"))))
@@ -467,11 +460,11 @@
(get-mtime shm-file))))
;; (define (db:all-db-sync dbstruct)
;; (let* ((dbdat (db:open-db dbstruct #f db:initialize-main-db))
;; (data-synced 0) ;; count of changed records
-;; (tmp-area (common:get-db-tmp-area))
+;; (tmp-area (dbfile:make-tmpdir-name *toppath*))
;; (dbfiles (glob (conc tmp-area"/.mtdb/*.db")))
;; (sync-durations (make-hash-table))
;; (no-sync-db (db:open-no-sync-db)))
;; (for-each
;; (lambda (file) ;; tmp db file
@@ -556,11 +549,11 @@
;; run-ids: '(1 2 3 ...) or #f (for all)
;;
(define (db:multi-db-sync dbstruct . options)
(let* (;; (dbdat (db:open-db dbstruct #f dbfile:db-init-proc))
(data-synced 0) ;; count of changed records
- (tmp-area (common:get-db-tmp-area))
+ (tmp-area (dbfile:make-tmpdir-name *toppath* ""))
(old2new (member 'old2new options))
(dejunk (member 'dejunk options))
(killservers (member 'killservers options))
(src-area (if old2new *toppath* tmp-area))
(dest-area (if old2new tmp-area *toppath*))
@@ -1254,11 +1247,11 @@
;; no-sync.db - small bits of data to be shared between servers
;;======================================================================
(define (db:get-dbsync-path)
(case (rmt:transport-mode)
- ((http)(common:get-db-tmp-area))
+ ((http)(dbfile:make-tmpdir-name *toppath* ""))
((tcp) (conc *toppath*"/.mtdb"))
((nfs) (conc *toppath*"/.mtdb"))
(else "/tmp/dunno-this-gonna-exist")))
;; This is needed for api.scm
@@ -1580,11 +1573,11 @@
;; TODO: Switch this to use max(update_time) from each run db? Then if using a server there is no disk traffic (using cachedb db) ???
;;
;; NOTE: This DOESN'T (necessarily) get the real run ids, but the number of the .db!!
(define (db:get-changed-run-ids since-time)
- (let* ((dbdir (db:dbfile-path)) ;; (configf:lookup *configdat* "setup" "dbdir"))
+ (let* ((dbdir (dbfile:make-tmpdir-name *toppath* "")) ;; (configf:lookup *configdat* "setup" "dbdir"))
(alldbs (glob (conc *toppath* "/.mtdb/[0-9]*.db*")))
(changed (filter (lambda (dbfile)
(> (file-modification-time dbfile) since-time))
alldbs)))
(delete-duplicates
@@ -4314,11 +4307,11 @@
))))
;; sync for filesystem local db writes
;;
(define (db:run-lock-and-sync no-sync-db)
- (let* ((tmp-area (common:get-db-tmp-area))
+ (let* ((tmp-area (dbfile:make-tmpdir-name *toppath* ""))
(dbfiles (glob (conc tmp-area"/.mtdb/*.db")))
(sync-durations (make-hash-table)))
;; (debug:print-info 0 *default-log-port* "lock-and-sync, dbfiles: "dbfiles)
(for-each
(lambda (file)
@@ -4370,11 +4363,11 @@
(sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300))
(debug-mode (debug:debug-mode 1))
(last-time (current-seconds)) ;; last time through the sync loop
(no-sync-db (db:open-no-sync-db))
(sync-duration 0) ;; run time of the sync in milliseconds
- (tmp-area (common:get-db-tmp-area)))
+ (tmp-area (dbfile:make-tmpdir-name *toppath* "")))
;; Sync moved to http-transport keep-running loop
(debug:print-info 2 *default-log-port* "Periodic copy-based sync thread started. syncer is copy-sync, tmp-area is " tmp-area)
(debug:print-info 3 *default-log-port* "watchdog starting. syncer is copy-sync pid="(current-process-id));; " this-wd-num="this-wd-num)
(if (and legacy-sync (not *time-to-exit*))
@@ -4478,11 +4471,11 @@
(for-each
(lambda (subdb)
(let* (;;(dbstruct (db:setup))
(mtdb (dbr:subdb-mtdb subdb))
(mtpath (db:dbdat-get-path mtdb))
- (tmp-area (common:get-db-tmp-area))
+ (tmp-area (dbfile:make-tmpdir-name *toppath* ""))
(res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive
(set! sync-duration (- (current-milliseconds) sync-start))
(if (> res 0) ;; some records were transferred, keep the db alive
(begin
(mutex-lock! *heartbeat-mutex*)
Index: dbfile.scm
==================================================================
--- dbfile.scm
+++ dbfile.scm
@@ -242,11 +242,12 @@
#f
)
)
(define (dbfile:make-tmpdir-name areapath tmpadj)
- (let* ((dname (conc "/tmp/"(current-user-name)"/" (string-translate areapath "/" ".") tmpadj)))
+ (let* ((area (pathname-file areapath))
+ (dname (conc "/tmp/"(current-user-name)"/megatest_localdb/" area "/" (string-translate areapath "/" ".") tmpadj "/.mtdb")))
(unless (directory-exists? dname)
(create-directory dname #t))
dname))
(define (dbfile:run-id->path apath run-id)
@@ -487,11 +488,11 @@
;; NOTE: this is already protected by mutex *no-sync-db-mutex*
;;
(define (dbfile:raw-open-no-sync-db dbpath)
(if (not (file-exists? dbpath))
(create-directory dbpath #t))
- (debug:print-info 0 *default-log-port* "Opening "dbpath"/no-sync.db")
+ (debug:print-info 2 *default-log-port* "(dbfile:raw-open-no-sync-db: Opening "dbpath"/no-sync.db")
(let* ((dbname (conc dbpath "/no-sync.db"))
(db-exists (file-exists? dbname))
(init-proc (lambda (db)
(sqlite3:with-transaction
db
@@ -675,20 +676,26 @@
;; fails returns (#f lock-creation-time identifier)
;; succeeds (returns (#t lock-creation-time identifier)
;; use (db:no-sync-del! db keyname) to release the lock
;;
(define (db:no-sync-get-lock-with-id db keyname identifier)
+ (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: db: " db " keyname: " keyname " identifier: " identifier)
(sqlite3:with-transaction
db
(lambda ()
(condition-case
(let* ((curr-val (db:no-sync-get/default db keyname #f)))
+ (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: curr-val: " curr-val)
(if curr-val
(match (db:extract-time-identifier curr-val) ;; result->timestamp, identifier
((timestamp . ident)
(cons (equal? ident identifier) timestamp))
- (else (cons #f 'malformed-lock))) ;; lock malformed
+ (else
+ (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: malformed lock")
+ (cons #f 'malformed-lock)
+ )
+ ) ;; lock malformed
(let ((curr-sec (current-seconds))
(lock-value (if identifier
(conc (current-seconds)"+"identifier)
(current-seconds))))
(sqlite3:execute db "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-value)
Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -968,11 +968,11 @@
(tl (launch:setup))
(keys (keys:config-get-fields *configdat*)))
(case (rmt:transport-mode)
((tcp)
(let* ((timeout (server:expiration-timeout)))
- (debug:print 0 *default-log-port* "INFO: Running using tcp method with server timeout of "timeout)
+ (debug:print 0 *default-log-port* "INFO: Starting server for " dbfname " using tcp method with server timeout of "timeout)
(tt-server-timeout-param timeout)
(if dbfname
(tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys)
(begin
(debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -db is required.")
@@ -1092,10 +1092,12 @@
)
)
)
dbfiles
)
+ ;; remove this db, because otherwise metadata contains records for old servers, and this causes a problem with db:no-sync-get-lock-with-id.
+ (delete-file (conc *toppath* "/.mtdb/no-sync.db"))
(set! *didsomething* #t)
(exit)
)
)
@@ -2132,13 +2134,13 @@
(exit 1)))
(if (common:file-exists? (conc *toppath* "/megatest.db"))
(begin
(debug:print-info 1 *default-log-port* "File " (conc *toppath* "/megatest.db") " already exists. Please remove it before trying to replicate db")
(exit 1)))
- (if (and (common:get-db-tmp-area) (> (length (directory (common:get-db-tmp-area) #f)) 0))
+ (if (and (dbfile:make-tmpdir-name *toppath* "") (> (length (directory (dbfile:make-tmpdir-name *toppath* "") #f)) 0))
(begin
- (debug:print-info 1 *default-log-port* (common:get-db-tmp-area) " not empty. Please remove it before trying to replicate db")
+ (debug:print-info 1 *default-log-port* (dbfile:make-tmpdir-name *toppath* "") " not empty. Please remove it before trying to replicate db")
(exit 1)))
;; check if timestamp
(let* ((source (args:get-arg "-source"))
(src (if (not (equal? (substring source 0 1) "/"))
(conc (current-directory) "/" source)
Index: tcp-transportmod.scm
==================================================================
--- tcp-transportmod.scm
+++ tcp-transportmod.scm
@@ -132,26 +132,27 @@
;; do all the busy work of finding and setting up conn for
;; connecting to a server
;;
(define (tt:client-connect-to-server ttdat dbfname run-id testsuite)
(assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id)
+ (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id)
(let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f))
(server-start-proc (lambda ()
(tt:server-process-run
(tt-areapath ttdat)
testsuite ;; (dbfile:testsuite-name)
(common:find-local-megatest)
run-id))))
(if conn
(begin
- ; (debug:print-info 0 *default-log-port* "already connected to the server")
+ (debug:print-info 2 *default-log-port* "already connected to a server")
conn) ;; we are already connected to the server
(let* ((sdat (tt:get-current-server-info ttdat dbfname)))
(match sdat
((host port start-time server-id pid dbfname2 servinffile)
(assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.")
- ;(debug:print-info 0 *default-log-port* "in match servinffile:" servinffile)
+ (debug:print-info 2 *default-log-port* "no conn - in match servinffile:" servinffile)
(let* ((host-port (conc host":"port))
(conn (make-tt-conn
host: host
port: port
host-port: host-port
@@ -162,34 +163,38 @@
pid: pid)))
;; verify we can talk to this server
(let* ((result (tt:timed-ping host port server-id))
(ping-res (car result))
(ping (cdr result)))
- (debug:print-info 0 *default-log-port* "ping time: " ping)
+ (debug:print-info 2 *default-log-port* "host " host " port " port " ping time: " ping " result " ping-res)
(case ping-res
((running)
+ (debug:print-info 2 *default-log-port* "Setting conn = " conn " in hash table")
(hash-table-set! (tt-conns ttdat) dbfname conn) ;;; is this ok to save before validating that the connection is good?
conn)
((starting)
(thread-sleep! 0.5)
+ (debug:print-info 2 *default-log-port* "server was in starting state, retrying connect")
(tt:client-connect-to-server ttdat dbfname run-id testsuite))
(else
(let* ((curr-secs (current-seconds)))
;; rm the (last server) would go here
(if (> (- curr-secs (tt-last-serv-start ttdat)) 10)
(begin
(tt-last-serv-start-set! ttdat curr-secs)
(server-start-proc))) ;; start server if 30 sec since last attempt
(thread-sleep! 1)
+ (debug:print-info 2 *default-log-port* "server ping result was neither running nor starting. Retrying connect")
(tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
(else ;; no good server found, if haven't started server in > 5 secs, start another
(if (> (- (current-seconds) (tt-last-serv-start ttdat)) 5) ;; BUG - grow this number really do not want to swamp the machine with servers
(begin
(debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
(server-start-proc)
(tt-last-serv-start-set! ttdat (current-seconds))))
(thread-sleep! 1)
+ (debug:print-info 2 *default-log-port* "no good server found, try connect again")
(tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
(define (tt:timed-ping host port server-id)
(let* ((start-time (current-milliseconds))
(result (tt:ping host port server-id)))
@@ -223,10 +228,11 @@
;; client side handler
;;
;;(tt:handler # get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
;;
(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)
+ (debug:print 2 *default-log-port* "tt:handler cmd: " cmd " run-id: " run-id " attemptnum: " attemptnum)
;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.
(let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f)))
(if conn
;; have connection, call the server
(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
@@ -464,10 +470,11 @@
;; Server viability is checked in keep-running. Blindly start and run here.
;;
(define (tt:start-server areapath run-id dbfname-in handler keys)
(assert areapath "FATAL: areapath not provided for tt:start-server")
;; is there already a server for this dbfile? Then exit.
+ (debug:print 2 *default-log-port* "tt:start-server: " dbfname-in)
(let* ((ttdat (make-tt areapath: areapath))
(dbfname (or dbfname-in (dbmod:run-id->dbfname run-id)))
(servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead
(if (> (length servers) 4)
(begin