Overview
Comment: | reworked the sync locking, allow parallel servers (needs work to lock in to single machine) |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.80 |
Files: | files | file ages | folders |
SHA1: |
7fb44b797e70c15c101f361a2d7c61f4 |
User & Date: | matt on 2023-05-11 09:08:14 |
Other Links: | branch diff | manifest | tags |
Context
2023-05-11
| ||
09:14 | Merged fork check-in: b21843154b user: mrwellan tags: v1.80 | |
09:08 | reworked the sync locking, allow parallel servers (needs work to lock in to single machine) check-in: 7fb44b797e user: matt tags: v1.80 | |
05:43 | Use a key for the db lock-down that is unique to the db in .mtdb, this should elminate duplicate, overlapping servers. check-in: 138a40d18e user: matt tags: v1.80 | |
Changes
Modified common.scm from [10e4ec655c] to [6b6394cde8].
︙ | ︙ | |||
55 56 57 58 59 60 61 62 63 64 65 66 67 68 | (define (stop-the-train) (thread-start! (make-thread (lambda () (let loop () (if (and *toppath* (file-exists? (conc *toppath*"/stop-the-train"))) (let* ((msg (conc "ERROR: found file "*toppath*"/stop-the-train, exiting immediately"))) (print msg) (debug:print 0 *default-log-port* msg) (exit 1))) (thread-sleep! 5) (loop)))))) ;; execute thunk, return value. If exception thrown, trap exception, return #f, and emit nonfatal condition note to *default-log-port* . | > | 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | (define (stop-the-train) (thread-start! (make-thread (lambda () (let loop () (if (and *toppath* (file-exists? (conc *toppath*"/stop-the-train"))) (let* ((msg (conc "ERROR: found file "*toppath*"/stop-the-train, exiting immediately"))) ;; yes, print to current-output-port AND *default-log-port*, annoying but necessary I think (print msg) (debug:print 0 *default-log-port* msg) (exit 1))) (thread-sleep! 5) (loop)))))) ;; execute thunk, return value. If exception thrown, trap exception, return #f, and emit nonfatal condition note to *default-log-port* . |
︙ | ︙ |
Modified dbfile.scm from [82b1ce2a6f] to [deb13d8b08].
︙ | ︙ | |||
80 81 82 83 84 85 86 87 88 89 90 91 92 93 | ;; ;; for the inmem approach (see dbmod.scm) ;; this is one db per server (inmem #f) ;; handle for the in memory copy (dbfile #f) ;; path to the db file on disk (dbfname #f) ;; short name of db file on disk (used to validate accessing correct db) (ondiskdb #f) ;; handle for the on-disk file (dbdat #f) ;; create a dbdat for the downstream calls such as db:with-db (last-update 0) (sync-proc #f) ) ;; NOTE: Need one dbr:subdb per main.db, 1.db ... ;; | > | 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 | ;; ;; for the inmem approach (see dbmod.scm) ;; this is one db per server (inmem #f) ;; handle for the in memory copy (dbfile #f) ;; path to the db file on disk (dbfname #f) ;; short name of db file on disk (used to validate accessing correct db) (ondiskdb #f) ;; handle for the on-disk file (dbtmpname #f) ;; path to db file in /tmp (non-imem method) (dbdat #f) ;; create a dbdat for the downstream calls such as db:with-db (last-update 0) (sync-proc #f) ) ;; NOTE: Need one dbr:subdb per main.db, 1.db ... ;; |
︙ | ︙ | |||
514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 | (string->number res) #f))) (if newres newres res)) res))) (define (db:extract-time-identifier instr) (let ((tokens (string-split instr "+"))) (match tokens ((t i)(cons (string->number t) i)) ((t) (cons (string->number t) #f)) (else (assert #f "FATAL: db:extract-time-identifier handed bad data "instr))))) ;; transaction protected lock aquisition ;; either: | > > | | | | | > | | | 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 | (string->number res) #f))) (if newres newres res)) res))) ;; timestring+identifier+payload ;; locks are unique on identifier, payload is informational (define (db:extract-time-identifier instr) (let ((tokens (string-split instr "+"))) (match tokens ((t i)(cons (string->number t) i)) ((t) (cons (string->number t) #f)) (else (assert #f "FATAL: db:extract-time-identifier handed bad data "instr))))) ;; transaction protected lock aquisition ;; either: ;; fails returns (#f lock-creation-time identifier) ;; succeeds (returns (#t lock-creation-time identifier) ;; use (db:no-sync-del! db keyname) to release the lock ;; (define (db:no-sync-get-lock-with-id db keyname identifier) (sqlite3:with-transaction db (lambda () (condition-case (let* ((curr-val (db:no-sync-get/default db keyname #f))) (if curr-val (match (db:extract-time-identifier curr-val) ;; result->timestamp, identifier ((timestamp ident) (if (equal? ident identifier) (cons #t timestamp) ;; this *is* my lock (cons #f timestamp))) ;; nope, not my lock (else (cons #f #f))) ;; nope, not my lock (let ((curr-sec (current-seconds)) (lock-value (if identifier (conc (current-seconds)"+"identifier) (current-seconds)))) (sqlite3:execute db "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-value) (cons #t curr-sec)))) (exn (io-error) (dbfile:print-err "ERROR: i/o error with no-sync db. Check permissions, disk space etc. and try again.")) (exn (corrupt) (dbfile:print-err "ERROR: database no-sync db is corrupt. Repair it to proceed.")) (exn (busy) (dbfile:print-err "ERROR: database no-sync db is locked. Try copying to another location, remove original and copy back.")) (exn (permission)(dbfile:print-err "ERROR: database no-sync db has some permissions problem.")) (exn () ;; (status done) ;; I don't know how to detect status done but no data! (dbfile:print-err "ERROR: Unknown error with database no-sync db message: exn="(condition->list exn)", \n" ((condition-property-accessor 'exn 'message) exn)) |
︙ | ︙ |
Modified dbmod.scm from [fa16c38514] to [a52cf6d3b4].
︙ | ︙ | |||
95 96 97 98 99 100 101 | (dbdat (dbmod:open-db dbstruct run-id (dbfile:db-init-proc))) (dbh (dbr:dbdat-dbh dbdat)) ;; this will be the inmem handle (dbfile (dbr:dbdat-dbfile dbdat))) ;; if nfs mode do a sync if delta > 2 (let* ((last-update (dbr:dbstruct-last-update dbstruct)) (sync-proc (dbr:dbstruct-sync-proc dbstruct)) (curr-secs (current-seconds))) | | | 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | (dbdat (dbmod:open-db dbstruct run-id (dbfile:db-init-proc))) (dbh (dbr:dbdat-dbh dbdat)) ;; this will be the inmem handle (dbfile (dbr:dbdat-dbfile dbdat))) ;; if nfs mode do a sync if delta > 2 (let* ((last-update (dbr:dbstruct-last-update dbstruct)) (sync-proc (dbr:dbstruct-sync-proc dbstruct)) (curr-secs (current-seconds))) (if (> (- curr-secs last-update) 5) (begin (sync-proc last-update) ;; MOVE THIS CALL TO INSIDE THE sync-proc CALL (dbr:dbstruct-last-update-set! dbstruct curr-secs) ))) (assert (sqlite3:database? dbh) "FATAL: bad db handle in dbmod:with-db") |
︙ | ︙ | |||
176 177 178 179 180 181 182 | (syncdir 'todisk)) (let* ((dbstruct (or dbstruct-in (make-dbr:dbstruct areapath: areapath))) (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))) (dbpath (dbmod:get-dbdir dbstruct)) ;; directory where all the .db files are kept (dbfullname (conc dbpath"/"dbfname)) ;; (dbmod:run-id->full-dbfname dbstruct run-id)) (dbexists (file-exists? dbfullname)) (tmpdir (conc "/tmp/"(current-user-name))) | | > | 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 | (syncdir 'todisk)) (let* ((dbstruct (or dbstruct-in (make-dbr:dbstruct areapath: areapath))) (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))) (dbpath (dbmod:get-dbdir dbstruct)) ;; directory where all the .db files are kept (dbfullname (conc dbpath"/"dbfname)) ;; (dbmod:run-id->full-dbfname dbstruct run-id)) (dbexists (file-exists? dbfullname)) (tmpdir (conc "/tmp/"(current-user-name))) (tmpdb (let* ((fname (conc tmpdir"/" (string-translate areapath "/" ".") ;; "-"(current-process-id) "-"dbfname))) (if (not (file-exists? tmpdir))(create-directory tmpdir)) ;; check if tmpdb already exists, either delete it or ;; add something to the name fname)) (inmem (dbmod:open-inmem-db init-proc ;; (if (eq? (dbfile:cache-method) 'inmem) ;; #f |
︙ | ︙ | |||
199 200 201 202 203 204 205 206 207 208 209 210 | (begin (debug:print 0 *default-log-port* "ERROR: Failed to properly open "dbfname-in", exiting immediately.") (exit))) ;; (assert (sqlite3:database? inmem) "FATAL: open-dbmoddb: inmem is not a db") ;; (assert (sqlite3:database? db) "FATAL: open-dbmoddb: db is not a db") (dbr:dbstruct-inmem-set! dbstruct inmem) (dbr:dbstruct-ondiskdb-set! dbstruct db) (dbr:dbstruct-dbfile-set! dbstruct dbfullname) (dbr:dbstruct-dbfname-set! dbstruct dbfname) (dbr:dbstruct-sync-proc-set! dbstruct (lambda (last-update) (if *sync-in-progress* (debug:print 3 *default-log-port* "WARNING: overlapping calls to sync to disk") | > | | | | | | > | 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 | (begin (debug:print 0 *default-log-port* "ERROR: Failed to properly open "dbfname-in", exiting immediately.") (exit))) ;; (assert (sqlite3:database? inmem) "FATAL: open-dbmoddb: inmem is not a db") ;; (assert (sqlite3:database? db) "FATAL: open-dbmoddb: db is not a db") (dbr:dbstruct-inmem-set! dbstruct inmem) (dbr:dbstruct-ondiskdb-set! dbstruct db) (dbr:dbstruct-dbfile-set! dbstruct dbfullname) (dbr:dbstruct-dbtmpname-set! dbstruct tmpdb) (dbr:dbstruct-dbfname-set! dbstruct dbfname) (dbr:dbstruct-sync-proc-set! dbstruct (lambda (last-update) (if *sync-in-progress* (debug:print 3 *default-log-port* "WARNING: overlapping calls to sync to disk") (let* ((sync-cmd (conc "megatest -db2db -from "tmpdb" -to "dbfullname"&"))) ;; (mutex-lock! *db-with-db-mutex*) ;; this mutex is used when overloaded or during a query that modifies the db (set! *sync-in-progress* #t) ;; (if (eq? (dbfile:cache-method) 'inmem) ;; (dbmod:sync-gasket tables last-update inmem db ;; dbfullname syncdir keys) (thread-start! (make-thread (lambda () (debug:print-info "Running "sync-cmd) (system sync-cmd) ;; (mutex-unlock! *db-with-db-mutex*) ;; (thread-sleep! 0.5) ;; ensure at least 1/2 second down time between sync calls (set! *sync-in-progress* #f)))) )))) ;; (dbmod:sync-tables tables #f db inmem) ;; (if db (dbmod:sync-gasket tables #f inmem db dbfullname 'fromdest keys) ;; ) ;; load into inmem (dbr:dbstruct-last-update-set! dbstruct (current-seconds)) ;; should this be offset back in time by one second? dbstruct)) ;; (if (eq? syncdir 'todisk) ;; sync to disk normally, sync from in dashboard |
︙ | ︙ |
Modified megatest.scm from [e2f14e189c] to [612430ed4b].
︙ | ︙ | |||
2571 2572 2573 2574 2575 2576 2577 | ;; (if (args:get-arg "-db2db") (let* ((duh (launch:setup)) (src-db (args:get-arg "-from")) (dest-db (args:get-arg "-to")) (sync-period (args:get-arg "-period")) ;; NOT IMPLEMENTED YET (sync-timeout (args:get-arg "-timeout")) ;; NOT IMPLEMENTED YET | | > > | > > > | | | | | | | | | > | 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 | ;; (if (args:get-arg "-db2db") (let* ((duh (launch:setup)) (src-db (args:get-arg "-from")) (dest-db (args:get-arg "-to")) (sync-period (args:get-arg "-period")) ;; NOT IMPLEMENTED YET (sync-timeout (args:get-arg "-timeout")) ;; NOT IMPLEMENTED YET (lockfile (conc dest-db".sync-lock")) (keys (db:get-keys #f)) ) (if (and src-db dest-db) (if (file-exists? src-db) (if (file-exists? lockfile) (debug:print 0 *default-log-port* "Lock "lockfile" exists, skipping sync...") (begin (with-output-to-file lockfile (lambda () (print (current-process-id)))) (debug:print-info 0 *default-log-port* "Attempting to sync data from "src-db" to "dest-db"...") (if (not (file-exists? dest-db)) (begin (debug:print 0 *default-log-port* "Using copy to create "dest-db" from "src-db) (file-copy src-db dest-db)) (let ((res (dbmod:db-to-db-sync src-db dest-db 0 (dbfile:db-init-proc) keys))) (if res (debug:print-info 0 *default-log-port* "Synced " res " records from "src-db" to "dest-db) (debug:print-info 0 *default-log-port* "No sync due to permissions or other issue.")))) (delete-file* lockfile))) (debug:print 0 *default-log-port* "No sync due to unreadble or non-existant source file"src-db)) (debug:print 0 *default-log-port* "Usage for -db2db; -to and -from must be specified")) (set! *didsomething* #t))) (if (args:get-arg "-list-test-time") (let* ((toppath (launch:setup))) (task:get-test-times) |
︙ | ︙ |
Modified tcp-transportmod.scm from [3d0d8b0130] to [f16326ea6c].
︙ | ︙ | |||
487 488 489 490 491 492 493 | (areapath (tt-areapath ttdat)) (nosyncdbpath (conc areapath"/.mtdb")) (cleanup (lambda () (if (tt-cleanup-proc ttdat) ((tt-cleanup-proc ttdat))) (dbfile:with-no-sync-db nosyncdbpath (lambda (db) | > > | > > | 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 | (areapath (tt-areapath ttdat)) (nosyncdbpath (conc areapath"/.mtdb")) (cleanup (lambda () (if (tt-cleanup-proc ttdat) ((tt-cleanup-proc ttdat))) (dbfile:with-no-sync-db nosyncdbpath (lambda (db) (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct))) (debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname) (db:no-sync-del! db dbfname) #;(if dbtmpname (delete-file dbtmpname)))))))) (set! *server-info* ttdat) (let loop ((count 0)) (if (> count 240) (begin (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.") (exit 1)) (if (not (tt-port ttdat)) ;; no connection yet |
︙ | ︙ | |||
511 512 513 514 515 516 517 | (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) (ok (cond ((null? servers) #f) ;; not ok ((equal? (list-ref (car servers) 6) ;; compare the servinfofile (tt-servinf-file ttdat)) (let* ((res (if db-locked-in #t | > | | | | | < > | > < | | 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 | (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) (ok (cond ((null? servers) #f) ;; not ok ((equal? (list-ref (car servers) 6) ;; compare the servinfofile (tt-servinf-file ttdat)) (let* ((res (if db-locked-in #t (let* ((lock-result (dbfile:with-no-sync-db nosyncdbpath (lambda (db) (db:no-sync-get-lock-with-id db dbfname ;; (tt-servinf-file ttdat) (dbr:dbstruct-dbtmpname dbstruct) )))) (success (car lock-result))) (if success (begin (tt-state-set! ttdat 'running) (debug:print 0 *default-log-port* "Got server lock for " dbfname) (set! db-locked-in #t) #t) (begin (debug:print 0 *default-log-port* "Failed to get server lock for "dbfname) #f)))))) (if (and res (common:low-noise-print 120 "top server message")) (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for " dbfname" on "(tt-host ttdat)":"(tt-port ttdat))) res)) (else (debug:print-info 0 *default-log-port* "I'm not the lead server: "servers) (let* ((leadsrv (car servers))) (match leadsrv |
︙ | ︙ | |||
562 563 564 565 566 567 568 | (debug:print 0 *default-log-port* "I'm not the server but could not ping "host":"port", will try again.") (thread-sleep! 1) ;; just because #t))))) (else ;; should never get here (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv) (assert #f "Bad server record "leadsrv)))))))) (if ok | < < | 567 568 569 570 571 572 573 574 575 576 577 578 579 580 | (debug:print 0 *default-log-port* "I'm not the server but could not ping "host":"port", will try again.") (thread-sleep! 1) ;; just because #t))))) (else ;; should never get here (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv) (assert #f "Bad server record "leadsrv)))))))) (if ok (tt-last-access-set! ttdat *db-last-access*) ;; bit silly, just use db-last-access (begin (debug:print 0 *default-log-port* "Exiting immediately") (cleanup) (exit))) (let* ((last-update (dbr:dbstruct-last-update dbstruct)) |
︙ | ︙ | |||
645 646 647 648 649 650 651 | (let* ((areapath (tt-areapath ttdat)) (servdir (tt:get-servinfo-dir areapath)) (host (tt-host ttdat)) (port (tt-port ttdat)) (servinf (conc servdir"/"host":"port"-"(current-process-id)":"dbfname)) (serv-id (tt:mk-signature areapath)) (clean-proc (lambda () | | > | 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 | (let* ((areapath (tt-areapath ttdat)) (servdir (tt:get-servinfo-dir areapath)) (host (tt-host ttdat)) (port (tt-port ttdat)) (servinf (conc servdir"/"host":"port"-"(current-process-id)":"dbfname)) (serv-id (tt:mk-signature areapath)) (clean-proc (lambda () (delete-file* servinf) ))) (assert (and host port) "FATAL: tt:create-server-registration-file called with no conn, dbfname="dbfname) (tt-cleanup-proc-set! ttdat clean-proc) (tt-servinf-file-set! ttdat servinf) (with-output-to-file servinf (lambda () (print "SERVER STARTED: "host":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)" dbfname: "dbfname))) serv-id)) |
︙ | ︙ |