Overview
Comment: | really implement the cross-server locking this time |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65-dump-for-sync |
Files: | files | file ages | folders |
SHA1: |
f0c62a0732e6c56fb7384bbc13e779fb |
User & Date: | bjbarcla on 2019-02-08 18:49:03 |
Other Links: | branch diff | manifest | tags |
Context
2019-02-11
| ||
11:30 | made server messages such that sync handily summarized by watch "grep -h SYNC server-*.log | sort | tail -30" Closed-Leaf check-in: ef2ec4a2aa user: bjbarcla tags: v1.65-dump-for-sync | |
2019-02-08
| ||
18:49 | really implement the cross-server locking this time check-in: f0c62a0732 user: bjbarcla tags: v1.65-dump-for-sync | |
16:48 | fixes check-in: bfc2a0c173 user: bjbarcla tags: v1.65-dump-for-sync | |
Changes
Modified common.scm from [ddadf02a13] to [b6c40dc319].
︙ | ︙ | |||
355 356 357 358 359 360 361 | (ok-flag (let ((res (system cmd))) (cond ((eq? 0 res) #t) (else (set! ok-flag #f) | | | | 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 | (ok-flag (let ((res (system cmd))) (cond ((eq? 0 res) #t) (else (set! ok-flag #f) (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Command failed with exit code " (if (< res 0) res (/ res 8)) " ["cmd"]" ) #f)))) (else (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Not runnining command due to prior error. ["cmd"]") #f)))) (copy (lambda (src dest) (docmd (conc "/bin/cp '"src"' '"dest"'")))) (copy+zip (lambda (src dest) (docmd (conc "gzip -c - < '"src"' > '"dest"'")))) (fullpath (realpath filepath)) (basedir (pathname-directory fullpath)) (basefile (pathname-strip-directory fullpath)) ;;(prevfile (conc filepath ".prev.gz")) |
︙ | ︙ | |||
829 830 831 832 833 834 835 836 837 838 | ;; (and ohh srv))) ;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv) (define *wdnum* 0) (define *wdnum*mutex (make-mutex)) ;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp ;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server) ;; | > > > > > > < < | 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 | ;; (and ohh srv))) ;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv) (define *wdnum* 0) (define *wdnum*mutex (make-mutex)) (define (common:human-time) (time->string (seconds->local-time (current-seconds)) "%Y-%m-%d %H:%M:%S")) ;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp ;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server) ;; (define (common:readonly-watchdog dbstruct) (thread-sleep! 0.05) ;; delay for startup (debug:print-info 13 *default-log-port* "common:readonly-watchdog entered.") ;; sync megatest.db to /tmp/.../megatst.db (let* ((sync-cool-off-duration 3) (golden-mtdb (dbr:dbstruct-mtdb dbstruct)) (golden-mtpath (db:dbdat-get-path golden-mtdb)) |
︙ | ︙ | |||
1240 1241 1242 1243 1244 1245 1246 | ;; first look in config, then look in file .homehost, create it if not found (homehost (or (configf:lookup *configdat* "server" "homehost" ) (handle-exceptions exn (if (> trynum 0) (let ((delay-time (* (- 5 trynum) 5))) (mutex-unlock! *homehost-mutex*) | | | | 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 | ;; first look in config, then look in file .homehost, create it if not found (homehost (or (configf:lookup *configdat* "server" "homehost" ) (handle-exceptions exn (if (> trynum 0) (let ((delay-time (* (- 5 trynum) 5))) (mutex-unlock! *homehost-mutex*) (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file, delaying " delay-time " seconds and trying again, message: " ((condition-property-accessor 'exn 'message) exn)) (thread-sleep! delay-time) (common:get-homehost trynum: (- trynum 1))) (begin (mutex-unlock! *homehost-mutex*) (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Failed to read .homehost file after trying five times. Giving up and exiting, message: " ((condition-property-accessor 'exn 'message) exn)) (exit 1))) (let ((hhf (conc *toppath* "/.homehost"))) (if (common:file-exists? hhf) (with-input-from-file hhf read-line) (if (file-write-access? *toppath*) (begin (with-output-to-file hhf |
︙ | ︙ | |||
2766 2767 2768 2769 2770 2771 2772 | (if all-good (let ((cmddat (make-qitem command: command host-port: host-port params: params))) (queue-push cmddat) ;; put request into the queue (nn-send soc "queued")) ;; reply with "queued" | | | 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 | (if all-good (let ((cmddat (make-qitem command: command host-port: host-port params: params))) (queue-push cmddat) ;; put request into the queue (nn-send soc "queued")) ;; reply with "queued" (print "ERROR: ["(common:human-time)"] BAD request " dat)) (loop (nn-recv soc))))) (nn-close soc))) ;;====================================================================== ;; D A S H B O A R D U S E R V I E W S |
︙ | ︙ |
Modified server.scm from [4b55838c84] to [0dc738d239].
︙ | ︙ | |||
514 515 516 517 518 519 520 | (lockfile (conc tmp-db ".lock")) (sync-cmd (conc sqlite-exe" " tmp-db " .dump | "sqlite-exe" " staging-file "&>"sync-log)) (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 30))) (if (and (not (args:get-arg "-sync-to-megatest.db")) ;; conditions under which we do not run the sync (args:get-arg "-server")) (let loop () (thread-sleep! min-intersync-delay) | | > | > | | > > > > > | 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 | (lockfile (conc tmp-db ".lock")) (sync-cmd (conc sqlite-exe" " tmp-db " .dump | "sqlite-exe" " staging-file "&>"sync-log)) (min-intersync-delay (configf:lookup-number *configdat* "server" "minimum-intersync-delay" default: 30))) (if (and (not (args:get-arg "-sync-to-megatest.db")) ;; conditions under which we do not run the sync (args:get-arg "-server")) (let loop () (thread-sleep! min-intersync-delay) (if (common:simple-file-lock lockfile) (begin (if (not (configf:lookup *configdat* "server" "disable-db-snapshot")) (common:snapshot-file mtdbfile subdir: ".db-snapshot")) (delete-file* staging-file) (let ((start-time (current-milliseconds)) (res (system sync-cmd))) (cond ((eq? 0 res) (delete-file* (conc mtdbfile ".backup")) (system (conc "/bin/mv " staging-file " " mtdbfile)) (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] SYNC took "(/ (- (current-milliseconds) start-time))" sec") #t) (else (system (conc "/bin/cp "sync-log" "sync-log".fail")) (debug:print 0 *default-log-port* "ERROR: ["(common:human-time)"] Sync failed. See log at "sync-log".fail") (if (file-exists? (conc mtdbfile ".backup")) (system (conc "/bin/cp "mtdbfile ".backup " mtdbfile))))) (common:simple-file-release-lock lockfile))) ;; else (debug:print 1 *default-log-port* "INFO: ["(common:human-time)"] other sync in progres; not syncing.") ) ;; end if got lockfile ;; keep going unless time to exit ;; (if (not *time-to-exit*) (let delay-loop ((count 0)) ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) |
︙ | ︙ |