Overview
Comment: | Added megatest version to server log output for future use. Use last touch time on megatest.db to decide on doing a sync, this allows concurrent servers to not fight to do the sync back. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | server-log-handshaking |
Files: | files | file ages | folders |
SHA1: |
d2c1247c440dc6aeed42603620b4d8b3 |
User & Date: | matt on 2017-01-29 15:33:50 |
Other Links: | branch diff | manifest | tags |
Context
2017-01-29
| ||
16:10 | Trim some of the server records returned from get-list check-in: 5272b8e023 user: matt tags: server-log-handshaking | |
15:33 | Added megatest version to server log output for future use. Use last touch time on megatest.db to decide on doing a sync, this allows concurrent servers to not fight to do the sync back. check-in: d2c1247c44 user: matt tags: server-log-handshaking | |
2017-01-28
| ||
22:34 | Added timestamp touch on server logs, rock solid now with only one server being sustained. check-in: 2a497f95de user: matt tags: server-log-handshaking | |
Changes
Modified common.scm from [9ceabb42c1] to [3380145d50].
︙ | ︙ | |||
570 571 572 573 574 575 576 | (define *wdnum* 0) (define *wdnum*mutex (make-mutex)) ;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp ;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server) ;; (define (common:watchdog) | < | < | > > < > > > | | > | 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 | (define *wdnum* 0) (define *wdnum*mutex (make-mutex)) ;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp ;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server) ;; (define (common:watchdog) (thread-sleep! 0.05) ;; delay for startup (let ((legacy-sync (common:run-sync?)) (debug-mode (debug:debug-mode 1)) (last-time (current-seconds)) (this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x)))) (debug:print-info 0 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num) (if (and legacy-sync (not *time-to-exit*)) (let* ((dbstruct (db:setup)) (mtdb (dbr:dbstruct-mtdb dbstruct)) (mtpath (db:dbdat-get-path mtdb))) (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") (let loop () ;; sync for filesystem local db writes ;; (mutex-lock! *db-multi-sync-mutex*) (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write (sync-in-progress *db-sync-in-progress*) (should-sync (and (not *time-to-exit*) (> (- (current-seconds) *db-last-sync*) 5))) ;; sync every five seconds minimum (start-time (current-seconds)) (mt-mod-time (file-modification-time mtpath)) (recently-synced (> (- start-time mt-mod-time) 4)) (will-sync (and (or need-sync should-sync) (not sync-in-progress) (not recently-synced)))) ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced)) ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) (if will-sync (set! *db-sync-in-progress* #t)) (mutex-unlock! *db-multi-sync-mutex*) (if will-sync (let ((res (common:sync-to-megatest.db dbstruct))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive (if (> res 0) ;; some records were transferred, keep the db alive (begin |
︙ | ︙ |
Modified db.scm from [0ef09f1d37] to [196dbb1410].
︙ | ︙ | |||
11 12 13 14 15 16 17 | ;;====================================================================== ;; Database access ;;====================================================================== ;; dbstruct vector containing all the relevant dbs like main.db, megatest.db, run.db etc | | | | 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | ;;====================================================================== ;; Database access ;;====================================================================== ;; dbstruct vector containing all the relevant dbs like main.db, megatest.db, run.db etc (use (srfi 18) extras tcp stack) (use sqlite3 srfi-1 posix regex regex-case srfi-69 csv-xml s11n md5 message-digest base64 format dot-locking z3 typed-records matchable) (import (prefix sqlite3 sqlite3:)) (import (prefix base64 base64:)) (declare (unit db)) (declare (uses common)) (declare (uses keys)) (declare (uses ods)) (declare (uses client)) (declare (uses mt)) |
︙ | ︙ | |||
2038 2039 2040 2041 2042 2043 2044 2045 2046 | (lambda (run-id) (set! run-ids (cons run-id run-ids))) db "SELECT id FROM runs WHERE state != 'deleted' ORDER BY event_time DESC;") (reverse run-ids))))) ;; get some basic run stats ;; ;; ( (runname (( state count ) ... )) | > > | > | 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 | (lambda (run-id) (set! run-ids (cons run-id run-ids))) db "SELECT id FROM runs WHERE state != 'deleted' ORDER BY event_time DESC;") (reverse run-ids))))) ;; get some basic run stats ;; ;; data structure: ;; ;; ( (runname (( state count ) ... )) ;; ( ... ;; (define (db:get-run-stats dbstruct) (let* ((totals (make-hash-table)) (curr (make-hash-table)) (res '()) (runs-info '())) ;; First get all the runname/run-ids (db:with-db |
︙ | ︙ |
Modified http-transport.scm from [05f38775ca] to [e6b844f91c].
︙ | ︙ | |||
377 378 379 380 381 382 383 | (start-time (current-milliseconds))) ;; Use this opportunity to sync the tmp db to megatest.db (if (not server-going) ;; *dbstruct-db* (begin (debug:print 0 *default-log-port* "SERVER: dbprep") (set! *dbstruct-db* (db:setup)) ;; run-id)) (set! server-going #t) | | | 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 | (start-time (current-milliseconds))) ;; Use this opportunity to sync the tmp db to megatest.db (if (not server-going) ;; *dbstruct-db* (begin (debug:print 0 *default-log-port* "SERVER: dbprep") (set! *dbstruct-db* (db:setup)) ;; run-id)) (set! server-going #t) (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version)) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. (thread-start! *watchdog*))) ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. (let* ((sync-time (- (current-milliseconds) start-time)) (rem-time (quotient (- 4000 sync-time) 1000))) (if (and (<= rem-time 4) |
︙ | ︙ | |||
423 424 425 426 427 428 429 | (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond ((and *server-run* | | | > | | 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 | (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond ((and *server-run* (> (+ last-access server-timeout) (current-seconds)) (< (- (current-seconds) server-start-time) 3600)) ;; do not update log or touch log if we've been running for more than one hour. (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (change-file-times server-log-file curr-time curr-time))) (loop 0 server-state bad-sync-count (current-milliseconds))) (else (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) (http-transport:server-shutdown port))))))) (define (http-transport:server-shutdown port) (let ((tdbdat (tasks:open-db))) ;;(BB> "http-transport:server-shutdown called") (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id)) ;; |
︙ | ︙ |