Overview
Comment: | improved server stability |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.63 |
Files: | files | file ages | folders |
SHA1: |
6b5c7789cb09e64c09ec9326d70e2061 |
User & Date: | bjbarcla on 2016-12-27 17:15:48 |
Other Links: | branch diff | manifest | tags |
Context
2016-12-28
| ||
16:35 | fixed condition where watchdog continued to process many times rapidly when time-to-exit is #t check-in: 88034605c0 user: bjbarcla tags: v1.63 | |
14:46 | Manual merge to integ-home to fix a conflict check-in: 6673cb2ce0 user: mrwellan tags: integ-home | |
2016-12-27
| ||
17:15 | improved server stability check-in: 6b5c7789cb user: bjbarcla tags: v1.63 | |
2016-12-22
| ||
17:41 | corrected close-all-connections in exit proc to avoid stack dump check-in: 9f7ecc5050 user: bjbarcla tags: v1.63 | |
Changes
Modified api.scm from [19e6c44e9d] to [1debddd502].
︙ | ︙ | |||
60 61 62 63 64 65 66 67 68 69 70 71 72 73 | testmeta-get-record have-incompletes? synchash-get )) (define api:write-queries '( ;; SERVERS start-server kill-server ;; TESTS test-set-state-status-by-id delete-test-records | > > | 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 | testmeta-get-record have-incompletes? synchash-get )) (define api:write-queries '( get-keys-write ;; dummy "write" query to force server start ;; SERVERS start-server kill-server ;; TESTS test-set-state-status-by-id delete-test-records |
︙ | ︙ | |||
187 188 189 190 191 192 193 194 195 196 197 198 199 200 | ;;====================================================================== ;; READ ONLY QUERIES ;;====================================================================== ;; KEYS ((get-key-val-pairs) (apply db:get-key-val-pairs dbstruct params)) ((get-keys) (db:get-keys dbstruct)) ((get-key-vals) (apply db:get-key-vals dbstruct params)) ((get-target) (apply db:get-target dbstruct params)) ((get-targets) (db:get-targets dbstruct)) ;; ARCHIVES ((test-get-archive-block-info) (apply db:test-get-archive-block-info dbstruct params)) | > | 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 | ;;====================================================================== ;; READ ONLY QUERIES ;;====================================================================== ;; KEYS ((get-key-val-pairs) (apply db:get-key-val-pairs dbstruct params)) ((get-keys) (db:get-keys dbstruct)) ((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server ((get-key-vals) (apply db:get-key-vals dbstruct params)) ((get-target) (apply db:get-target dbstruct params)) ((get-targets) (db:get-targets dbstruct)) ;; ARCHIVES ((test-get-archive-block-info) (apply db:test-get-archive-block-info dbstruct params)) |
︙ | ︙ |
Modified common.scm from [4f9e9ae478] to [ac422f66d5].
︙ | ︙ | |||
551 552 553 554 555 556 557 | ;;;; run-ids ;; if #f use *db-local-sync* : or 'local-sync-flags ;; if #t use timestamps : or 'timestamps (define (common:sync-to-megatest.db dbstruct) (let ((start-time (current-seconds)) (res (db:multi-db-sync dbstruct 'new2old))) (let ((sync-time (- (current-seconds) start-time))) | | | | > < | | 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 | ;;;; run-ids ;; if #f use *db-local-sync* : or 'local-sync-flags ;; if #t use timestamps : or 'timestamps (define (common:sync-to-megatest.db dbstruct) (let ((start-time (current-seconds)) (res (db:multi-db-sync dbstruct 'new2old))) (let ((sync-time (- (current-seconds) start-time))) (debug:print-info 3 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id)) (if (common:low-noise-print 30 "sync new to old") (debug:print-info 0 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id)))) res)) ;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp ;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server) ;; (define (common:watchdog) (thread-sleep! 0.05) ;; delay for startup (let ((legacy-sync (common:run-sync?)) (debug-mode (debug:debug-mode 1)) (last-time (current-seconds))) (debug:print-info 0 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)) (if legacy-sync (let ((dbstruct (db:setup))) (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") (let loop () ;;(BB> "watchdog loop. pid="(current-process-id)) ;; sync for filesystem local db writes ;; (mutex-lock! *db-multi-sync-mutex*) (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write (sync-in-progress *db-sync-in-progress*) (should-sync (> (- (current-seconds) *db-last-sync*) 5)) ;; sync every five seconds minimum (will-sync (and (or need-sync should-sync) (not sync-in-progress))) (start-time (current-seconds))) ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) (if will-sync (set! *db-sync-in-progress* #t)) |
︙ | ︙ | |||
614 615 616 617 618 619 620 | (if (and (not *time-to-exit*) (< count 4)) ;; was 11, changing to 4. (begin (thread-sleep! 1) (delay-loop (+ count 1)))) (loop))) (if (common:low-noise-print 30) | | | 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 | (if (and (not *time-to-exit*) (< count 4)) ;; was 11, changing to 4. (begin (thread-sleep! 1) (delay-loop (+ count 1)))) (loop))) (if (common:low-noise-print 30) (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)))))))) (define (std-exit-procedure) (on-exit (lambda () 0)) (let ((no-hurry (if *time-to-exit* ;; hurry up #f (begin (set! *time-to-exit* #t) |
︙ | ︙ |
Modified http-transport.scm from [ec36961174] to [285317a3d3].
︙ | ︙ | |||
484 485 486 487 488 489 490 | ;; (if (and (<= rem-time 4) ;; (> rem-time 0)) ;; (thread-sleep! rem-time) ;; (thread-sleep! 4))) ;; fallback for if the math is changed ... (define (http-transport:server-shutdown server-id port) (let ((tdbdat (tasks:open-db))) | | | 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 | ;; (if (and (<= rem-time 4) ;; (> rem-time 0)) ;; (thread-sleep! rem-time) ;; (thread-sleep! 4))) ;; fallback for if the math is changed ... (define (http-transport:server-shutdown server-id port) (let ((tdbdat (tasks:open-db))) (debug:print-info 0 *default-log-port* "Starting to shutdown the server. pid="(current-process-id)) ;; ;; start_shutdown ;; (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "shutting-down") (set! *time-to-exit* #t) ;; tell on-exit to be fast as we've already cleaned up (portlogger:open-run-close portlogger:set-port port "released") (thread-sleep! 5) |
︙ | ︙ |
Modified rmt.scm from [7e5abf90aa] to [c70f311cf0].
︙ | ︙ | |||
84 85 86 87 88 89 90 91 92 93 94 95 96 97 | (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a read ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (member cmd api:read-only-queries)) ;; this is a read (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 3") (rmt:open-qry-close-locally cmd 0 params)) ;; on homehost and this is a write, we already have a server ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url *runremote*)) ;; have a server (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4") (rmt:open-qry-close-locally cmd 0 params)) | > > > > > > > > > > > > > | | | | | | > > | 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 | (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a read ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (member cmd api:read-only-queries)) ;; this is a read (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 3") (rmt:open-qry-close-locally cmd 0 params)) ;; on homehost and this is a write, we already have a server, but server has died ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url *runremote*) ;; have a server (not (server:read-dotserver *toppath*))) ;; server has died. (set! *runremote* #f) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a write, we already have a server ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url *runremote*)) ;; have a server (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4") (rmt:open-qry-close-locally cmd 0 params)) ;; commented by bb; this was blocking server passive start on write on homehost (case 5) ;; ;; on homehost and this is a write, we have a server (we know because case 4 checked) ;; ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost ;; (not (member cmd api:read-only-queries))) ;; (mutex-unlock! *rmt-mutex*) ;; (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") ;; (rmt:open-qry-close-locally cmd 0 params)) ;; no server contact made and this is a write, passively start a server ((and (not (remote-server-url *runremote*)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") (let ((serverconn (server:read-dotserver *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call (if serverconn (remote-server-url-set! *runremote* serverconn) ;; the string can be consumed by the client setup if needed |
︙ | ︙ | |||
347 348 349 350 351 352 353 354 355 356 357 358 359 360 | (rmt:send-receive 'get-key-val-pairs run-id (list run-id))) (define (rmt:get-keys) (if *db-keys* *db-keys* (let ((res (rmt:send-receive 'get-keys #f '()))) (set! *db-keys* res) res))) ;; we don't reuse run-id's (except possibly *after* a db cleanup) so it is safe ;; to cache the resuls in a hash ;; (define (rmt:get-key-vals run-id) (or (hash-table-ref/default *keyvals* run-id #f) (let ((res (rmt:send-receive 'get-key-vals #f (list run-id)))) | > > > > > | 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 | (rmt:send-receive 'get-key-val-pairs run-id (list run-id))) (define (rmt:get-keys) (if *db-keys* *db-keys* (let ((res (rmt:send-receive 'get-keys #f '()))) (set! *db-keys* res) res))) (define (rmt:get-keys-write) ;; dummy query to force server start (let ((res (rmt:send-receive 'get-keys-write #f '()))) (set! *db-keys* res) res)) ;; we don't reuse run-id's (except possibly *after* a db cleanup) so it is safe ;; to cache the resuls in a hash ;; (define (rmt:get-key-vals run-id) (or (hash-table-ref/default *keyvals* run-id #f) (let ((res (rmt:send-receive 'get-key-vals #f (list run-id)))) |
︙ | ︙ |