Overview
Comment: | Some awful hacks to keep the system running. There is something causing servers to crash, I suspect sync is the problem. This work-around just constantly replaces the servers with new ones. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.70 |
Files: | files | file ages | folders |
SHA1: |
3cdcb8c1388aca6d665dcd1437084c56 |
User & Date: | matt on 2022-05-22 20:20:00 |
Other Links: | branch diff | manifest | tags |
Context
2022-05-27
| ||
19:21 | Commented out some not-used fuctions, removed the server start every 120 seconds and added dbfile handle count check-in: b1db729de1 user: matt tags: v1.70 | |
2022-05-22
| ||
20:20 | Some awful hacks to keep the system running. There is something causing servers to crash, I suspect sync is the problem. This work-around just constantly replaces the servers with new ones. check-in: 3cdcb8c138 user: matt tags: v1.70 | |
18:02 | Cleaned up some gratuitous database opens, quietened some debug messages check-in: a6be57bfc9 user: matt tags: v1.70 | |
Changes
Modified api.scm from [b65cdceb6b] to [64bd840562].
︙ | ︙ | |||
155 156 157 158 159 160 161 | (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn) (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (cond ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f (vector #f "remote must be called with a vector"))) | | | 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 | (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn) (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (cond ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f (vector #f "remote must be called with a vector"))) ((> *api-process-request-count* 200) ;; 20) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") (set! *server-overloaded* #t) (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! (else (let* ((cmd-in (vector-ref dat 0)) (cmd (if (symbol? cmd-in) cmd-in |
︙ | ︙ |
Modified db.scm from [18a5213140] to [66cbe6fa4e].
︙ | ︙ | |||
1059 1060 1061 1062 1063 1064 1065 | (define (db:lock-and-delta-sync no-sync-db dbstruct from-db-file runid) (assert (not *db-sync-in-progress*) "FATAL: db:lock-and-sync called while a sync is in progress.") (debug:print-info 3 *default-log-port* "db:lock-and-delta-sync") (let* ((lockdat (db:no-sync-get-lock no-sync-db from-db-file)) (gotlock (car lockdat)) (locktime (cdr lockdat))) | | | | 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 | (define (db:lock-and-delta-sync no-sync-db dbstruct from-db-file runid) (assert (not *db-sync-in-progress*) "FATAL: db:lock-and-sync called while a sync is in progress.") (debug:print-info 3 *default-log-port* "db:lock-and-delta-sync") (let* ((lockdat (db:no-sync-get-lock no-sync-db from-db-file)) (gotlock (car lockdat)) (locktime (cdr lockdat))) (debug:print-info 3 *default-log-port* "db:lock-and-delta-sync: got lock?") (if gotlock (begin (debug:print 0 *default-log-port* "db:lock-and-delta-sync copying db "runid" at "(current-seconds)) (db:sync-touched dbstruct runid) (db:no-sync-del! no-sync-db from-db-file) #t) (begin (debug:print 0 *default-log-port* "could not get lock for " from-db-file " from no-sync-db") #f )))) |
︙ | ︙ |
Modified http-transport.scm from [c12a4eb4f0] to [3269081060].
︙ | ︙ | |||
473 474 475 476 477 478 479 | (debug:print 0 *default-log-port* "SERVER: dbprep") (set! *dbstruct-dbs* (db:setup #t)) ;; run-id)) FIXME!!! (set! server-going #t) (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version)) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. ;; (thread-start! *watchdog*) ) | | > | 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 | (debug:print 0 *default-log-port* "SERVER: dbprep") (set! *dbstruct-dbs* (db:setup #t)) ;; run-id)) FIXME!!! (set! server-going #t) (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version)) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. ;; (thread-start! *watchdog*) ) (if (and no-sync-db (common:low-noise-print 5 "sync-all")) ;; cheesy way to reduce frequency of running sync :) (begin (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S")) (db:all-db-sync *dbstruct-dbs*) ;; (db:do-sync no-sync-db) ;; (db:run-lock-and-sync *no-sync-db*) ) ) |
︙ | ︙ | |||
528 529 530 531 532 533 534 535 536 537 538 539 540 541 | (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600))) (cond ((and *server-run* (> (+ last-access server-timeout) (current-seconds))) (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions | > > > > > > > > > > > > | 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 | (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600))) (cond ((and *server-run* (> (- (current-seconds) server-start-time) 120)) ;; let's try server replacement ;; ((adj-proc-load . 0.056875) (adj-core-load . 0.11375) (1m-load . 0.91) (5m-load . 0.77) (15m-load . 1.0) (proc . 16) (core . 8) (phys . 1)) (let* ((loaddat (common:get-normalized-cpu-load #f)) (adj-proc-load (alist-ref 'adj-proc-load loaddat)) (adj-core-load (alist-ref 'adj-core-load loaddat)) (adj-load (max adj-proc-load adj-core-load))) (if (< adj-load 2) ;; reduce chance of runaway (server:run *toppath*)) (db:all-db-sync *dbstruct-dbs*) (thread-sleep! 30) (http-transport:server-shutdown port))) ((and *server-run* (> (+ last-access server-timeout) (current-seconds))) (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions |
︙ | ︙ |