Overview
Comment: | Keep more servers around but let them also expire quickly if not used |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.70-nohomehost |
Files: | files | file ages | folders |
SHA1: |
c0ef1c5bfada173198aa04a0ed69acc6 |
User & Date: | matt on 2022-11-22 07:49:44 |
Other Links: | branch diff | manifest | tags |
Context
2022-11-22
| ||
08:59 | Some more tweaks and output reduction. Still get crashes due to db lock but system seems to keep going pretty well. This is with 300 tests running on one machine. check-in: 4dcb84418f user: matt tags: v1.70-nohomehost | |
07:49 | Keep more servers around but let them also expire quickly if not used check-in: c0ef1c5bfa user: matt tags: v1.70-nohomehost | |
06:36 | Remove case 6 from rmt:send-receive. This was pinging the server constantly and opening addtional connections that don't appear to being closed. Also, just let the connection die, it should retry. check-in: 08646db430 user: matt tags: v1.70-nohomehost | |
Changes
Modified http-transport.scm from [9bfccef351] to [c23198333f].
︙ | ︙ | |||
409 410 411 412 413 414 415 | ;; used and to shutdown after sometime if it is not. ;; (define (http-transport:keep-running) ;; if none running or if > 20 seconds since ;; server last used then start shutdown ;; This thread waits for the server to come alive (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server") | > | > | 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 | ;; used and to shutdown after sometime if it is not. ;; (define (http-transport:keep-running) ;; if none running or if > 20 seconds since ;; server last used then start shutdown ;; This thread waits for the server to come alive (debug:print-info 0 *default-log-port* "Starting the sync-back, keep alive thread in server") (let* ((servinfofile #f) (sdat #f) (no-sync-db (db:open-no-sync-db)) (tmp-area (common:get-db-tmp-area)) (started-file (conc tmp-area "/.server-started")) (server-start-time (current-seconds)) (server-info (let loop ((start-time (current-seconds)) (changed #t) (last-sdat "not this")) (begin ;; let ((sdat #f)) (thread-sleep! 0.01) (debug:print-info 0 *default-log-port* "Waiting for server alive signature") (mutex-lock! *heartbeat-mutex*) (set! sdat *server-info*) (mutex-unlock! *heartbeat-mutex*) (if (and sdat (not changed) (> (- (current-seconds) start-time) 2)) (let* ((servinfodir (conc *toppath*"/.servinfo")) (ipaddr (car sdat)) (port (cadr sdat)) (servinf (conc servinfodir"/"ipaddr":"port))) (set! servinfofile servinf) (if (not (file-exists? servinfodir)) (create-directory servinfodir #t)) (with-output-to-file servinf (lambda () (let* ((serv-id (server:mk-signature))) (set! *server-id* serv-id) (print "SERVER STARTED: "ipaddr":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)) |
︙ | ︙ | |||
494 495 496 497 498 499 500 | ;; Use this opportunity to sync the tmp db to megatest.db (if (not server-going) ;; *dbstruct-dbs* (begin (debug:print 0 *default-log-port* "SERVER: dbprep") (set! *dbstruct-dbs* (db:setup #t)) ;; run-id)) FIXME!!! (set! server-going #t) | | < < < < < < < | < < < < < < | 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 | ;; Use this opportunity to sync the tmp db to megatest.db (if (not server-going) ;; *dbstruct-dbs* (begin (debug:print 0 *default-log-port* "SERVER: dbprep") (set! *dbstruct-dbs* (db:setup #t)) ;; run-id)) FIXME!!! (set! server-going #t) (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version))) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. (if (and no-sync-db (common:low-noise-print 5 "sync-all")) ;; cheesy way to reduce frequency of running sync :) (begin (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S")) (db:all-db-sync *dbstruct-dbs*)))) ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. (let* ((sync-time (- (current-milliseconds) start-time)) (rem-time (quotient (- 4000 sync-time) 1000))) (if (and (<= rem-time 4) (> rem-time 0)) |
︙ | ︙ | |||
559 560 561 562 563 564 565 | (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600))) (cond | < < < < < < < < < < < < | | > > > | < | 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 | (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600))) (cond ((and *server-run* (> (+ last-access server-timeout) (current-seconds))) (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions exn (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on log file " server-log-file ". Are you out of space on that disk? exn=" exn) (if (and (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter (not *server-overloaded*)) (change-file-times servinfofile curr-time curr-time))) (if (or (common:low-noise-print 120 "start new server") (> *api-process-request-count* 50)) ;; if this server is kind of busy start up another (begin (debug:print-info 0 *default-log-port* "Server is busy, start another if possible...") (server:kind-run *toppath*))))) (loop 0 server-state bad-sync-count (current-milliseconds))) (else (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) (http-transport:server-shutdown port))))))) (define (http-transport:server-shutdown port) (begin |
︙ | ︙ |
Modified server.scm from [787d7f0596] to [69e932d5ac].
︙ | ︙ | |||
491 492 493 494 495 496 497 | ((homehost) (cons host (am-home?))) ;; shut up old code ((home?) (am-home?)) ((best-five)(names->dats (best-five))) ((all-valid)(names->dats all-valid)) ((best) (let* ((best-five (best-five)) (len (length best-five))) (hash-table-ref serversdat (list-ref best-five (random len))))) | | | 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 | ((homehost) (cons host (am-home?))) ;; shut up old code ((home?) (am-home?)) ((best-five)(names->dats (best-five))) ((all-valid)(names->dats all-valid)) ((best) (let* ((best-five (best-five)) (len (length best-five))) (hash-table-ref serversdat (list-ref best-five (random len))))) ((count)(length all-valid)) (else (debug:print 0 *default-log-port* "ERROR: invalid command "mode) #f))) (begin (server:run areapath) (thread-sleep! 3) (case mode |
︙ | ︙ | |||
514 515 516 517 518 519 520 | ;; kind start up of server, wait before allowing another server for a given ;; area to be launched ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least <server idletime> seconds old ;; (server:wait-for-server-start-last-flag areapath) | > | | 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 | ;; kind start up of server, wait before allowing another server for a given ;; area to be launched ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least <server idletime> seconds old ;; (server:wait-for-server-start-last-flag areapath) (if (< (server:choose-server areapath 'count) 10) (server:run areapath)) #;(if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ((lock-file (conc areapath "/logs/server-start.lock"))) (let* ((start-flag (conc areapath "/logs/server-start-last"))) (common:simple-file-lock-and-wait lock-file expire-time: 25) (debug:print-info 2 *default-log-port* "server:kind-run: touching " start-flag) (system (conc "touch " start-flag)) ;; lazy but safe (server:run areapath) |
︙ | ︙ |