Overview
Comment: | Show connection stats every 60 seconds. Remove stat of megatest.db from rmt:send-receive, it was happening on every call. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | multi-server-hack |
Files: | files | file ages | folders |
SHA1: | 6baac6187eacc0b730baefcba631fb01 |
User & Date: | matt on 2017-03-24 13:51:20 |
Other Links: | branch diff | manifest | tags |
Context
2017-03-24
| ||
14:52 | Tell calling client to wait and try again if server is overloaded (in terms of parallel api calls over 25) check-in: fd3c06195d user: matt tags: multi-server-hack | |
13:51 | Show connection stats every 60 seconds. Remove stat of megatest.db from rmt:send-receive, it was happening on every call. check-in: 6baac6187e user: matt tags: multi-server-hack | |
11:27 | Merged v1.63 changes to multi-server-hack check-in: 8a6ca9fd18 user: matt tags: multi-server-hack | |
Changes
Modified api.scm from [4067424284] to [d0edc7e79b].
︙ | ︙ | |||
269 270 271 272 273 274 275 | ((ping) (current-process-id)) ;; TESTMETA ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) ;; TASKS ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)))))) | | | | | | > | | 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 | ((ping) (current-process-id)) ;; TESTMETA ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) ;; TASKS ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)))))) ;; save all stats (let ((delta-t (- (current-milliseconds) start-t))) (hash-table-set! *db-api-call-time* cmd (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if (not writecmd-in-readonly-mode) (vector #t res) (vector #f res))))))) ;; http-server send-response ;; api:process-request ;; db:* ;; ;; NB// Runs on the server as part of the server loop |
︙ | ︙ |
Modified common.scm from [4158ce55d8] to [9d413de21a].
︙ | ︙ | |||
148 149 150 151 152 153 154 | (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) | | | > > | 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 | (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) (server-timeout (or (server:get-timeout) 100)) ;; default to 100 seconds (force-server #f) (ro-mode #f) (ro-mode-checked #f)) ;; flag that indicates we have checked for ro-mode ;; launching and hosts (defstruct host (reachable #f) (last-update 0) (last-used 0) (last-cpuload 1)) |
︙ | ︙ |
Modified db.scm from [df76958ce7] to [feaa389617].
︙ | ︙ | |||
2094 2095 2096 2097 2098 2099 2100 | (define (db:print-current-query-stats) ;; generate stats from *db-api-call-time* (let ((ordered-keys (sort (hash-table-keys *db-api-call-time*) (lambda (a b) (let ((sum-a (common:sum (hash-table-ref *db-api-call-time* a))) (sum-b (common:sum (hash-table-ref *db-api-call-time* b)))) | | > > | > | > | 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 | (define (db:print-current-query-stats) ;; generate stats from *db-api-call-time* (let ((ordered-keys (sort (hash-table-keys *db-api-call-time*) (lambda (a b) (let ((sum-a (common:sum (hash-table-ref *db-api-call-time* a))) (sum-b (common:sum (hash-table-ref *db-api-call-time* b)))) (> sum-a sum-b))))) (total 0)) (for-each (lambda (cmd-key) (let* ((dat (hash-table-ref *db-api-call-time* cmd-key)) (num (length dat)) (avg (if (> num 0) (/ (common:sum dat)(length dat))))) (set! total (+ total num)) (debug:print-info 0 *default-log-port* cmd-key "\tavg: " avg " max: " (common:max dat) " min: " (common:min-max < dat) " num: " (length dat)))) ordered-keys) (debug:print-info 0 *default-log-port* "TOTAL: " total " api calls since start."))) (define (db:get-all-run-ids dbstruct) (db:with-db dbstruct #f #f (lambda (db) |
︙ | ︙ |
Modified http-transport.scm from [44c2ce6eea] to [9751cbc3b5].
︙ | ︙ | |||
416 417 418 419 420 421 422 | (set! last-access *db-last-access*) (mutex-unlock! *heartbeat-mutex*) (if (common:low-noise-print 120 (conc "server running on " iface ":" port)) (begin (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) (flush-output *default-log-port*))) | | > > > | 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 | (set! last-access *db-last-access*) (mutex-unlock! *heartbeat-mutex*) (if (common:low-noise-print 120 (conc "server running on " iface ":" port)) (begin (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600)) (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond |
︙ | ︙ |
Modified launch.scm from [442d86a53d] to [7ecb4d1b9c].
︙ | ︙ | |||
783 784 785 786 787 788 789 | (target (common:args-get-target)) (linktree (common:get-linktree)) (sections (if target (list "default" target) #f)) ;; for runconfigs (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config (rundir (if (and runname target linktree)(conc linktree "/" target "/" runname) #f)) (mtcachef (and rundir (conc rundir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) (rccachef (and rundir (conc rundir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) | | | 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 | (target (common:args-get-target)) (linktree (common:get-linktree)) (sections (if target (list "default" target) #f)) ;; for runconfigs (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config (rundir (if (and runname target linktree)(conc linktree "/" target "/" runname) #f)) (mtcachef (and rundir (conc rundir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) (rccachef (and rundir (conc rundir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) (cancreate (and rundir (common:file-exists? rundir)(file-write-access? rundir))) (cxt (hash-table-ref/default *contexts* toppath #f))) ;; create our cxt for this area if it doesn't already exist (if (not cxt)(hash-table-set! *contexts* toppath (make-cxt))) ;; (print "runname: " runname " target: " target " mtcachef: " mtcachef " rccachef: " rccachef) (set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource |
︙ | ︙ |
Modified rmt.scm from [1adf35b1f4] to [4b028f3c38].
︙ | ︙ | |||
54 55 56 57 58 59 60 | ;; do all the prep locked under the rmt-mutex (mutex-lock! *rmt-mutex*) ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; | | | > > > > > | | | > > > > > > | 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | ;; do all the prep locked under the rmt-mutex (mutex-lock! *rmt-mutex*) ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas (runremote (or area-dat *runremote*)) (readonly-mode (if (and runremote (remote-ro-mode-checked runremote)) (remote-ro-mode runremote) (let* ((dbfile (conc *toppath* "/megatest.db")) (ro-mode (not (file-write-access? dbfile)))) ;; TODO: use dbstruct or runremote to figure this out in future (if runremote (begin (remote-ro-mode-set! runremote ro-mode) (remote-ro-mode-checked-set! runremote #t) ro-mode) ro-mode))))) ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) (cond ;; give up if more than 15 attempts ((> attemptnum 15) (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.") (exit 1)) |
︙ | ︙ | |||
92 93 94 95 96 97 98 | (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 30)))) ;; add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") (remote-conndat-set! runremote #f) (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a record for our connection for given area | | | 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 | (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 30)))) ;; add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") (remote-conndat-set! runremote #f) (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a record for our connection for given area ((not runremote) ;; can remove this one. should never get here. (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a homehost record ((not (pair? (remote-hh-dat runremote))) ;; not on homehost (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little |
︙ | ︙ |
Modified server.scm from [a07a79fe32] to [65b45e968a].
︙ | ︙ | |||
244 245 246 247 248 249 250 | (if (and srvrs (not (null? srvrs))) (car srvrs) #f))) (define (server:get-rand-best areapath) (let ((srvrs (server:get-best (server:get-list areapath)))) | | > | 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 | (if (and srvrs (not (null? srvrs))) (car srvrs) #f))) (define (server:get-rand-best areapath) (let ((srvrs (server:get-best (server:get-list areapath)))) (if (and (list? srvrs) (not (null? srvrs))) (let* ((len (length srvrs)) (idx (random len))) (list-ref srvrs idx)) #f))) (define (server:record->url servr) |
︙ | ︙ |