Changes In Branch multi-server-hack Excluding Merge-Ins
This is equivalent to a diff from 6a476e9ca7 to e7dcebe686
2017-03-24
| ||
23:35 | Merged in multi-server-hack check-in: 99a40be38a user: matt tags: v1.64 | |
22:18 | non-good commit of merge multi-server-hack into v1.64 Leaf check-in: 42e2fcd1c8 user: matt tags: private (unpublished) | |
18:22 | Fixed couple issues with the throttle by api load but still not sure it is working right. Cleaned up few other transport things. Closed-Leaf check-in: e7dcebe686 user: matt tags: multi-server-hack | |
15:55 | Default to treating a log as recent instead of ancient. This might be part of the run-away servers issue check-in: 31427e83dd user: matt tags: multi-server-hack | |
2017-03-23
| ||
16:52 | Hack for variable number of servers (default 3) check-in: e86b57ccb0 user: matt tags: multi-server-hack | |
13:56 | Added message when server is forced Closed-Leaf check-in: 6a476e9ca7 user: matt tags: runaway-servers-fix | |
13:27 | Remove -O4, fixed the force server switch. check-in: c5f5a4ad19 user: matt tags: runaway-servers-fix | |
Modified api.scm from [4067424284] to [510d0b0df2].
︙ | ︙ | |||
114 115 116 117 118 119 120 | ;; - keep it simple, only return the actual result of the call, i.e. no meta info here ;; ;; - returns #( flag result ) ;; (define (api:execute-requests dbstruct dat) (handle-exceptions exn | | < | > > | | | | | | | | | 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | ;; - keep it simple, only return the actual result of the call, i.e. no meta info here ;; ;; - returns #( flag result ) ;; (define (api:execute-requests dbstruct dat) (handle-exceptions exn (let ((call-chain (get-call-chain))) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer") (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (cond ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f (vector #f "remote must be called with a vector"))) ((> *api-process-request-count* 20) ;; 20) 'overloaded) ;; the inner vector is what gets returned. nope, don't know why. please refactor! (else (let* ((cmd-in (vector-ref dat 0)) (cmd (if (symbol? cmd-in) cmd-in (string->symbol cmd-in))) (params (vector-ref dat 1)) (start-t (current-milliseconds)) (readonly-mode (dbr:dbstruct-read-only dbstruct)) (readonly-command (member cmd api:read-only-queries)) (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) (res (if writecmd-in-readonly-mode (conc "attempt to run write command "cmd" on a read-only database") (case cmd ;;=============================================== ;; READ/WRITE QUERIES |
︙ | ︙ | |||
268 269 270 271 272 273 274 | ((sdb-qry) (apply sdb:qry params)) ((ping) (current-process-id)) ;; TESTMETA ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) ;; TASKS | | > > > | > | | | | > | | | | > | > > | 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 | ((sdb-qry) (apply sdb:qry params)) ((ping) (current-process-id)) ;; TESTMETA ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) ;; TASKS ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)) (else (debug:print 0 *default-log-port* "ERROR: bad api call " cmd) (conc "ERROR: BAD api call " cmd)))))) ;; save all stats (let ((delta-t (- (current-milliseconds) start-t))) (hash-table-set! *db-api-call-time* cmd (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if writecmd-in-readonly-mode (vector #f res) (vector #t res))))))) ;; http-server send-response ;; api:process-request ;; db:* ;; ;; NB// Runs on the server as part of the server loop ;; (define (api:process-request dbstruct $) ;; the $ is the request vars proc (set! *api-process-request-count* (+ *api-process-request-count* 1)) (let* ((cmd ($ 'cmd)) (paramsj ($ 'params)) (params (db:string->obj paramsj transport: 'http)) ;; incoming data from the POST (or is it a GET?) (resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) (success (vector-ref resdat 0)) (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?) (if (not success) (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params)) (if (> *api-process-request-count* *max-api-process-requests*) (set! *max-api-process-requests* *api-process-request-count*)) (set! *api-process-request-count* (- *api-process-request-count* 1)) ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds ;; (rmt:dat->json-str ;; (if (or (string? res) ;; (list? res) ;; (number? res) ;; (boolean? res)) ;; res ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res))))) (db:obj->string res transport: 'http))) |
Modified client.scm from [3c66569adb] to [5611deb23d].
︙ | ︙ | |||
76 77 78 79 80 81 82 | (begin (debug:print-error 0 *default-log-port* "failed to start or connect to server") (exit 1)) ;; ;; Alternatively here, we can get the list of candidate servers and work our way ;; through them searching for a good one. ;; | | | 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | (begin (debug:print-error 0 *default-log-port* "failed to start or connect to server") (exit 1)) ;; ;; Alternatively here, we can get the list of candidate servers and work our way ;; through them searching for a good one. ;; (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath)) (runremote (or area-dat *runremote*))) (if (not server-dat) ;; no server found (client:setup-http areapath remaining-tries: (- remaining-tries 1)) (let ((host (cadr server-dat)) (port (caddr server-dat))) (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) (if (and (not area-dat) |
︙ | ︙ |
Modified common.scm from [22d5399385] to [9d413de21a].
︙ | ︙ | |||
148 149 150 151 152 153 154 | (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) | | | > > | 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 | (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) (server-timeout (or (server:get-timeout) 100)) ;; default to 100 seconds (force-server #f) (ro-mode #f) (ro-mode-checked #f)) ;; flag that indicates we have checked for ro-mode ;; launching and hosts (defstruct host (reachable #f) (last-update 0) (last-used 0) (last-cpuload 1)) |
︙ | ︙ | |||
412 413 414 415 416 417 418 | (read (open-input-string (z3:decode-buffer (base64:base64-decode instr)))))) ;; dot-locking egg seems not to work, using this for now ;; if lock is older than expire-time then remove it and try again ;; to get the lock ;; (define (common:simple-file-lock fname #!key (expire-time 300)) | > > > | | | | | | | | | | | | | | | | > > | > > > | | 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 | (read (open-input-string (z3:decode-buffer (base64:base64-decode instr)))))) ;; dot-locking egg seems not to work, using this for now ;; if lock is older than expire-time then remove it and try again ;; to get the lock ;; (define (common:simple-file-lock fname #!key (expire-time 300)) (handle-exceptions exn #f ;; don't really care what went wrong right now. NOTE: I have not seen this one actually fail. (if (file-exists? fname) (if (> (- (current-seconds)(file-modification-time fname)) expire-time) (begin (delete-file* fname) (common:simple-file-lock fname expire-time: expire-time)) #f) (let ((key-string (conc (get-host-name) "-" (current-process-id)))) (with-output-to-file fname (lambda () (print key-string))) (thread-sleep! 0.25) (if (file-exists? fname) (with-input-from-file fname (lambda () (equal? key-string (read-line)))) #f))))) (define (common:simple-file-lock-and-wait fname #!key (expire-time 300)) (let ((end-time (+ expire-time (current-seconds)))) (let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time))) (if got-lock #t (if (> end-time (current-seconds)) (begin (thread-sleep! 3) (loop (common:simple-file-lock fname expire-time: expire-time))) #f))))) (define (common:simple-file-release-lock fname) (handle-exceptions exn #f ;; I don't really care why this failed (at least for now) (delete-file* fname))) ;;====================================================================== ;; S T A T E S A N D S T A T U S E S ;;====================================================================== (define *common:std-states* '((0 "ARCHIVED") |
︙ | ︙ |
Modified dashboard.scm from [20957064b0] to [267452dd0e].
︙ | ︙ | |||
2716 2717 2718 2719 2720 2721 2722 | ;; Force creation of the db in case it isn't already there. (tasks:open-db) (define (dashboard:get-youngest-run-db-mod-time dbdir) (handle-exceptions exn (begin | | | 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 | ;; Force creation of the db in case it isn't already there. (tasks:open-db) (define (dashboard:get-youngest-run-db-mod-time dbdir) (handle-exceptions exn (begin (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir) (current-seconds)) ;; something went wrong - just print an error and return current-seconds (common:max (map (lambda (filen) (file-modification-time filen)) (glob (conc dbdir "/*.db*")))))) (define (dashboard:monitor-changed? commondat tabdat) (let* ((run-update-time (current-seconds)) |
︙ | ︙ |
Modified db.scm from [df76958ce7] to [feaa389617].
︙ | ︙ | |||
2094 2095 2096 2097 2098 2099 2100 | (define (db:print-current-query-stats) ;; generate stats from *db-api-call-time* (let ((ordered-keys (sort (hash-table-keys *db-api-call-time*) (lambda (a b) (let ((sum-a (common:sum (hash-table-ref *db-api-call-time* a))) (sum-b (common:sum (hash-table-ref *db-api-call-time* b)))) | | > > | > | > | 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 | (define (db:print-current-query-stats) ;; generate stats from *db-api-call-time* (let ((ordered-keys (sort (hash-table-keys *db-api-call-time*) (lambda (a b) (let ((sum-a (common:sum (hash-table-ref *db-api-call-time* a))) (sum-b (common:sum (hash-table-ref *db-api-call-time* b)))) (> sum-a sum-b))))) (total 0)) (for-each (lambda (cmd-key) (let* ((dat (hash-table-ref *db-api-call-time* cmd-key)) (num (length dat)) (avg (if (> num 0) (/ (common:sum dat)(length dat))))) (set! total (+ total num)) (debug:print-info 0 *default-log-port* cmd-key "\tavg: " avg " max: " (common:max dat) " min: " (common:min-max < dat) " num: " (length dat)))) ordered-keys) (debug:print-info 0 *default-log-port* "TOTAL: " total " api calls since start."))) (define (db:get-all-run-ids dbstruct) (db:with-db dbstruct #f #f (lambda (db) |
︙ | ︙ |
Modified http-transport.scm from [44c2ce6eea] to [9769576fcd].
︙ | ︙ | |||
264 265 266 267 268 269 270 | (th2 (make-thread time-out "time out"))) (thread-start! th1) (thread-start! th2) (thread-join! th1) (thread-terminate! th2) (debug:print-info 11 *default-log-port* "got res=" res) (if (vector? res) | | | | 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 | (th2 (make-thread time-out "time out"))) (thread-start! th1) (thread-start! th2) (thread-join! th1) (thread-terminate! th2) (debug:print-info 11 *default-log-port* "got res=" res) (if (vector? res) (if (vector-ref res 0) ;; this is the first flag or the second flag? res ;; this is the *inner* vector? seriously? why? (if (debug:debug-mode 11) (begin ;; note: this code also called in nmsg-transport - consider consolidating it (debug:print-error 11 *default-log-port* "error occured at server, info=" (vector-ref res 2)) (debug:print 11 *default-log-port* " client call chain:") (print-call-chain (current-error-port)) (debug:print 11 *default-log-port* " server call chain:") (pp (vector-ref res 1) (current-error-port)) |
︙ | ︙ | |||
416 417 418 419 420 421 422 | (set! last-access *db-last-access*) (mutex-unlock! *heartbeat-mutex*) (if (common:low-noise-print 120 (conc "server running on " iface ":" port)) (begin (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) (flush-output *default-log-port*))) | | > > > | 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 | (set! last-access *db-last-access*) (mutex-unlock! *heartbeat-mutex*) (if (common:low-noise-print 120 (conc "server running on " iface ":" port)) (begin (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600)) (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond |
︙ | ︙ |
Modified launch.scm from [c20e1b3984] to [7ecb4d1b9c].
︙ | ︙ | |||
469 470 471 472 473 474 475 | (start-res (http-transport:client-connect host port)) (ping-res (rmt:login-no-auto-client-setup start-res))) (if (and start-res ping-res) (let ((url (http-transport:server-dat-make-url start-res))) (remote-conndat-set! *runremote* start-res) (remote-server-url-set! *runremote* url) | > | > > > > | 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 | (start-res (http-transport:client-connect host port)) (ping-res (rmt:login-no-auto-client-setup start-res))) (if (and start-res ping-res) (let ((url (http-transport:server-dat-make-url start-res))) (remote-conndat-set! *runremote* start-res) (remote-server-url-set! *runremote* url) (if (server:ping url) (debug:print-info 0 *default-log-port* "connected to " url " using CMDINFO data.") (begin (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " url) (remote-conndat-set! *runremote* #f) (remote-server-url-set! *runremote* #f)))) (debug:print-info 0 *default-log-port* "received " host ":" port " for url but could not connect.") ))))))) ;; NFS might not have propagated the directory meta data to the run host - give it time if needed (let loop ((count 0)) (if (or (file-exists? top-path) (> count 10)) (change-directory top-path) |
︙ | ︙ | |||
778 779 780 781 782 783 784 | (target (common:args-get-target)) (linktree (common:get-linktree)) (sections (if target (list "default" target) #f)) ;; for runconfigs (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config (rundir (if (and runname target linktree)(conc linktree "/" target "/" runname) #f)) (mtcachef (and rundir (conc rundir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) (rccachef (and rundir (conc rundir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) | | | 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 | (target (common:args-get-target)) (linktree (common:get-linktree)) (sections (if target (list "default" target) #f)) ;; for runconfigs (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config (rundir (if (and runname target linktree)(conc linktree "/" target "/" runname) #f)) (mtcachef (and rundir (conc rundir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) (rccachef (and rundir (conc rundir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) (cancreate (and rundir (common:file-exists? rundir)(file-write-access? rundir))) (cxt (hash-table-ref/default *contexts* toppath #f))) ;; create our cxt for this area if it doesn't already exist (if (not cxt)(hash-table-set! *contexts* toppath (make-cxt))) ;; (print "runname: " runname " target: " target " mtcachef: " mtcachef " rccachef: " rccachef) (set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource |
︙ | ︙ |
Modified rmt.scm from [1adf35b1f4] to [c559126227].
︙ | ︙ | |||
54 55 56 57 58 59 60 | ;; do all the prep locked under the rmt-mutex (mutex-lock! *rmt-mutex*) ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; | | | > > > > > | | | > > > > > > | 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | ;; do all the prep locked under the rmt-mutex (mutex-lock! *rmt-mutex*) ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas (runremote (or area-dat *runremote*)) (readonly-mode (if (and runremote (remote-ro-mode-checked runremote)) (remote-ro-mode runremote) (let* ((dbfile (conc *toppath* "/megatest.db")) (ro-mode (not (file-write-access? dbfile)))) ;; TODO: use dbstruct or runremote to figure this out in future (if runremote (begin (remote-ro-mode-set! runremote ro-mode) (remote-ro-mode-checked-set! runremote #t) ro-mode) ro-mode))))) ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) (cond ;; give up if more than 15 attempts ((> attemptnum 15) (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.") (exit 1)) |
︙ | ︙ | |||
92 93 94 95 96 97 98 | (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 30)))) ;; add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") (remote-conndat-set! runremote #f) (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a record for our connection for given area | | | 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 | (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 30)))) ;; add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") (remote-conndat-set! runremote #f) (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a record for our connection for given area ((not runremote) ;; can remove this one. should never get here. (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a homehost record ((not (pair? (remote-hh-dat runremote))) ;; not on homehost (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little |
︙ | ︙ | |||
154 155 156 157 158 159 160 | ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one (not (remote-conndat runremote))) (and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost (not (remote-conndat runremote)))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) (mutex-unlock! *rmt-mutex*) | > | | 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one (not (remote-conndat runremote))) (and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost (not (remote-conndat runremote)))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) (mutex-unlock! *rmt-mutex*) (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up? (server:start-and-wait *toppath*)) (remote-force-server-set! runremote (common:force-server?)) (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as ;; all set up if get this far, dispatch the query ((and (not (remote-force-server runremote)) (cdr (remote-hh-dat runremote))) ;; we are on homehost (mutex-unlock! *rmt-mutex*) |
︙ | ︙ | |||
183 184 185 186 187 188 189 | (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported") (exit)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time ;; (mutex-unlock! *rmt-mutex*) | | < < < | > | | > | < > | > | | | | 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 | (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported") (exit)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time ;; (mutex-unlock! *rmt-mutex*) (debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote) (mutex-unlock! *rmt-mutex*) (if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end (if (and (symbol? res) (eq? res 'overloaded)) (let ((wait-delay (+ attemptnum (* attemptnum 10)))) (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") (thread-sleep! wait-delay) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) res) ;; All good, return res (begin (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) (remote-conndat-set! runremote #f) (remote-server-url-set! runremote #f) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") (if (not (server:check-if-running *toppath*)) (server:start-and-wait *toppath*)) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))) ;; (define (rmt:update-db-stats run-id rawcmd params duration) ;; (mutex-lock! *db-stats-mutex*) ;; (handle-exceptions ;; exn ;; (begin |
︙ | ︙ |
Modified server.scm from [ba8be5ee9a] to [30915f5112].
︙ | ︙ | |||
138 139 140 141 142 143 144 | (thread-join! log-rotate) (pop-directory))) ;; given a path to a server log return: host port startseconds ;; (define (server:logf-get-start-info logf) (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs | > > > | | | | | | | | | | | | | | | | < > | | < > | | 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 | (thread-join! log-rotate) (pop-directory))) ;; given a path to a server log return: host port startseconds ;; (define (server:logf-get-start-info logf) (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs (handle-exceptions exn (list #f #f #f) ;; no idea what went wrong, call it a bad server (with-input-from-file logf (lambda () (let loop ((inl (read-line)) (lnum 0)) (if (not (eof-object? inl)) (let ((mlst (string-match rx inl))) (if (not mlst) (if (< lnum 500) ;; give up if more than 500 lines of server log read (loop (read-line)(+ lnum 1)) (list #f #f #f)) (let ((dat (cdr mlst))) (list (car dat) ;; host (string->number (cadr dat)) ;; port (string->number (caddr dat)))))) (list #f #f #f)))))))) ;; get a list of servers with all relevant data ;; ( mod-time host port start-time pid ) ;; (define (server:get-list areapath #!key (limit #f)) (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) (day-seconds (* 24 60 60))) ;; if the directory exists continue to get the list ;; otherwise attempt to create the logs dir and then ;; continue (if (if (directory-exists? (conc areapath "/logs")) '() (if (file-write-access? areapath) (begin (condition-case (create-directory (conc areapath "/logs") #t) (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list."))) (directory-exists? (conc areapath "/logs"))) '())) (let* ((server-logs (glob (conc areapath "/logs/server-*.log"))) (num-serv-logs (length server-logs))) (if (null? server-logs) '() (let loop ((hed (car server-logs)) (tal (cdr server-logs)) (res '())) (let* ((mod-time (handle-exceptions exn (current-seconds) ;; 0 (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted (down-time (- (current-seconds) mod-time)) (serv-dat (if (or (< num-serv-logs 10) (< down-time day-seconds)) (server:logf-get-start-info hed) '())) ;; don't waste time processing server files not touched in the past day if there are more than ten servers to look at (serv-rec (cons mod-time serv-dat)) (fmatch (string-match fname-rx hed)) |
︙ | ︙ | |||
216 217 218 219 220 221 222 | ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off ;; and servers should stick around for about two hours or so. ;; (define (server:get-best srvlst) (let ((now (current-seconds))) (sort (filter (lambda (rec) | > > | | | | | | | | | | | > > > > > > > > > > > | 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 | ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off ;; and servers should stick around for about two hours or so. ;; (define (server:get-best srvlst) (let ((now (current-seconds))) (sort (filter (lambda (rec) (if (and (list? rec) (> (length rec) 2)) (let ((start-time (list-ref rec 3)) (mod-time (list-ref rec 0))) ;; (print "start-time: " start-time " mod-time: " mod-time) (and start-time mod-time (> (- now start-time) 0) ;; been running at least 0 seconds (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds (< (- now start-time) (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) 180) (random 360))) ;; under one hour running time +/- 180 )) #f)) srvlst) (lambda (a b) (< (list-ref a 3) (list-ref b 3)))))) (define (server:get-first-best areapath) (let ((srvrs (server:get-best (server:get-list areapath)))) (if (and srvrs (not (null? srvrs))) (car srvrs) #f))) (define (server:get-rand-best areapath) (let ((srvrs (server:get-best (server:get-list areapath)))) (if (and (list? srvrs) (not (null? srvrs))) (let* ((len (length srvrs)) (idx (random len))) (list-ref srvrs idx)) #f))) (define (server:record->url servr) (match-let (((mod-time host port start-time pid) servr)) (if (and host port) (conc host ":" port) #f))) |
︙ | ︙ | |||
290 291 292 293 294 295 296 | (thread-sleep! 5) (loop (server:check-if-running areapath))))))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; | | > > | > > | > > > | 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 | (thread-sleep! 5) (loop (server:check-if-running areapath))))))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath #!key (numservers "2")) (let* ((ns (string->number (or (configf:lookup *configdat* "server" "numservers") numservers))) (servers (server:get-best (server:get-list areapath)))) ;; (print "servers: " servers " ns: " ns) (if (or (and servers (null? servers)) (not servers) (and (list? servers) (< (length servers) (random ns)))) ;; somewhere between 0 and numservers #f (let loop ((hed (car servers)) (tal (cdr servers))) (let ((res (server:check-server hed))) (if res res (if (null? tal) |
︙ | ︙ |
Modified tests.scm from [bccd138868] to [897e386321].
︙ | ︙ | |||
484 485 486 487 488 489 490 | (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename)) (common:simple-file-release-lock lockf) (change-directory orig-dir) ;; NB// tests:test-set-toplog! is remote internal... (tests:test-set-toplog! run-id test-name outputfilename)) ;; didn't get the lock, check to see if current update started later than this ;; update, if so we can exit without doing any work | > > > | | 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 | (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename)) (common:simple-file-release-lock lockf) (change-directory orig-dir) ;; NB// tests:test-set-toplog! is remote internal... (tests:test-set-toplog! run-id test-name outputfilename)) ;; didn't get the lock, check to see if current update started later than this ;; update, if so we can exit without doing any work (if (> my-start-time (handle-exceptions exn 0 (file-modification-time lockf))) ;; we started since current re-gen in flight, delay a little and try again (begin (debug:print-info 1 *default-log-port* "Waiting to update " outputfilename ", another test currently updating it") (thread-sleep! (+ 5 (random 5))) ;; delay between 5 and 10 seconds (loop (common:simple-file-lock lockf)))))))))) (define (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename) |
︙ | ︙ |