Overview
Comment: | Corrected issue with server overload handling. Switch to using a less burdensome server ping instead of server:check-if-running |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.64 |
Files: | files | file ages | folders |
SHA1: |
fd6bcc8b75450d820420f86aa3bd4122 |
User & Date: | matt on 2017-06-01 00:21:31 |
Other Links: | branch diff | manifest | tags |
Context
2017-06-01
| ||
22:08 | Fixed sync to megatest.db, it was not respecting last_update and thus syncing everything. Added hard storage of last update in a no-sync db on /tmp check-in: b665288f56 user: matt tags: v1.64 | |
00:21 | Corrected issue with server overload handling. Switch to using a less burdensome server ping instead of server:check-if-running check-in: fd6bcc8b75 user: matt tags: v1.64 | |
2017-05-31
| ||
23:05 | Improved messages in couple noisy cases. Add flag for turning off touching of server logs when max api calls situation occurs check-in: bc2a1779bc user: matt tags: v1.64 | |
Changes
Modified http-transport.scm from [2c955425b7] to [e6367359e1].
︙ | ︙ | |||
293 294 295 296 297 298 299 | (define (http-transport:close-connections #!key (area-dat #f)) (let* ((runremote (or area-dat *runremote*)) (server-dat (if runremote (remote-conndat runremote) #f))) ;; (hash-table-ref/default *runremote* run-id #f))) (if (vector? server-dat) (let ((api-dat (http-transport:server-dat-get-api-uri server-dat))) | > > > > > | | | 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 | (define (http-transport:close-connections #!key (area-dat #f)) (let* ((runremote (or area-dat *runremote*)) (server-dat (if runremote (remote-conndat runremote) #f))) ;; (hash-table-ref/default *runremote* run-id #f))) (if (vector? server-dat) (let ((api-dat (http-transport:server-dat-get-api-uri server-dat))) (handle-exceptions exn (begin (print-call-chain *default-log-port*) (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn))) (close-connection! api-dat) #t)) #f))) (define (make-http-transport:server-dat)(make-vector 6)) (define (http-transport:server-dat-get-iface vec) (vector-ref vec 0)) (define (http-transport:server-dat-get-port vec) (vector-ref vec 1)) (define (http-transport:server-dat-get-api-uri vec) (vector-ref vec 2)) |
︙ | ︙ | |||
432 433 434 435 436 437 438 | (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond ((and *server-run* | < > | | 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 | (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond ((and *server-run* (> (+ last-access server-timeout) (current-seconds)) (< (- (current-seconds) server-start-time) 3600)) ;; do not update log or touch log if we've been running for more than one hour. (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions exn (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on log file " server-log-file ". Are you out of space on that disk?") (if (not *server-overloaded*) (change-file-times server-log-file curr-time curr-time))))) (loop 0 server-state bad-sync-count (current-milliseconds))) (else (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) (http-transport:server-shutdown port))))))) (define (http-transport:server-shutdown port) (begin |
︙ | ︙ |
Modified megatest-version.scm from [132e4528f7] to [397d84c9d5].
1 2 3 4 5 | ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. (declare (unit megatest-version)) | | | 1 2 3 4 5 6 7 | ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. (declare (unit megatest-version)) (define megatest-version 1.6417) |
Modified rmt.scm from [f39fa419cf] to [7ffa1b548f].
︙ | ︙ | |||
131 132 133 134 135 136 137 | ;;DOT CASE4 -> "rmt:send-receive"; ;; reset the connection if it has been unused too long ((and runremote (remote-conndat runremote) (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 10)))) ;; Subtract or add the random value? Seems like it should be substract but Neither fixes the "WARNING: failure in with-input-from-request to #<request>.\n message: Server closed connection before sending response" (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") | < < < < < < | | 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | ;;DOT CASE4 -> "rmt:send-receive"; ;; reset the connection if it has been unused too long ((and runremote (remote-conndat runremote) (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 10)))) ;; Subtract or add the random value? Seems like it should be substract but Neither fixes the "WARNING: failure in with-input-from-request to #<request>.\n message: Server closed connection before sending response" (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") (http-transport:close-connections area-dat: runremote) (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;;DOT CASE5 [label="local\nread"]; ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; ;;DOT CASE5 -> "rmt:open-qry-close-locally"; |
︙ | ︙ | |||
254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 | (if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end (if (and (vector? res) (eq? (vector-length res) 2) (eq? (vector-ref res 1) 'overloaded)) ;; since we are looking at the data to carry the error we'll use a fairly obtuse combo to minimise the chances of some sort of collision. (let ((wait-delay (+ attemptnum (* attemptnum 10)))) (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") (mutex-lock! *rmt-mutex*) (set! *runremote* #f) ;; force starting over (mutex-unlock! *rmt-mutex*) (thread-sleep! wait-delay) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) res) ;; All good, return res (begin (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) (remote-conndat-set! runremote #f) (remote-server-url-set! runremote #f) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") | > > | | | 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 | (if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end (if (and (vector? res) (eq? (vector-length res) 2) (eq? (vector-ref res 1) 'overloaded)) ;; since we are looking at the data to carry the error we'll use a fairly obtuse combo to minimise the chances of some sort of collision. (let ((wait-delay (+ attemptnum (* attemptnum 10)))) (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") (mutex-lock! *rmt-mutex*) (http-transport:close-connections area-dat: runremote) (set! *runremote* #f) ;; force starting over (mutex-unlock! *rmt-mutex*) (thread-sleep! wait-delay) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) res) ;; All good, return res (begin (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) (remote-conndat-set! runremote #f) (http-transport:close-connections area-dat: runremote) (remote-server-url-set! runremote #f) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") ;; (if (not (server:check-if-running *toppath*)) ;; (server:start-and-wait *toppath*)) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))) ;;DOT } ;; (define (rmt:update-db-stats run-id rawcmd params duration) ;; (mutex-lock! *db-stats-mutex*) ;; (handle-exceptions |
︙ | ︙ |
Modified runs.scm from [180cc9d9e4] to [50559e5a2a].
︙ | ︙ | |||
1267 1268 1269 1270 1271 1272 1273 | itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) ;; every 15 minutes verify the server is there for this run (if (and (common:low-noise-print 240 "try start server" run-id) | > > > | | 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 | itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) ;; every 15 minutes verify the server is there for this run (if (and (common:low-noise-print 240 "try start server" run-id) (not (or (and *runremote* (remote-server-url *runremote*) (server:ping (remote-server-url *runremote*))) (server:check-if-running *toppath*)))) (server:kind-run *toppath*)) (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000))) (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1))) |
︙ | ︙ |
Modified tasks.scm from [7b85a80157] to [5e421a88af].
︙ | ︙ | |||
193 194 195 196 197 198 199 | (setenv "TARGETHOST_LOGF" (conc logdir "server-kills.log")) (system (conc "nbfake kill "kill-switch" "pid)) (when logfile (thread-sleep! 0.5) (if (file-exists? gzfile) (delete-file gzfile)) | | | 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | (setenv "TARGETHOST_LOGF" (conc logdir "server-kills.log")) (system (conc "nbfake kill "kill-switch" "pid)) (when logfile (thread-sleep! 0.5) (if (file-exists? gzfile) (delete-file gzfile)) (system (conc "gzip " logfile)) (unsetenv "TARGETHOST_LOGF") (unsetenv "TARGETHOST")))) ;;====================================================================== ;; M O N I T O R S |
︙ | ︙ |