Overview
Comment: | load control is working but servers are still getting wedged over time |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.80 |
Files: | files | file ages | folders |
SHA1: |
9771b5d5a97110ffe8a2cc963916648d |
User & Date: | matt on 2023-04-10 06:09:34 |
Other Links: | branch diff | manifest | tags |
Context
2023-04-10
| ||
07:58 | fixed start up wedging check-in: ce4cc8997a user: matt tags: v1.80 | |
06:09 | load control is working but servers are still getting wedged over time check-in: 9771b5d5a9 user: matt tags: v1.80 | |
2023-04-09
| ||
22:14 | flag loaded at 50 threads. check-in: 4c1e85ecfb user: matt tags: v1.80 | |
Changes
Modified tcp-transportmod.scm from [bee5aadbcc] to [f7ad6026cc].
︙ | ︙ | |||
229 230 231 232 233 234 235 | (thread-sleep! delay-wait))))) (case status ((busy) ;; result will be how long the server wants you to delay (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is overloaded, will try again in "result" seconds.") (thread-sleep! (if (number? result) result 2)) (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) ((loaded) | | | | 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 | (thread-sleep! delay-wait))))) (case status ((busy) ;; result will be how long the server wants you to delay (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is overloaded, will try again in "result" seconds.") (thread-sleep! (if (number? result) result 2)) (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) ((loaded) (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.") (tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn)) result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) (else result))) (else ;; did not receive properly formated result (if (not res) ;; tt:handler is telling us that communication failed (let* ((host (tt-conn-host conn)) (port (tt-conn-port conn)) |
︙ | ︙ | |||
345 346 347 348 349 350 351 | (let* ((host-port (conc host":"port)) (bkoff (hash-table-ref/default *tt:backoff-smoothing* host-port #f))) (if bkoff (let* ((wait-delay (tt:backoff-wait-delay bkoff)) (last-ioerr (tt:backoff-last-ioerr bkoff)) (last-adj-t (tt:backoff-last-adj-t bkoff)) (delta (- (current-seconds) last-adj-t)) | | > | | 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 | (let* ((host-port (conc host":"port)) (bkoff (hash-table-ref/default *tt:backoff-smoothing* host-port #f))) (if bkoff (let* ((wait-delay (tt:backoff-wait-delay bkoff)) (last-ioerr (tt:backoff-last-ioerr bkoff)) (last-adj-t (tt:backoff-last-adj-t bkoff)) (delta (- (current-seconds) last-adj-t)) (adj (* delta 0.001)) ;; it takes 100 seconds to recover from hitting an io err (new-wait (if (> wait-delay 0) (if (> adj wait-delay) 0 (- wait-delay adj)) 0))) (if (> new-wait 0) (begin (if (common:low-noise-print 10 "delay wait message") (debug:print-info 0 *default-log-port* "Server loaded, DelayWait: "new-wait)) (tt:backoff-wait-delay-set! bkoff new-wait) (tt:backoff-last-adj-t-set! bkoff (current-seconds)) (thread-sleep! new-wait)) (hash-table-delete! *tt:backoff-smoothing* host-port)))))) (define (tt:send-receive-direct host port dat #!key (ping-mode #f)(tries-remaining 25)) (assert (number? port) "FATAL: tt:send-receive-direct called with port not a number "port) |
︙ | ︙ | |||
399 400 401 402 403 404 405 | (full-err-print exn "ERROR: i/o error") (tt:backoff-incr host port) #f) (exn (i/o net) (if ping-mode #f (if (>= tries-remaining 0) | | | 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 | (full-err-print exn "ERROR: i/o error") (tt:backoff-incr host port) #f) (exn (i/o net) (if ping-mode #f (if (>= tries-remaining 0) (let* ((backoff-delay (* (- 26 tries-remaining) 0.1))) (debug:print 0 *default-log-port* "WARNING: TCP overload, trying again in "backoff-delay"s.") (thread-sleep! backoff-delay) (tt:backoff-incr host port) (retry)) (assert #f "FATAL: Too many retries in tt:send-receive-direct")))) (exn () (full-err-print exn "Unhandled exception from client side.") |
︙ | ︙ |
Modified utils/load-the-db.scm from [46853b5895] to [92b9fb2b93].
1 2 3 4 5 6 | ;; start the repl and then load this file (define start-time (current-seconds)) (let loop ((last-print 0) (num-calls 0)) | | | | > > > > > > > > > > | > | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | ;; start the repl and then load this file (define start-time (current-seconds)) (let loop ((last-print 0) (num-calls 0)) (let* ((all-run-ids (rmt:get-all-run-ids)) (do-print (> (- (current-seconds) last-print) 2)) (max-query 0) (num-calls (+ num-calls 1 ;; account for call above (length all-run-ids) ;; account for the get-tests-for-run in the for-each below ))) (for-each (lambda (run-id) ;; (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode) (let* ((all-run-data (rmt:get-tests-for-run run-id "%" '() '() #f #f #f #f #f #f #f #f))) (set! num-calls (+ num-calls (length all-run-data))) (for-each (lambda (testdat) (let* ((test-id (vector-ref testdat 0)) (start-at (current-milliseconds)) (testinfo (rmt:get-test-info-by-id run-id test-id)) (query-time (- (current-milliseconds) start-at))) (if (> query-time max-query) (set! max-query query-time)))) all-run-data) (if do-print (let* ((run-time (- (current-seconds) start-time)) (qry-rate (if (> run-time 0) (inexact->exact (round (/ num-calls run-time))) -1))) (print "Running "run-time"s, run "run-id " has "(length all-run-data)" tests, max query "max-query "ms with avg query rate "qry-rate" qry/s"))))) all-run-ids) (loop (if do-print (current-seconds) last-print) num-calls))) |