Overview
Comment: | Chipping away at server issues |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.80-revolution |
Files: | files | file ages | folders |
SHA1: |
30862628e21f9b71e076a79a7c7d98b3 |
User & Date: | mrwellan on 2023-11-20 19:18:55 |
Other Links: | branch diff | manifest | tags |
Context
2023-11-21
| ||
15:42 | Lots of little changes check-in: fa88f0abd7 user: mrwellan tags: v1.80-revolution | |
2023-11-20
| ||
19:18 | Chipping away at server issues check-in: 30862628e2 user: mrwellan tags: v1.80-revolution | |
2023-11-17
| ||
20:16 | small tweaks - not there yet check-in: 8a3f889655 user: matt tags: v1.80-revolution | |
Changes
Modified api.scm from [f8a9578235] to [7a64d5e9a2].
︙ | ︙ | |||
33 34 35 36 37 38 39 | (import debugprint) (import tcp-transportmod) (use srfi-69 srfi-18 posix matchable | | > | 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 | (import debugprint) (import tcp-transportmod) (use srfi-69 srfi-18 posix matchable s11n typed-records) ;; allow these queries through without starting a server ;; (define api:read-only-queries '(get-key-val-pairs get-var get-keys |
︙ | ︙ | |||
150 151 152 153 154 155 156 | ;; TASKS tasks-add tasks-set-state-given-param-key )) (define *db-write-mutexes* (make-hash-table)) (define *server-signature* #f) | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | > > > > > > > > > > > > > > | < | | | | | | | | | | | | | | | | | | | > | | | | | < | | | > | > > | | | | | | | | | | | | | | | | | | | | | | | | > > > > > | | > | | | | > | | | | | | | | | > | 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 | ;; TASKS tasks-add tasks-set-state-given-param-key )) (define *db-write-mutexes* (make-hash-table)) (define *server-signature* #f) (define *api-threads* '()) (define (api:register-thread th-in) (set! *api-threads* (cons (cons th-in (current-seconds)) *api-threads*))) (define (api:unregister-thread th-in) (set! *api-threads* (filter (lambda (thdat) (not (eq? th-in (car thdat)))) *api-threads*))) (define (api:remove-dead-or-terminated) (set! *api-threads* (filter (lambda (thdat) (not (member (thread-state (car thdat)) '(terminated dead)))) *api-threads*))) (define (api:get-count-threads-alive) (length *api-threads*)) (define *api:last-stats-print* 0) (define *api-print-db-stats-mutex* (make-mutex)) (define (api:print-db-stats) (debug:print-info 0 *default-log-port* "Started periodic db stats printer") (let loop () (mutex-lock! *api-print-db-stats-mutex*) (if (> (- (current-seconds) *api:last-stats-print*) 15) (begin (rmt:print-db-stats) (set! *api:last-stats-print* (current-seconds)))) (mutex-unlock! *api-print-db-stats-mutex*) (thread-sleep! 5) (loop))) ;; indat is (cmd run-id params meta) ;; ;; WARNING: Do not print anything in the lambda of this function as it ;; reads/writes to current in/out port ;; (define (api:tcp-dispatch-request-make-handler dbstruct) ;; cmd run-id params) (assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.") (if (not *server-signature*) (set! *server-signature* (tt:mk-signature *toppath*))) (lambda (indat) (api:register-thread (current-thread)) (let* ((result (let* ((numthreads (api:get-count-threads-alive)) (delay-wait (if (> numthreads 10) (- numthreads 10) 0)) (normal-proc (lambda (cmd run-id params) (case cmd ((ping) *server-signature*) (else (api:dispatch-request dbstruct cmd run-id params)))))) (set! *api-process-request-count* numthreads) (set! *db-last-access* (current-seconds)) ;; (if (not (eq? numthreads numthreads)) ;; (begin ;; (api:remove-dead-or-terminated) ;; (let ((threads-now (api:get-count-threads-alive))) ;; (debug:print 0 *default-log-port* "WARNING: numthreads="numthreads", numthreads="numthreads", remaining="threads-now) ;; (set! numthreads threads-now)))) (match indat ((cmd run-id params meta) (let* ((start-t (current-milliseconds)) (db-ok (let* ((dbfname (dbmod:run-id->dbfname run-id)) (ok (equal? dbfname (dbr:dbstruct-dbfname dbstruct)))) (case cmd ((ping) #t) ;; we are fine (else (assert ok "FATAL: database file and run-id not aligned."))))) (ttdat *server-info*) (server-state (tt-state ttdat)) (maxthreads 10) ;; make this a parameter? (status (cond ((and (> numthreads maxthreads) (> (random 100) 70)) ;; allow a 30% probability to go through so we can figure out what is going wrong in main.db server. 'busy) ;; ((> numthreads 5) 'loaded) ;; this gets transmitted to the client which calls tt:backoff-incr to slow stuff down. (else 'ok))) (errmsg (case status ((busy) (conc "Server overloaded, "numthreads" threads in flight")) ((loaded) (conc "Server loaded, "numthreads" threads in flight")) (else #f))) (result (case status ((busy) (if (eq? cmd 'ping) (normal-proc cmd run-id params) ;; numthreads must be greater than 5 for busy (* 1 (- numthreads maxthreads)) ;; was 15 )) ;; (- numthreads 29)) ;; call back in as many seconds ((loaded) ;; (if (eq? (rmt:transport-mode) 'tcp) ;; (thread-sleep! 0.5)) (normal-proc cmd run-id params)) (else (normal-proc cmd run-id params)))) (meta (case cmd ((ping) `((sstate . ,server-state))) (else `((wait . ,delay-wait))))) (payload (list status errmsg result meta))) ;; (cmd run-id params meta) (db:add-stats cmd run-id params (- (current-milliseconds) start-t)) payload)) (else (assert #f "FATAL: failed to deserialize indat "indat)))))) ;; (set! *api-process-request-count* (- *api-process-request-count* 1)) ;; (serialize payload) (api:unregister-thread (current-thread)) result))) (define *api-halt-writes* #f) (define (api:dispatch-request dbstruct cmd run-id params) (if (not *no-sync-db*) (db:open-no-sync-db)) (let* ((start-time (current-milliseconds))) (if (member cmd api:write-queries) (let loop () (if *api-halt-writes* (begin (thread-sleep! 0.2) (if (< (- (current-milliseconds) start-time) 5000) ;; hope it don't take more than five seconds to sync (loop-time) #;(debug:print 0 *default-log-port* "ERROR: writes halted for more than 5 seconds, sync might be taking too long")))))) (db:add-stats 'api-write-blocking-for-sync run-id params (- (current-milliseconds) start-time))) (case cmd ;;=============================================== ;; READ/WRITE QUERIES ;;=============================================== ((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl |
︙ | ︙ | |||
521 522 523 524 525 526 527 | ;; TASKS ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)) (else (debug:print 0 *default-log-port* "ERROR: bad api call " cmd) (conc "ERROR: BAD api call " cmd)))) | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | 478 479 480 481 482 483 484 | ;; TASKS ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)) (else (debug:print 0 *default-log-port* "ERROR: bad api call " cmd) (conc "ERROR: BAD api call " cmd)))) |
Modified common.scm from [2e7dd411a7] to [adf24d701b].
︙ | ︙ | |||
152 153 154 155 156 157 158 | ;; (define *common:denoise* (make-hash-table)) ;; for low noise printing (define *default-log-port* (current-error-port)) (define *time-zero* (current-seconds)) ;; for the watchdog (define *on-exit-procs* '()) ;; add procs to this list to be executed on exit (define *default-area-tag* "local") ;; DATABASE | < < < < | 152 153 154 155 156 157 158 159 160 161 162 163 164 165 | ;; (define *common:denoise* (make-hash-table)) ;; for low noise printing (define *default-log-port* (current-error-port)) (define *time-zero* (current-seconds)) ;; for the watchdog (define *on-exit-procs* '()) ;; add procs to this list to be executed on exit (define *default-area-tag* "local") ;; DATABASE ;; db access (define *db-last-access* (current-seconds)) ;; last db access, used in server ;; (define *db-write-access* #t) ;; db sync ;; (define *db-last-sync* 0) ;; last time the sync to megatest.db happened (define *db-sync-in-progress* #f) ;; if there is a sync in progress do not try to start another ;; (define *db-multi-sync-mutex* (make-mutex)) ;; protect access to *db-sync-in-progress*, *db-last-sync* |
︙ | ︙ |
Modified db.scm from [b1837f1312] to [4cfe34e3d4].
︙ | ︙ | |||
3719 3720 3721 3722 3723 3724 3725 3726 3727 | (delete-duplicates (cons testname (hash-table-ref/default res tag '()))))) tags))) db "SELECT testname,tags FROM test_meta") (hash-table->alist res))))) ;; read the record given a testname (define (db:testmeta-get-record dbstruct testname) | > > > > > > > > | | | | | | | | | | | | > > | | 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 | (delete-duplicates (cons testname (hash-table-ref/default res tag '()))))) tags))) db "SELECT testname,tags FROM test_meta") (hash-table->alist res))))) ;; testmeta doesn't change, we can cache it for up too an hour (define *db:testmeta-cache* (make-hash-table)) (define *db:testmeta-last-update* 0) ;; read the record given a testname (define (db:testmeta-get-record dbstruct testname) (if (and (< (- (current-seconds) *db:testmeta-last-update*) 600) (hash-table-exists? *db:testmeta-cache* testname)) (hash-table-ref *db:testmeta-cache* testname) (let ((res #f)) (db:with-db dbstruct #f #f (lambda (dbdat db) (sqlite3:for-each-row (lambda (id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup) (set! res (vector id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup))) db "SELECT id,testname,author,owner,description,reviewed,iterated,avg_runtime,avg_disk,tags,jobgroup FROM test_meta WHERE testname=?;" testname))) (hash-table-set! *db:testmeta-cache* testname res) (set! *db:testmeta-last-update* (current-seconds)) res))) ;; create a new record for a given testname (define (db:testmeta-add-record dbstruct testname) (db:with-db dbstruct #f #t (lambda (dbdat db) (sqlite3:execute db |
︙ | ︙ | |||
4516 4517 4518 4519 4520 4521 4522 | (if (not *time-to-exit*) (loop)))) ;; ;; time to exit, close the no-sync db here ;; (db:no-sync-close-db no-sync-db stmt-cache) (if (common:low-noise-print 30) (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) )))) )) | < | 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 | (if (not *time-to-exit*) (loop)))) ;; ;; time to exit, close the no-sync db here ;; (db:no-sync-close-db no-sync-db stmt-cache) (if (common:low-noise-print 30) (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) )))) )) (define (std-exit-procedure) ;;(common:telemetry-log-close) (on-exit (lambda () 0)) ;; why is this here? ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) (let ((no-hurry (if *time-to-exit* ;; hurry up #f |
︙ | ︙ |
Modified dbmod.scm from [bc19f724a5] to [39c3e50bc5].
︙ | ︙ | |||
29 30 31 32 33 34 35 36 37 38 39 40 41 42 | (import scheme chicken data-structures extras files (prefix sqlite3 sqlite3:) matchable posix typed-records srfi-1 srfi-18 srfi-69 | > | 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 | (import scheme chicken data-structures extras files format (prefix sqlite3 sqlite3:) matchable posix typed-records srfi-1 srfi-18 srfi-69 |
︙ | ︙ | |||
825 826 827 828 829 830 831 832 833 834 835 836 837 838 | (sdb (dbmod:safely-open-db src-db init-proc #t)) (ddb (dbmod:safely-open-db dest-db init-proc d-wr)) (res (dbmod:sync-gasket tables last-update sdb ddb dest-db 'todisk keys))) (sqlite3:finalize! sdb) (sqlite3:finalize! ddb) res))) #f)) ) ;; ATTIC #;(let* ((syncer-logfile (conc areapath"/logs/"dbfname"-syncer.log")) (sync-cmd (if (eq? syncdir 'todisk) | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 | (sdb (dbmod:safely-open-db src-db init-proc #t)) (ddb (dbmod:safely-open-db dest-db init-proc d-wr)) (res (dbmod:sync-gasket tables last-update sdb ddb dest-db 'todisk keys))) (sqlite3:finalize! sdb) (sqlite3:finalize! ddb) res))) #f)) ;; ====================================================================== ;; dbstats ;;====================================================================== ;; (define *dbstruct-dbs* #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this. ;; db stats (define *db-stats* (make-hash-table)) ;; hash of vectors < count duration-total > (define *db-stats-mutex* (make-mutex)) (define (rmt:print-db-stats) (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f" (debug:print 0 *default-log-port* "DB Stats\n========") (debug:print 0 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg")) (for-each (lambda (cmd) (let* ((dat (hash-table-ref *db-stats* cmd)) (count (dbstat-cnt dat)) (tottime (dbstat-tottime dat))) (debug:print 0 *default-log-port* (format #f fmtstr cmd count tottime (/ tottime count))))) (sort (hash-table-keys *db-stats*) (lambda (a b) (> (dbstat-tottime (hash-table-ref *db-stats* a)) (dbstat-tottime (hash-table-ref *db-stats* b)))))))) (defstruct dbstat (cnt 0) (tottime 0)) (define (db:add-stats cmd run-id params delta) (let* ((modified-cmd (if (eq? cmd 'general-call) (string->symbol (conc "general-call-" (car params))) cmd)) (rec (hash-table-ref/default *db-stats* modified-cmd #f))) (if (not rec) (let ((new-rec (make-dbstat))) (hash-table-set! *db-stats* modified-cmd new-rec) (set! rec new-rec))) (dbstat-cnt-set! rec (+ (dbstat-cnt rec) 1)) (dbstat-tottime-set! rec (+ (dbstat-tottime rec) delta)))) ) ;; ATTIC #;(let* ((syncer-logfile (conc areapath"/logs/"dbfname"-syncer.log")) (sync-cmd (if (eq? syncdir 'todisk) |
︙ | ︙ |
Modified megatest.scm from [d482239d49] to [4f7c279d10].
︙ | ︙ | |||
968 969 970 971 972 973 974 975 976 977 978 979 980 981 | (tl (launch:setup)) (keys (keys:config-get-fields *configdat*))) (case (rmt:transport-mode) ((tcp) (let* ((timeout (server:expiration-timeout))) (debug:print 0 *default-log-port* "INFO: Starting server for " dbfname " using tcp method with server timeout of "timeout) (tt-server-timeout-param timeout) (if dbfname (tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys) (begin (debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -db is required.") (exit 1))))) (else (debug:print 0 *default-log-port* "ERROR: rmt:transport-mode value not recognised "(rmt:transport-mode)))) (set! *didsomething* #t))) | > | 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 | (tl (launch:setup)) (keys (keys:config-get-fields *configdat*))) (case (rmt:transport-mode) ((tcp) (let* ((timeout (server:expiration-timeout))) (debug:print 0 *default-log-port* "INFO: Starting server for " dbfname " using tcp method with server timeout of "timeout) (tt-server-timeout-param timeout) (thread-start! (make-thread api:print-db-stats "print-db-stats")) (if dbfname (tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys) (begin (debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -db is required.") (exit 1))))) (else (debug:print 0 *default-log-port* "ERROR: rmt:transport-mode value not recognised "(rmt:transport-mode)))) (set! *didsomething* #t))) |
︙ | ︙ |
Modified rmt.scm from [ca8b91d9f3] to [c3f010a183].
︙ | ︙ | |||
112 113 114 115 116 117 118 | ;; KEEP THIS HERE FOR A WHILE, WE MAY WANT TO RESURECT IT ;; (define (nfs-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe) ;; (let* ((keys (common:get-fields *configdat*)) ;; (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath tmpadj: "/dashboard"))) ;; (api:dispatch-request dbstruct cmd run-id params))) | < < < < < < < < < < < < | 112 113 114 115 116 117 118 119 120 121 122 123 124 125 | ;; KEEP THIS HERE FOR A WHILE, WE MAY WANT TO RESURECT IT ;; (define (nfs-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe) ;; (let* ((keys (common:get-fields *configdat*)) ;; (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath tmpadj: "/dashboard"))) ;; (api:dispatch-request dbstruct cmd run-id params))) (define (rmt:get-max-query-average run-id) (mutex-lock! *db-stats-mutex*) (let* ((runkey (conc "run-id=" run-id " ")) (cmds (filter (lambda (x) (substring-index runkey x)) (hash-table-keys *db-stats*))) (res (if (null? cmds) |
︙ | ︙ |
Modified runs.scm from [77337ff0b0] to [66517b0c3e].
︙ | ︙ | |||
2062 2063 2064 2065 2066 2067 2068 | ;; Here is where the test_meta table is best updated ;; Yes, another use of a global for caching. Need a better way? ;; ;; There is now a single call to runs:update-all-test_meta and this ;; per-test call is not needed. Given the delicacy of the move to ;; v1.55 this code is being left in place for the time being. ;; | | | 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 | ;; Here is where the test_meta table is best updated ;; Yes, another use of a global for caching. Need a better way? ;; ;; There is now a single call to runs:update-all-test_meta and this ;; per-test call is not needed. Given the delicacy of the move to ;; v1.55 this code is being left in place for the time being. ;; (if (not (hash-table-exists? *test-meta-updated* test-name)) (begin (hash-table-set! *test-meta-updated* test-name #t) (runs:update-test_meta test-name test-conf))) ;; itemdat => ((ripeness "overripe") (temperature "cool") (season "summer")) (let* ((new-test-path (string-intersperse (cons test-path (map cadr itemdat)) "/")) (test-id (rmt:get-test-id run-id test-name item-path)) |
︙ | ︙ |
Modified tcp-transportmod.scm from [b10f8d79d9] to [d161e3f81c].
︙ | ︙ | |||
253 254 255 256 257 258 259 | (if (and (number? delay-wait) (> delay-wait 0)) (begin (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds") (thread-sleep! delay-wait))))) (case status ((busy) ;; result will be how long the server wants you to delay | | > | | 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 | (if (and (number? delay-wait) (> delay-wait 0)) (begin (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds") (thread-sleep! delay-wait))))) (case status ((busy) ;; result will be how long the server wants you to delay (let* ((raw-dly (if (number? result) result 0.1)) (dly (* raw-dly (/ attemptnum 2)))) (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, cmd is "cmd", will try again in "dly" seconds. This is attempt "(- attemptnum 1)) (thread-sleep! dly) (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))) ((loaded) (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.") (tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn)) result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe)) (else |
︙ | ︙ |