Overview
Comment: | Merged in side2 |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65 |
Files: | files | file ages | folders |
SHA1: |
a33d7eb0bee5fad17bd4b71ad0e16552 |
User & Date: | matt on 2020-10-17 23:06:20 |
Other Links: | branch diff | manifest | tags |
Context
2020-10-18
| ||
17:51 | switch some first-result to fold-rows on hunch there is an issue. check-in: 392230a9e9 user: matt tags: v1.65 | |
2020-10-17
| ||
23:06 | Merged in side2 check-in: a33d7eb0be user: matt tags: v1.65 | |
00:46 | Added speculative dynamically calculated delay to reduce db hammering check-in: 41eeb0e45f user: matt tags: v1.65 | |
2020-10-13
| ||
22:17 | Added delay that *may* help server starts Leaf check-in: 06179e8f43 user: mrwellan tags: v1.65-side2 | |
Changes
Modified api.scm from [4fa67bb6bd] to [cc4c2bfc8f].
︙ | ︙ | |||
152 153 154 155 156 157 158 | ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f (vector #f "remote must be called with a vector"))) ((> *api-process-request-count* 20) ;; 20) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") (set! *server-overloaded* #t) (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! (else | | | | > > | 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 | ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f (vector #f "remote must be called with a vector"))) ((> *api-process-request-count* 20) ;; 20) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") (set! *server-overloaded* #t) (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! (else (let* ((cmd-in (common:safe-vector-ref dat 0 'nocmd)) (cmd (if (symbol? cmd-in) cmd-in (string->symbol cmd-in))) (params (common:safe-vector-ref dat 1 '())) (start-t (current-milliseconds)) (readonly-mode (dbr:dbstruct-read-only dbstruct)) (readonly-command (member cmd api:read-only-queries)) (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) (foo (begin #;(common:telemetry-log (conc "api-in:"(->string cmd)) payload: `((params . ,params))) #t)) (res (if writecmd-in-readonly-mode (conc "attempt to run write command "cmd" on a read-only database") (case cmd ;;=============================================== ;; READ/WRITE QUERIES ;;=============================================== ((nocmd) '(#f "All broken!")) ((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl ;; SERVERS ((start-server) (apply server:kind-run params)) ((kill-server) (set! *server-run* #f)) |
︙ | ︙ | |||
357 358 359 360 361 362 363 | ;; save all stats (let ((delta-t (- (current-milliseconds) start-t))) (hash-table-set! *db-api-call-time* cmd (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if writecmd-in-readonly-mode (begin | | | | | | 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 | ;; save all stats (let ((delta-t (- (current-milliseconds) start-t))) (hash-table-set! *db-api-call-time* cmd (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if writecmd-in-readonly-mode (begin #;(common:telemetry-log (conc "api-out:"(->string cmd)) payload: `((params . ,params) (ok-res . #t))) (vector #f res)) (begin #;(common:telemetry-log (conc "api-out:"(->string cmd)) payload: `((params . ,params) (ok-res . #f))) (vector #t res)))))))) ;; http-server send-response ;; api:process-request ;; db:* ;; ;; NB// Runs on the server as part of the server loop ;; (define (api:process-request dbstruct $) ;; the $ is the request vars proc (set! *api-process-request-count* (+ *api-process-request-count* 1)) (let* ((cmd ($ 'cmd)) (paramsj ($ 'params)) (params (db:string->obj paramsj transport: 'http)) ;; incoming data from the POST (or is it a GET?) (resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) (success (common:safe-vector-ref resdat 0 #f)) (res (common:safe-vector-ref resdat 1 #f))) ;; (vector flag payload), get the payload, ignore the flag (why?) (if (not success) (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params)) (if (> *api-process-request-count* *max-api-process-requests*) (set! *max-api-process-requests* *api-process-request-count*)) (set! *api-process-request-count* (- *api-process-request-count* 1)) ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds ;; (rmt:dat->json-str |
︙ | ︙ |
Modified common.scm from [7199c76f8e] to [3f88276a39].
︙ | ︙ | |||
486 487 488 489 490 491 492 | ;; copy <file>.hrs.gz <file>.days.gz (when (>= (age-wks daysfile) 1) (copy daysfile wksfile) (copy hrsfile daysfile)) #t) #f)) | > > > > > > > > > | | 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 | ;; copy <file>.hrs.gz <file>.days.gz (when (>= (age-wks daysfile) 1) (copy daysfile wksfile) (copy hrsfile daysfile)) #t) #f)) (define (common:safe-vector-ref vec indx default) (if (vector? vec) (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "remote data issue: exn=" exn) default) (vector-ref vec indx)) default)) ;; Rotate logs, logic: ;; if > 500k and older than 1 week: ;; remove previous compressed log and compress this log ;; WARNING: This proc operates assuming that it is in the directory above the ;; logs directory you wish to log-rotate. ;; |
︙ | ︙ |
Modified configf.scm from [b115fef76f] to [83ecc5b24c].
︙ | ︙ | |||
779 780 781 782 783 784 785 | ht)) ;; if (define (configf:read-alist fname) (handle-exceptions exn (begin | | | 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 | ht)) ;; if (define (configf:read-alist fname) (handle-exceptions exn (begin (debug:print-info 0 *default-log-port* "unable to read alist " fname ". exn=" exn) #f) (configf:alist->config (with-input-from-file fname read)))) (define (configf:write-alist cdat fname) (if (not (common:faux-lock fname)) (debug:print 0 *default-log-port* "INFO: Could not get lock on " fname)) |
︙ | ︙ |
Modified docs/manual/megatest_manual.html from [a02a70016f] to [00d3df112f].
︙ | ︙ | |||
2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 | <div class="listingblock"> <div class="content monospaced"> <pre>[setup] # this will automatically kill the test if it runs for more than 1h 2m and 3s runtimelim 1h 2m 3s</pre> </div></div> </div> </div> </div> <div class="sect2"> <h3 id="_tests_browser_view">Tests browser view</h3> <div class="paragraph"><p>The tests browser (see the Run Control tab on the dashboard) has two views for displaying the tests.</p></div> <div class="olist arabic"><ol class="arabic"> <li> | > > > > > > > > > > > > > > > | 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 | <div class="listingblock"> <div class="content monospaced"> <pre>[setup] # this will automatically kill the test if it runs for more than 1h 2m and 3s runtimelim 1h 2m 3s</pre> </div></div> </div> <div class="sect4"> <h5 id="_post_run_hook">Post Run Hook</h5> <div class="paragraph"><p>This runs script to-run.sh after all tests have been completed. It is not necessary to use -run-wait as each test will check for other running tests on completion and if there are none it will call the post run hook.</p></div> <div class="paragraph"><p>Note that the output from the script call will be placed in a log file in the logs directory with a file name derived by replacing / with _ in post-hook-<target>-<runname>.log.</p></div> <div class="listingblock"> <div class="content monospaced"> <pre>[runs] post-hook /path/to/script/to-run.sh</pre> </div></div> </div> </div> </div> <div class="sect2"> <h3 id="_tests_browser_view">Tests browser view</h3> <div class="paragraph"><p>The tests browser (see the Run Control tab on the dashboard) has two views for displaying the tests.</p></div> <div class="olist arabic"><ol class="arabic"> <li> |
︙ | ︙ | |||
3482 3483 3484 3485 3486 3487 3488 | </div> </div> </div> <div id="footnotes"><hr></div> <div id="footer"> <div id="footer-text"> Version 1.5<br> | | | 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 | </div> </div> </div> <div id="footnotes"><hr></div> <div id="footer"> <div id="footer-text"> Version 1.5<br> Last updated 2020-10-12 20:12:01 PDT </div> </div> </body> </html> |
Modified docs/manual/reference.txt from [6aa04b6eea] to [530f6e150c].
︙ | ︙ | |||
203 204 205 206 207 208 209 210 211 212 213 214 215 216 | ----------------- [setup] # this will automatically kill the test if it runs for more than 1h 2m and 3s runtimelim 1h 2m 3s ----------------- Tests browser view ~~~~~~~~~~~~~~~~~~ The tests browser (see the Run Control tab on the dashboard) has two views for displaying the tests. . Dot (graphviz) based tree . No dot, plain listing | > > > > > > > > > > > > > > > > > | 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 | ----------------- [setup] # this will automatically kill the test if it runs for more than 1h 2m and 3s runtimelim 1h 2m 3s ----------------- Post Run Hook +++++++++++++ This runs script to-run.sh after all tests have been completed. It is not necessary to use -run-wait as each test will check for other running tests on completion and if there are none it will call the post run hook. Note that the output from the script call will be placed in a log file in the logs directory with a file name derived by replacing / with _ in post-hook-<target>-<runname>.log. ------------------- [runs] post-hook /path/to/script/to-run.sh ------------------- Tests browser view ~~~~~~~~~~~~~~~~~~ The tests browser (see the Run Control tab on the dashboard) has two views for displaying the tests. . Dot (graphviz) based tree . No dot, plain listing |
︙ | ︙ |
Modified launch.scm from [e8093b3e63] to [fd65f376ed].
︙ | ︙ | |||
769 770 771 772 773 774 775 | (item-path (vector-ref running-test 11))) (debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed") (if (not (null? tal)) (loop (car tal) (cdr tal))))))))))) (define (launch:is-test-alive host pid) (if (and host pid (not (equal? host "n/a"))) | > > | | 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 | (item-path (vector-ref running-test 11))) (debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed") (if (not (null? tal)) (loop (car tal) (cdr tal))))))))))) (define (launch:is-test-alive host pid) (if (and host pid (not (equal? host "n/a"))) (let* ((is-local (equal? host (get-host-name))) (ssh-cmd (if is-local " " (conc "ssh " host " "))) (cmd (conc ssh-cmd "pstree -A " pid)) (output (with-input-from-pipe cmd read-lines))) (debug:print 2 *default-log-port* "Running " cmd " received " output) (if (eq? (length output) 0) #f #t)) #t)) |
︙ | ︙ |
Modified rmt.scm from [342d9c7e08] to [95448ebca4].
︙ | ︙ | |||
52 53 54 55 56 57 58 59 60 61 62 63 64 65 | cinfo (if (server:check-if-running areapath) (client:setup areapath) #f)))) (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected #;(common:telemetry-log (conc "rmt:"(->string cmd)) payload: `((rid . ,rid) (params . ,params))) | > > > > > > > > > > > > > > > > > > > > > | | 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 | cinfo (if (server:check-if-running areapath) (client:setup areapath) #f)))) (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id (define *rmt-query-last-call-time* 0) (define *rmt-query-last-rest-time* 0) ;; last time there was at least a 1/2 second rest - giving other processes access to the db ;; NOTE: This query rest algorythm will not adapt to long query times. REDESIGN NEEDED. TODO. FIXME. ;; (define (rmt:query-rest) (let* ((now (current-milliseconds))) (cond ((> (- now *rmt-query-last-call-time*) 500) ;; it's been a while since last query - no need to rest (set! *rmt-query-last-rest-time* now) (set! *rmt-query-last-call-time* now)) ((> (- now *rmt-query-last-rest-time*) 5000) ;; no natural rests have happened (debug:print 0 *default-log-port* "query rest needed. blocking for 1/2 second.") (thread-sleep! 0.5) ;; force a rest of a half second (set! *rmt-query-last-rest-time* now) (set! *rmt-query-last-call-time* now)) (else ;; sufficient rests have occurred, just record the last query time (set! *rmt-query-last-call-time* now))))) ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected #;(common:telemetry-log (conc "rmt:"(->string cmd)) payload: `((rid . ,rid) (params . ,params))) (if (not (equal? (configf:lookup *configdat* "setup" "query-rest") "no")) (rmt:query-rest)) (if (> attemptnum 2) (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum)) (cond ((> attemptnum 2) (thread-sleep! 0.05)) ((> attemptnum 10) (thread-sleep! 0.5)) ((> attemptnum 20) (thread-sleep! 1))) |
︙ | ︙ | |||
367 368 369 370 371 372 373 | (vector #t '())) ;; should always get a vector but if something goes wrong return a dummy (if (and (vector? v) (> (vector-length v) 1)) (let ((newvec (vector (vector-ref v 0)(vector-ref v 1)))) newvec) ;; by copying the vector while inside the error handler we should force the detection of a corrupted record (vector #t '())))) ;; we could also check that the returned types are valid (vector #t '()))) | | | | | 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 | (vector #t '())) ;; should always get a vector but if something goes wrong return a dummy (if (and (vector? v) (> (vector-length v) 1)) (let ((newvec (vector (vector-ref v 0)(vector-ref v 1)))) newvec) ;; by copying the vector while inside the error handler we should force the detection of a corrupted record (vector #t '())))) ;; we could also check that the returned types are valid (vector #t '()))) (success (common:safe-vector-ref resdat 0 #f)) ;; (vector-ref resdat 0)) (res (common:safe-vector-ref resdat 1 #f)) ;; (vector-ref resdat 1)) (duration (- (current-milliseconds) start))) (if (and read-only qry-is-write) (debug:print 0 *default-log-port* "ERROR: attempt to write to read-only database ignored. cmd=" cmd)) (if (not success) (if (> remretries 0) (begin (debug:print-error 0 *default-log-port* "local query failed. Trying again.") (thread-sleep! (/ (random 5000) 1000)) ;; some random delay (rmt:open-qry-close-locally cmd run-id params remretries: (- remretries 1))) (begin (debug:print-error 0 *default-log-port* "too many retries in rmt:open-qry-close-locally, giving up") #f)) (begin ;; (rmt:update-db-stats run-id cmd params duration) ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it (if qry-is-write (let ((start-time (current-seconds))) (mutex-lock! *db-multi-sync-mutex*) (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client) (mutex-unlock! *db-multi-sync-mutex*))))) res)) (define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params) (let* ((run-id (if run-id run-id 0)) (res (handle-exceptions exn |
︙ | ︙ |
Modified runs.scm from [24707790d1] to [8568995d03].
︙ | ︙ | |||
1236 1237 1238 1239 1240 1241 1242 | ;; If no resources are available just kill time and loop again ;; ((not have-resources) ;; simply try again after waiting a second (if (runs:lownoise "no resources" 60) (debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ...")) ;; Have gone back and forth on this but db starvation is an issue. ;; wait one second before looking again to run jobs. | | | 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 | ;; If no resources are available just kill time and loop again ;; ((not have-resources) ;; simply try again after waiting a second (if (runs:lownoise "no resources" 60) (debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ...")) ;; Have gone back and forth on this but db starvation is an issue. ;; wait one second before looking again to run jobs. (thread-sleep! 1) ;; changed back to 1 from 0.25 ;; could have done hed tal here but doing car/cdr of newtal to rotate tests (list (car newtal)(cdr newtal) reg reruns)) ;; This is the final stage, everything is in place so launch the test ;; ((and have-resources (or (null? prereqs-not-met) |
︙ | ︙ |
Modified server.scm from [7b2af2dc7e] to [3f245d9ff9].
︙ | ︙ | |||
324 325 326 327 328 329 330 331 332 333 334 335 336 337 | ;; wait for server=start-last to be three seconds old ;; (define (server:wait-for-server-start-last-flag areapath) (let* ((start-flag (conc areapath "/logs/server-start-last")) ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) (reftime (configf:lookup-number *configdat* "server" "idletime" default: 4)) (server-key (conc (get-host-name) "-" (current-process-id)))) (if (file-exists? start-flag) (let* ((fmodtime (file-modification-time start-flag)) (delta (- (current-seconds) fmodtime)) (all-go (> delta reftime))) (if (and all-go (begin (with-output-to-file start-flag | > | 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 | ;; wait for server=start-last to be three seconds old ;; (define (server:wait-for-server-start-last-flag areapath) (let* ((start-flag (conc areapath "/logs/server-start-last")) ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) (reftime (configf:lookup-number *configdat* "server" "idletime" default: 4)) (server-key (conc (get-host-name) "-" (current-process-id)))) (thread-sleep! (/ (random 500) 1000)) (if (file-exists? start-flag) (let* ((fmodtime (file-modification-time start-flag)) (delta (- (current-seconds) fmodtime)) (all-go (> delta reftime))) (if (and all-go (begin (with-output-to-file start-flag |
︙ | ︙ |