Overview
Comment: | Some more tweaks and output reduction. Still get crashes due to db lock but system seems to keep going pretty well. This is with 300 tests running on one machine. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.70-nohomehost |
Files: | files | file ages | folders |
SHA1: |
4dcb84418f9518463e8fce6e7143d904 |
User & Date: | matt on 2022-11-22 08:59:16 |
Other Links: | branch diff | manifest | tags |
Context
2022-11-22
| ||
09:06 | Turn the handler for opening server info files back on since those files can disappear without warning. Closed-Leaf check-in: abb42df5ef user: matt tags: v1.70-nohomehost | |
08:59 | Some more tweaks and output reduction. Still get crashes due to db lock but system seems to keep going pretty well. This is with 300 tests running on one machine. check-in: 4dcb84418f user: matt tags: v1.70-nohomehost | |
07:49 | Keep more servers around but let them also expire quickly if not used check-in: c0ef1c5bfa user: matt tags: v1.70-nohomehost | |
Changes
Modified client.scm from [6f5dc4a64f] to [2a6738b25e].
︙ | ︙ | |||
42 43 44 45 46 47 48 | ;; Not currently used! But, I think it *should* be used!!! #;(define (client:logout serverdat) (let ((ok (and (socket? serverdat) (cdb:logout serverdat *toppath* (client:get-signature))))) ok)) | < < < < < < < < | < < < < | 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 | ;; Not currently used! But, I think it *should* be used!!! #;(define (client:logout serverdat) (let ((ok (and (socket? serverdat) (cdb:logout serverdat *toppath* (client:get-signature))))) ok)) (define (client:setup areapath #!key (remaining-tries 100) (failed-connects 0)) (client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects)) ;; Do all the connection work, look up the transport type and set up the ;; connection if required. ;; ;; There are two scenarios. ;; 1. We are a test manager and we received *transport-type* and *runremote* via cmdline ;; 2. We are a run tests, list runs or other interactive process and we must figure out |
︙ | ︙ |
Modified dbfile.scm from [e95b97d328] to [d24edd08a7].
︙ | ︙ | |||
1013 1014 1015 1016 1017 1018 1019 | (if (file-exists? jfile) (begin (dbfile:print-err "INFO: "jfile" exists, delaying to reduce database load") (thread-sleep! 0.2))) (if (and use-mutex (common:low-noise-print 120 "over-50-parallel-api-requests")) (dbfile:print-err *api-process-request-count* " parallel api requests being processed in process " | | | 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 | (if (file-exists? jfile) (begin (dbfile:print-err "INFO: "jfile" exists, delaying to reduce database load") (thread-sleep! 0.2))) (if (and use-mutex (common:low-noise-print 120 "over-50-parallel-api-requests")) (dbfile:print-err *api-process-request-count* " parallel api requests being processed in process " (current-process-id))) ;; ", throttling access")) (condition-case (begin (if use-mutex (mutex-lock! *db-with-db-mutex*)) (let ((res (apply proc dbdat db params))) ;; the actual call is here. (if use-mutex (mutex-unlock! *db-with-db-mutex*)) ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w)) (if dbdat |
︙ | ︙ |
Modified http-transport.scm from [c23198333f] to [84bebe0a7c].
︙ | ︙ | |||
498 499 500 501 502 503 504 | (if (not server-going) ;; *dbstruct-dbs* (begin (debug:print 0 *default-log-port* "SERVER: dbprep") (set! *dbstruct-dbs* (db:setup #t)) ;; run-id)) FIXME!!! (set! server-going #t) (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version))) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. (if (and no-sync-db | | > | | 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 | (if (not server-going) ;; *dbstruct-dbs* (begin (debug:print 0 *default-log-port* "SERVER: dbprep") (set! *dbstruct-dbs* (db:setup #t)) ;; run-id)) FIXME!!! (set! server-going #t) (debug:print 0 *default-log-port* "SERVER: running, megatest version: " (common:get-full-version))) ;; NOTE: the server is NOT yet marked as running in the log. We do that in the keep-running routine. (if (and no-sync-db (common:low-noise-print 10 "sync-all")) ;; cheesy way to reduce frequency of running sync :) (begin (if (common:low-noise-print 120 "sync-all-print") (debug:print 0 *default-log-port* "keep-running calling db:all-db-sync at " (time->string (seconds->local-time) "%H:%M:%S"))) (db:all-db-sync *dbstruct-dbs*)))) ;; when things go wrong we don't want to be doing the various queries too often ;; so we strive to run this stuff only every four seconds or so. (let* ((sync-time (- (current-milliseconds) start-time)) (rem-time (quotient (- 4000 sync-time) 1000))) (if (and (<= rem-time 4) |
︙ | ︙ | |||
556 557 558 559 560 561 562 | (> (+ last-access server-timeout) (current-seconds))) (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions exn | | | > | | > > > > | 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 | (> (+ last-access server-timeout) (current-seconds))) (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions exn (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on info file " servinfofile ". Are you out of space on that disk? exn=" exn) (if (and (< (- (current-seconds) server-start-time) 600) ;; run for ten minutes for experiment, 3600 thereafter (not *server-overloaded*) (file-exists? servinfofile)) (change-file-times servinfofile curr-time curr-time))) (if (or (common:low-noise-print 120 "start new server") (> *api-process-request-count* 50)) ;; if this server is kind of busy start up another (begin (debug:print-info 0 *default-log-port* "Server is busy, parallel-api-count "*api-process-request-count*", start another if possible...") (server:kind-run *toppath*) (if (> *api-process-request-count* 100) (begin (debug:print-info 0 *default-log-port* "Server is overloaded at parallel-api-count="*api-process-request-count*", removing "servinfofile) (delete-file* servinfofile))))))) (loop 0 server-state bad-sync-count (current-milliseconds))) (else (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) (http-transport:server-shutdown port))))))) (define (http-transport:server-shutdown port) (begin |
︙ | ︙ |
Modified server.scm from [69e932d5ac] to [90ee47bed7].
︙ | ︙ | |||
666 667 668 669 670 671 672 | ;; Default is 60 seconds. ;; (define (server:expiration-timeout) (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below (* 3600 (string->number tmo)) | | | 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 | ;; Default is 60 seconds. ;; (define (server:expiration-timeout) (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) (common:hms-string->seconds tmo)) ;; BUG: hms-string->seconds is broken, if given "10" returns 0. Also, it doesn't belong in this logic unless the string->number is changed below (* 3600 (string->number tmo)) 60))) (define (server:get-best-guess-address hostname) (let ((res #f)) (for-each (lambda (adr) (if (not (eq? (u8vector-ref adr 0) 127)) (set! res adr))) |
︙ | ︙ |