Overview
Comment: | Changed sleep in kind-run from 2 to 18 seconds to allow server to come up to the point where get-list will find it. Cleaned up messages and comments. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65 |
Files: | files | file ages | folders |
SHA1: |
2f8513eec5ac5e7583c5213a30f23230 |
User & Date: | mmgraham on 2021-03-25 11:26:01 |
Other Links: | branch diff | manifest | tags |
Context
2021-03-25
| ||
17:35 | changed server start lock file expire-time from 15 to 25 seconds check-in: 169d94de4e user: mmgraham tags: v1.65, v1.6584 | |
11:26 | Changed sleep in kind-run from 2 to 18 seconds to allow server to come up to the point where get-list will find it. Cleaned up messages and comments. check-in: 2f8513eec5 user: mmgraham tags: v1.65 | |
2021-03-13
| ||
00:34 | restored the configf cleanup, with workaround for ext.scm check-in: 316c988564 user: mmgraham tags: v1.65 | |
Changes
Modified configf.scm from [f97d938f06] to [d727d54436].
︙ | ︙ | |||
146 147 148 149 150 151 152 | (lambda () (set! result ((eval (read)) ht)))) (set! result (conc "#{(" cmdtype ") " cmd "}")))) (case cmdsym ((system shell scheme) (let ((delta (- (current-seconds) start-time))) (if (> delta 2) | | | 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 | (lambda () (set! result ((eval (read)) ht)))) (set! result (conc "#{(" cmdtype ") " cmd "}")))) (case cmdsym ((system shell scheme) (let ((delta (- (current-seconds) start-time))) (if (> delta 2) (debug:print-info 2 *default-log-port* "for line \"" l "\"\n command: " cmd " took " delta " seconds to run with output:\n " result) (debug:print-info 9 *default-log-port* "for line \"" l "\"\n command: " cmd " took " delta " seconds to run with output:\n " result))))) (loop (conc prestr result poststr))) res)) res))) ;; Run a shell command and return the output as a string (define (shell cmd) |
︙ | ︙ |
Modified runs.scm from [2583922f1c] to [07f8912d41].
︙ | ︙ | |||
499 500 501 502 503 504 505 506 507 508 509 510 511 512 | ;; runs:run-tests is called from megatest.scm and itself ;;====================================================================== ;; ;; test-names: Comma separated patterns same as test-patts but used in selection ;; of tests to run. The item portions are not respected. ;; FIXME: error out if /patt specified ;; (define (runs:run-tests target runname test-patts user flags #!key (run-count 1)) ;; test-names (let* ((keys (keys:config-get-fields *configdat*)) (keyvals (keys:target->keyval keys target)) (run-id (rmt:register-run keyvals runname "new" "n/a" user (args:get-arg "-contour"))) ;; test-name))) ;; (deferred '()) ;; delay running these since they have a waiton clause (runconfigf (conc *toppath* "/runconfigs.config")) (dbfile (conc *toppath* "/megatest.db")) | > | 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 | ;; runs:run-tests is called from megatest.scm and itself ;;====================================================================== ;; ;; test-names: Comma separated patterns same as test-patts but used in selection ;; of tests to run. The item portions are not respected. ;; FIXME: error out if /patt specified ;; ;; run-count is passed from megatest.scm as configf:lookup *configdat* "setup" "reruns", or defaults to 1. (define (runs:run-tests target runname test-patts user flags #!key (run-count 1)) ;; test-names (let* ((keys (keys:config-get-fields *configdat*)) (keyvals (keys:target->keyval keys target)) (run-id (rmt:register-run keyvals runname "new" "n/a" user (args:get-arg "-contour"))) ;; test-name))) ;; (deferred '()) ;; delay running these since they have a waiton clause (runconfigf (conc *toppath* "/runconfigs.config")) (dbfile (conc *toppath* "/megatest.db")) |
︙ | ︙ | |||
622 623 624 625 626 627 628 | ;; (set! required-tests (lset-intersection equal? test-names all-test-names)) ;; look up all tests matching the comma separated list of globs in ;; test-patts (using % as wildcard) ;; (set! test-names (delete-duplicates (tests:get-valid-tests *toppath* test-patts))) (debug:print-info 0 *default-log-port* "tests search path: " (string-intersperse (tests:get-tests-search-path *configdat*) " ")) | | | 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 | ;; (set! required-tests (lset-intersection equal? test-names all-test-names)) ;; look up all tests matching the comma separated list of globs in ;; test-patts (using % as wildcard) ;; (set! test-names (delete-duplicates (tests:get-valid-tests *toppath* test-patts))) (debug:print-info 0 *default-log-port* "tests search path: " (string-intersperse (tests:get-tests-search-path *configdat*) " ")) (debug:print-info 2 *default-log-port* "all tests: " (string-intersperse (sort all-test-names string<) " ")) (debug:print-info 0 *default-log-port* "test names: " (string-intersperse (sort test-names string<) " ")) (debug:print-info 0 *default-log-port* "required tests: " (string-intersperse (sort required-tests string<) " ")) ;; on the first pass or call to run-tests set FAILS to NOT_STARTED if ;; -keepgoing is specified (if (eq? *passnum* 0) (begin |
︙ | ︙ | |||
2391 2392 2393 2394 2395 2396 2397 | ;; seek and kill in flight -runtests with % as testpatt here ;; (if (equal? testpatt "%") (tasks:kill-runner target run-name testpatt) ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt)) (debug:print 1 *default-log-port* "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((set-state-status) ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) | | | 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 | ;; seek and kill in flight -runtests with % as testpatt here ;; (if (equal? testpatt "%") (tasks:kill-runner target run-name testpatt) ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt)) (debug:print 1 *default-log-port* "Removing tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((set-state-status) ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) (debug:print 1 *default-log-port* "Modifying state and status for tests for run: " runkey " " (db:get-value-by-header run header "runname"))) ((print-run) (debug:print 1 *default-log-port* "Printing info for run " runkey ", run=" run ", tests=" tests ", header=" header) action) ((run-wait) (debug:print 1 *default-log-port* "Waiting for run " runkey ", run=" runnamepatt " to complete")) ((archive) (debug:print 1 *default-log-port* "Archiving/restoring (" (args:get-arg "-archive") ") data for run: " runkey " " (db:get-value-by-header run header "runname")) |
︙ | ︙ |
Modified server.scm from [19fd15cc14] to [c7b3629e26].
︙ | ︙ | |||
109 110 111 112 113 114 115 | ((rpc) (db:obj->string (vector success/fail query-sig result))) ((http) (db:obj->string (vector success/fail query-sig result))) ((fs) result) (else (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) result))) | | | | 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | ((rpc) (db:obj->string (vector success/fail query-sig result))) ((http) (db:obj->string (vector success/fail query-sig result))) ((fs) result) (else (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) result))) ;; Given an area path, start a server process ### NOTE ### > file 2>&1 ;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (server:run areapath) ;; areapath is *toppath* for a given testsuite area (let* ((curr-host (get-host-name)) ;; (attempt-in-progress (server:start-attempted? areapath)) ;; (dot-server-url (server:check-if-running areapath)) |
︙ | ︙ | |||
153 154 155 156 157 158 159 | (not (equal? curr-ip target-host))) (begin (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) (setenv "TARGETHOST" target-host))) (setenv "TARGETHOST_LOGF" logfile) (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time | | < | | > > | 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 | (not (equal? curr-ip target-host))) (begin (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) (setenv "TARGETHOST" target-host))) (setenv "TARGETHOST_LOGF" logfile) (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory))) ;; given a path to a server log return: host port startseconds server-id ;; any changes to number of elements returned by this fuction will dirctly affect server:record->url,server:record->id,server:kill,server:get-num-alive which use match let ;; example of what it's looking for in the log file: ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 (define (server:logf-get-start-info logf) (let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id (dbprep-rx (regexp "^SERVER: dbprep")) (dbprep-found 0)) (handle-exceptions exn |
︙ | ︙ | |||
199 200 201 202 203 204 205 | (list (car dat) ;; host (string->number (cadr dat)) ;; port (string->number (caddr dat)) (cadr (cddr dat)))))) (begin (if dbprep-found (begin | | | | | 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 | (list (car dat) ;; host (string->number (cadr dat)) ;; port (string->number (caddr dat)) (cadr (cddr dat)))))) (begin (if dbprep-found (begin (debug:print-info 0 *default-log-port* "Server is in dbprep at " (common:human-time)) (thread-sleep! 0.5) ;; was 25 sec but that blocked things from starting? ) (debug:print-info 0 *default-log-port* "Unable to get server info from " logf " at " (seconds->time-string (current-seconds))) ) (list #f #f #f #f))))))))) ;; get a list of servers from the log files, with all relevant data ;; ( mod-time host port start-time pid ) ;; (define (server:get-list areapath #!key (limit #f)) (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) (day-seconds (* 24 60 60))) ;; if the directory exists continue to get the list ;; otherwise attempt to create the logs dir and then |
︙ | ︙ | |||
232 233 234 235 236 237 238 | ;; Get the list of server logs that do not contain "exiting". Ignore logs for servers that have exited. (let* ((server-logs-cmd (conc "grep -iL exiting " areapath "/logs/server-*-*.log")) (server-logs (string-split (string-chomp (call-with-input-pipe server-logs-cmd read-all)))) (num-serv-logs (length server-logs))) (if (or (null? server-logs) (= num-serv-logs 0)) (let () | | | 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 | ;; Get the list of server logs that do not contain "exiting". Ignore logs for servers that have exited. (let* ((server-logs-cmd (conc "grep -iL exiting " areapath "/logs/server-*-*.log")) (server-logs (string-split (string-chomp (call-with-input-pipe server-logs-cmd read-all)))) (num-serv-logs (length server-logs))) (if (or (null? server-logs) (= num-serv-logs 0)) (let () (debug:print 1 *default-log-port* "There are no servers running at " (common:human-time)) '() ) (let loop ((hed (string-chomp (car server-logs))) (tal (cdr server-logs)) (res '())) (let* ((mod-time (handle-exceptions exn |
︙ | ︙ | |||
363 364 365 366 367 368 369 | (define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) (set! *my-client-signature* sig) *my-client-signature*))) | | | | > > > > > | > > > > > | | | | | < < < < < < < < < < | < | > | < > > | 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 | (define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) (set! *my-client-signature* sig) *my-client-signature*))) ;; if server-start-last exists, overwrite it. Otherwise loop recursively until it is old enough. ;; (define (server:wait-for-server-start-last-flag areapath) (let* ((start-flag (conc areapath "/logs/server-start-last")) ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4)) (server-key (conc (get-host-name) "-" (current-process-id)))) (if (file-exists? start-flag) (let* ((fmodtime (file-modification-time start-flag)) (delta (- (current-seconds) fmodtime)) (all-go (> delta idletime)) (old-server-key (with-input-from-file start-flag (lambda () (read-line)))) ) ;; write a new start-flag file, wait 0.25s, then if the previous start-flag file was older than <idletime> seconds, and the new file still has the same server key as you just wrote, return #t. ;; (if (and all-go (begin (debug:print-info 0 *default-log-port* "Writing " start-flag) (with-output-to-file start-flag (lambda () (print server-key))) (thread-sleep! 0.25) (let ((res (with-input-from-file start-flag (lambda () (read-line))))) (equal? server-key res))) ) #t ;; (system (conc "touch " start-flag)) ;; lazy but safe ;; If either of the above conditions are not true, print a "Gating server start" message, wait <idle-time>, then call this function recursively. (begin (debug:print-info 0 *default-log-port* "Gating server start, last start: " (seconds->time-string fmodtime) ", time since last start: " delta ", required idletime: " idletime ", gating reason:" (if all-go "server key does not match" "too soon to start another server")) (debug:print-info 0 *default-log-port* "server keys: from file: " old-server-key " needed: " server-key) (thread-sleep! idletime) (server:wait-for-server-start-last-flag areapath))))))) ;; kind start up of servers, wait before allowing another server for a given ;; area to be launched ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least <server idletime> seconds old (server:wait-for-server-start-last-flag areapath) (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ( (lock-file (conc areapath "/logs/server-start.lock"))) (let* ((start-flag (conc areapath "/logs/server-start-last"))) (common:simple-file-lock-and-wait lock-file expire-time: 15) (debug:print-info 0 *default-log-port* "server:kind-run: touching " start-flag) (system (conc "touch " start-flag)) ;; lazy but safe (server:run areapath) (thread-sleep! 18) ;; don't release the lock for at least a few seconds. And allow time for the server startup to get to "SERVER STARTED". (common:simple-file-release-lock lock-file))) (debug:print-info 0 *default-log-port* "Found server already running. NOT trying to start another.") ) ) ;; this one seems to be the general entry point ;; (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) (let loop ((server-info (server:check-if-running areapath)) (try-num 0)) |
︙ | ︙ |