Changes In Branch private Through [42e2fcd1c8] Excluding Merge-Ins
This is equivalent to a diff from 0044c7f04a to 42e2fcd1c8
2017-03-24
| ||
22:39 | Merged v1.63 changes into v1.64 check-in: b2e6452e2f user: matt tags: v1.64 | |
22:18 | non-good commit of merge multi-server-hack into v1.64 Leaf check-in: 42e2fcd1c8 user: matt tags: private (unpublished) | |
18:22 | Fixed couple issues with the throttle by api load but still not sure it is working right. Cleaned up few other transport things. Closed-Leaf check-in: e7dcebe686 user: matt tags: multi-server-hack | |
2017-03-22
| ||
13:29 | Merged in db-only-on-homehost fix check-in: 0044c7f04a user: matt tags: v1.64 | |
13:29 | Fixed several places where an attempt was being made to open databases in spite on NOT being on homehost. Closed-Leaf check-in: 6e46d4a93e user: matt tags: db-only-on-homehost | |
12:53 | Cleaned out handful of not used globals and added exception handler for dealing with problems when switching log file output port check-in: f61cc5eb98 user: matt tags: v1.64 | |
Modified api.scm from [f1d46841da] to [b8376b33fc].
︙ | ︙ | |||
114 115 116 117 118 119 120 | ;; - keep it simple, only return the actual result of the call, i.e. no meta info here ;; ;; - returns #( flag result ) ;; (define (api:execute-requests dbstruct dat) (handle-exceptions exn | | < | > > | | | | | | | | | 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | ;; - keep it simple, only return the actual result of the call, i.e. no meta info here ;; ;; - returns #( flag result ) ;; (define (api:execute-requests dbstruct dat) (handle-exceptions exn (let ((call-chain (get-call-chain))) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat) (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (cond ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f (vector #f "remote must be called with a vector"))) ((> *api-process-request-count* 20) ;; 20) 'overloaded) ;; the inner vector is what gets returned. nope, don't know why. please refactor! (else (let* ((cmd-in (vector-ref dat 0)) (cmd (if (symbol? cmd-in) cmd-in (string->symbol cmd-in))) (params (vector-ref dat 1)) (start-t (current-milliseconds)) (readonly-mode (dbr:dbstruct-read-only dbstruct)) (readonly-command (member cmd api:read-only-queries)) (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) (res (if writecmd-in-readonly-mode (conc "attempt to run write command "cmd" on a read-only database") (case cmd ;;=============================================== ;; READ/WRITE QUERIES |
︙ | ︙ | |||
269 270 271 272 273 274 275 | ((ping) (current-process-id)) ((get-changed-record-ids) (apply db:get-changed-record-ids dbstruct params)) ;; TESTMETA ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) ;; TASKS | | > > > | > | | | | > | | | | > | > > | 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 | ((ping) (current-process-id)) ((get-changed-record-ids) (apply db:get-changed-record-ids dbstruct params)) ;; TESTMETA ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) ;; TASKS ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)) (else (debug:print 0 *default-log-port* "ERROR: bad api call " cmd) (conc "ERROR: BAD api call " cmd)))))) ;; save all stats (let ((delta-t (- (current-milliseconds) start-t))) (hash-table-set! *db-api-call-time* cmd (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) (if writecmd-in-readonly-mode (vector #f res) (vector #t res))))))) ;; http-server send-response ;; api:process-request ;; db:* ;; ;; NB// Runs on the server as part of the server loop ;; (define (api:process-request dbstruct $) ;; the $ is the request vars proc (set! *api-process-request-count* (+ *api-process-request-count* 1)) (let* ((cmd ($ 'cmd)) (paramsj ($ 'params)) (params (db:string->obj paramsj transport: 'http)) ;; incoming data from the POST (or is it a GET?) (resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) (success (vector-ref resdat 0)) (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?) (if (not success) (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params)) (if (> *api-process-request-count* *max-api-process-requests*) (set! *max-api-process-requests* *api-process-request-count*)) (set! *api-process-request-count* (- *api-process-request-count* 1)) ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds ;; (rmt:dat->json-str ;; (if (or (string? res) ;; (list? res) ;; (number? res) ;; (boolean? res)) ;; res ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res))))) (db:obj->string res transport: 'http))) |
Modified client.scm from [19d242e4dd] to [950fa4a4a2].
︙ | ︙ | |||
76 77 78 79 80 81 82 | (begin (debug:print-error 0 *default-log-port* "failed to start or connect to server") (exit 1)) ;; ;; Alternatively here, we can get the list of candidate servers and work our way ;; through them searching for a good one. ;; | | | 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | (begin (debug:print-error 0 *default-log-port* "failed to start or connect to server") (exit 1)) ;; ;; Alternatively here, we can get the list of candidate servers and work our way ;; through them searching for a good one. ;; (let* ((server-dat (server:get-rand-best areapath)) ;; (server:get-first-best areapath)) (runremote (or area-dat *runremote*))) (if (not server-dat) ;; no server found (client:setup-http areapath remaining-tries: (- remaining-tries 1)) (let ((host (cadr server-dat)) (port (caddr server-dat))) (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) (if (and (not area-dat) |
︙ | ︙ | |||
106 107 108 109 110 111 112 | (case *transport-type* ((http)(http-transport:close-connections))) (remote-conndat-set! runremote #f) ;; (hash-table-delete! runremote run-id) (thread-sleep! 1) (client:setup-http areapath remaining-tries: (- remaining-tries 1)) ))) (begin ;; no server registered | | > < | 106 107 108 109 110 111 112 113 114 115 116 117 118 | (case *transport-type* ((http)(http-transport:close-connections))) (remote-conndat-set! runremote #f) ;; (hash-table-delete! runremote run-id) (thread-sleep! 1) (client:setup-http areapath remaining-tries: (- remaining-tries 1)) ))) (begin ;; no server registered ;; (server:kind-run areapath) (server:start-and-wait areapath) (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. (client:setup-http areapath remaining-tries: (- remaining-tries 1))))))))) |
Modified common.scm from [0b58394952] to [4b658465e8].
︙ | ︙ | |||
153 154 155 156 157 158 159 | (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) | | > > > | 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) (transport *transport-type*) (server-timeout (or (server:get-timeout) 100)) ;; default to 100 seconds (force-server #f) (ro-mode #f) (ro-mode-checked #f)) ;; flag that indicates we have checked for ro-mode ;; launching and hosts (defstruct host (reachable #f) (last-update 0) (last-used 0) (last-cpuload 1)) |
︙ | ︙ | |||
416 417 418 419 420 421 422 | (read (open-input-string (z3:decode-buffer (base64:base64-decode instr)))))) ;; dot-locking egg seems not to work, using this for now ;; if lock is older than expire-time then remove it and try again ;; to get the lock ;; (define (common:simple-file-lock fname #!key (expire-time 300)) | > > > | | | | | | | | | | | | | | | | | > > > > > > > > > > > > > > | | 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 | (read (open-input-string (z3:decode-buffer (base64:base64-decode instr)))))) ;; dot-locking egg seems not to work, using this for now ;; if lock is older than expire-time then remove it and try again ;; to get the lock ;; (define (common:simple-file-lock fname #!key (expire-time 300)) (handle-exceptions exn #f ;; don't really care what went wrong right now. NOTE: I have not seen this one actually fail. (if (file-exists? fname) (if (> (- (current-seconds)(file-modification-time fname)) expire-time) (begin (delete-file* fname) (common:simple-file-lock fname expire-time: expire-time)) #f) (let ((key-string (conc (get-host-name) "-" (current-process-id)))) (with-output-to-file fname (lambda () (print key-string))) (thread-sleep! 0.25) (if (file-exists? fname) (with-input-from-file fname (lambda () (equal? key-string (read-line)))) #f))))) (define (common:simple-file-lock-and-wait fname #!key (expire-time 300)) (let ((end-time (+ expire-time (current-seconds)))) (let loop ((got-lock (common:simple-file-lock fname expire-time: expire-time))) (if got-lock #t (if (> end-time (current-seconds)) (begin (thread-sleep! 3) (loop (common:simple-file-lock fname expire-time: expire-time))) #f))))) (define (common:simple-file-release-lock fname) (handle-exceptions exn #f ;; I don't really care why this failed (at least for now) (delete-file* fname))) ;;====================================================================== ;; S T A T E S A N D S T A T U S E S ;;====================================================================== (define *common:std-states* '((0 "ARCHIVED") |
︙ | ︙ | |||
561 562 563 564 565 566 567 | (if res (cadr res)(if (null? default) #f (car default))))) (define (common:get-testsuite-name) (or (configf:lookup *configdat* "setup" "area-name") ;; megatest is a flexible tool, testsuite is too limiting a description. (configf:lookup *configdat* "setup" "testsuite" ) (if (string? *toppath* ) (pathname-file *toppath*) | | | 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 | (if res (cadr res)(if (null? default) #f (car default))))) (define (common:get-testsuite-name) (or (configf:lookup *configdat* "setup" "area-name") ;; megatest is a flexible tool, testsuite is too limiting a description. (configf:lookup *configdat* "setup" "testsuite" ) (if (string? *toppath* ) (pathname-file *toppath*) #f))) ;; (pathname-file (current-directory))))) (define common:get-area-name common:get-testsuite-name) (define (common:get-db-tmp-area) (if *db-cache-path* *db-cache-path* (if *toppath* |
︙ | ︙ | |||
953 954 955 956 957 958 959 | ;; (tags-testpatt ;; (debug:print-info 0 *default-log-port* "-tagexpr "tagexpr" selects testpatt "tags-testpatt) ;; tags-testpatt) ((and (equal? args-testpatt "%") rtestpatt) (debug:print-info 0 *default-log-port* "testpatt defined in "testpatt-key" from runconfigs: " rtestpatt) rtestpatt) (else args-testpatt)))) | | > > > > > > > > > > > > > > > > > > > > > > > > | 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 | ;; (tags-testpatt ;; (debug:print-info 0 *default-log-port* "-tagexpr "tagexpr" selects testpatt "tags-testpatt) ;; tags-testpatt) ((and (equal? args-testpatt "%") rtestpatt) (debug:print-info 0 *default-log-port* "testpatt defined in "testpatt-key" from runconfigs: " rtestpatt) rtestpatt) (else args-testpatt)))) (define (common:false-on-exception thunk #!key (message #f)) (handle-exceptions exn (begin (if message (debug:print-info 0 *default-log-port* message)) #f) (thunk) )) (define (common:file-exists? path-string) ;; this avoids stack dumps in the case where ;;;; TODO: catch permission denied exceptions and emit appropriate warnings, eg: system error while trying to access file: "/nfs/pdx/disks/icf_env_disk001/bjbarcla/gwa/issues/mtdev/randy-slow/reproduce/q... (common:false-on-exception (lambda () (file-exists? path-string)) message: (conc "Unable to access path: " path-string) )) (define (common:directory-exists? path-string) ;;;; TODO: catch permission denied exceptions and emit appropriate warnings, eg: system error while trying to access file: "/nfs/pdx/disks/icf_env_disk001/bjbarcla/gwa/issues/mtdev/randy-slow/reproduce/q... (common:false-on-exception (lambda () (directory-exists? path-string)) message: (conc "Unable to access path: " path-string) )) (define (common:get-linktree) (or (getenv "MT_LINKTREE") (or (and *configdat* (configf:lookup *configdat* "setup" "linktree")) (if *toppath* (conc *toppath* "/lt") (if (file-exists? "megatest.config") ;; we are in the toppath (new area, mtutils compatible) |
︙ | ︙ | |||
1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 | ;; am I on the homehost? ;; (define (common:on-homehost?) (let ((hh (common:get-homehost))) (if hh (cdr hh) #f))) ;; do we honor the caches of the config files? ;; (define (common:use-cache?) (not (or (args:get-arg "-no-cache") (and *configdat* (equal? (configf:lookup *configdat* "setup" "use-cache") "no"))))) | > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 | ;; am I on the homehost? ;; (define (common:on-homehost?) (let ((hh (common:get-homehost))) (if hh (cdr hh) #f))) ;; do we honor the caches of the config files? ;; (define (common:use-cache?) (not (or (args:get-arg "-no-cache") (and *configdat* (equal? (configf:lookup *configdat* "setup" "use-cache") "no"))))) ;; force use of server? ;; (define (common:force-server?) (let* ((force-setting (configf:lookup *configdat* "server" "force")) (force-type (if force-setting (string->symbol force-setting) #f)) (force-result (case force-type ((#f) #f) ((always) #t) ((test) (if (args:get-arg "-execute") ;; we are in a test #t #f)) (else (debug:print 0 *default-log-port* "ERROR: Bad server force setting " force-setting ", forcing server.") #t)))) ;; default to requiring server (if force-result (begin (debug:print-info 0 *default-log-port* "forcing use of server, force setting is \"" force-setting "\".") #t) #f))) ;; do we honor the caches of the config files? ;; (define (common:use-cache?) (not (or (args:get-arg "-no-cache") (and *configdat* (equal? (configf:lookup *configdat* "setup" "use-cache") "no"))))) |
︙ | ︙ |
Modified dashboard.scm from [3869e6c0c0] to [15a7467abe].
︙ | ︙ | |||
2716 2717 2718 2719 2720 2721 2722 | ;; Force creation of the db in case it isn't already there. ;; (tasks:open-db) (define (dashboard:get-youngest-run-db-mod-time dbdir) (handle-exceptions exn (begin | | | 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 | ;; Force creation of the db in case it isn't already there. ;; (tasks:open-db) (define (dashboard:get-youngest-run-db-mod-time dbdir) (handle-exceptions exn (begin (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir) (current-seconds)) ;; something went wrong - just print an error and return current-seconds (common:max (map (lambda (filen) (file-modification-time filen)) (glob (conc dbdir "/*.db*")))))) (define (dashboard:monitor-changed? commondat tabdat) (let* ((run-update-time (current-seconds)) |
︙ | ︙ |
Modified db.scm from [f0fec9ca36] to [8260cfea21].
︙ | ︙ | |||
2100 2101 2102 2103 2104 2105 2106 | (define (db:print-current-query-stats) ;; generate stats from *db-api-call-time* (let ((ordered-keys (sort (hash-table-keys *db-api-call-time*) (lambda (a b) (let ((sum-a (common:sum (hash-table-ref *db-api-call-time* a))) (sum-b (common:sum (hash-table-ref *db-api-call-time* b)))) | | > > | > | > | 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 | (define (db:print-current-query-stats) ;; generate stats from *db-api-call-time* (let ((ordered-keys (sort (hash-table-keys *db-api-call-time*) (lambda (a b) (let ((sum-a (common:sum (hash-table-ref *db-api-call-time* a))) (sum-b (common:sum (hash-table-ref *db-api-call-time* b)))) (> sum-a sum-b))))) (total 0)) (for-each (lambda (cmd-key) (let* ((dat (hash-table-ref *db-api-call-time* cmd-key)) (num (length dat)) (avg (if (> num 0) (/ (common:sum dat)(length dat))))) (set! total (+ total num)) (debug:print-info 0 *default-log-port* cmd-key "\tavg: " avg " max: " (common:max dat) " min: " (common:min-max < dat) " num: " (length dat)))) ordered-keys) (debug:print-info 0 *default-log-port* "TOTAL: " total " api calls since start."))) (define (db:get-all-run-ids dbstruct) (db:with-db dbstruct #f #f (lambda (db) |
︙ | ︙ |
Modified http-transport.scm from [d55287325f] to [a7745a1b3a].
︙ | ︙ | |||
264 265 266 267 268 269 270 | (th2 (make-thread time-out "time out"))) (thread-start! th1) (thread-start! th2) (thread-join! th1) (thread-terminate! th2) (debug:print-info 11 *default-log-port* "got res=" res) (if (vector? res) | | | | 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 | (th2 (make-thread time-out "time out"))) (thread-start! th1) (thread-start! th2) (thread-join! th1) (thread-terminate! th2) (debug:print-info 11 *default-log-port* "got res=" res) (if (vector? res) (if (vector-ref res 0) ;; this is the first flag or the second flag? res ;; this is the *inner* vector? seriously? why? (if (debug:debug-mode 11) (begin ;; note: this code also called in nmsg-transport - consider consolidating it (debug:print-error 11 *default-log-port* "error occured at server, info=" (vector-ref res 2)) (debug:print 11 *default-log-port* " client call chain:") (print-call-chain (current-error-port)) (debug:print 11 *default-log-port* " server call chain:") (pp (vector-ref res 1) (current-error-port)) |
︙ | ︙ | |||
416 417 418 419 420 421 422 | (set! last-access *db-last-access*) (mutex-unlock! *heartbeat-mutex*) (if (common:low-noise-print 120 (conc "server running on " iface ":" port)) (begin (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) (flush-output *default-log-port*))) | | > > > | 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 | (set! last-access *db-last-access*) (mutex-unlock! *heartbeat-mutex*) (if (common:low-noise-print 120 (conc "server running on " iface ":" port)) (begin (debug:print 0 *default-log-port* "SERVER STARTED: " iface ":" port " AT " (current-seconds)) (flush-output *default-log-port*))) (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600)) (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond |
︙ | ︙ |
Modified launch.scm from [7f8f1157fc] to [df0549f95d].
︙ | ︙ | |||
419 420 421 422 423 424 425 426 427 428 429 430 431 432 | ;; (serverinf (assoc/default 'serverinf cmdinfo)) ;; (port (assoc/default 'port cmdinfo)) (serverurl (assoc/default 'serverurl cmdinfo)) (homehost (assoc/default 'homehost cmdinfo)) (run-id (assoc/default 'run-id cmdinfo)) (test-id (assoc/default 'test-id cmdinfo)) (target (assoc/default 'target cmdinfo)) (itemdat (assoc/default 'itemdat cmdinfo)) (env-ovrd (assoc/default 'env-ovrd cmdinfo)) (set-vars (assoc/default 'set-vars cmdinfo)) ;; pre-overrides from -setvar (runname (assoc/default 'runname cmdinfo)) (megatest (assoc/default 'megatest cmdinfo)) (runtlim (assoc/default 'runtlim cmdinfo)) (contour (assoc/default 'contour cmdinfo)) | > | 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 | ;; (serverinf (assoc/default 'serverinf cmdinfo)) ;; (port (assoc/default 'port cmdinfo)) (serverurl (assoc/default 'serverurl cmdinfo)) (homehost (assoc/default 'homehost cmdinfo)) (run-id (assoc/default 'run-id cmdinfo)) (test-id (assoc/default 'test-id cmdinfo)) (target (assoc/default 'target cmdinfo)) (areaname (assoc/default 'areaname cmdinfo)) (itemdat (assoc/default 'itemdat cmdinfo)) (env-ovrd (assoc/default 'env-ovrd cmdinfo)) (set-vars (assoc/default 'set-vars cmdinfo)) ;; pre-overrides from -setvar (runname (assoc/default 'runname cmdinfo)) (megatest (assoc/default 'megatest cmdinfo)) (runtlim (assoc/default 'runtlim cmdinfo)) (contour (assoc/default 'contour cmdinfo)) |
︙ | ︙ | |||
443 444 445 446 447 448 449 450 451 452 453 454 455 456 | (file-execute-access? fulln)) fulln runscript))))) ;; assume it is on the path ) ;; (rollup-status 0) (if contour (setenv "MT_CONTOUR" contour)) ;; On NFS it can be slow and unreliable to get needed startup information. ;; i. Check if we are on the homehost, if so, proceed ;; ii. Check if host and port passed in via CMDINFO are valid and if ;; possible use them. (let ((bestadrs (server:get-best-guess-address (get-host-name))) (needcare #f)) (if (equal? homehost bestadrs) ;; we are likely on the homehost | > > > > > > > | 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 | (file-execute-access? fulln)) fulln runscript))))) ;; assume it is on the path ) ;; (rollup-status 0) (if contour (setenv "MT_CONTOUR" contour)) ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ... ;; (setenv "MT_TESTSUITENAME" areaname) (setenv "MT_RUN_AREA_HOME" top-path) (set! *toppath* top-path) (setenv "MT_TEST_RUN_DIR" work-area) ;; On NFS it can be slow and unreliable to get needed startup information. ;; i. Check if we are on the homehost, if so, proceed ;; ii. Check if host and port passed in via CMDINFO are valid and if ;; possible use them. (let ((bestadrs (server:get-best-guess-address (get-host-name))) (needcare #f)) (if (equal? homehost bestadrs) ;; we are likely on the homehost |
︙ | ︙ | |||
465 466 467 468 469 470 471 | (start-res (http-transport:client-connect host port)) (ping-res (rmt:login-no-auto-client-setup start-res))) (if (and start-res ping-res) (let ((url (http-transport:server-dat-make-url start-res))) (remote-conndat-set! *runremote* start-res) (remote-server-url-set! *runremote* url) | > | > > > > | 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 | (start-res (http-transport:client-connect host port)) (ping-res (rmt:login-no-auto-client-setup start-res))) (if (and start-res ping-res) (let ((url (http-transport:server-dat-make-url start-res))) (remote-conndat-set! *runremote* start-res) (remote-server-url-set! *runremote* url) (if (server:ping url) (debug:print-info 0 *default-log-port* "connected to " url " using CMDINFO data.") (begin (debug:print-info 0 *default-log-port* "have CMDINFO data but failed to connect to " url) (remote-conndat-set! *runremote* #f) (remote-server-url-set! *runremote* #f)))) (debug:print-info 0 *default-log-port* "received " host ":" port " for url but could not connect.") )) (begin (debug:print-info 0 *default-log-port* (if host-port (conc "received invalid host-port information " host-port) "no host-port information received")) ;; potential for bad situation if simultaneous starting of hundreds of jobs on servers, set needcare. |
︙ | ︙ | |||
779 780 781 782 783 784 785 | (mutex-unlock! *launch-setup-mutex*) *toppath*) (let ((res (launch:setup-body force: force areapath: areapath))) (mutex-unlock! *launch-setup-mutex*) res))) (define (launch:setup-body #!key (force #f) (areapath #f)) | > > | | | | | | | | | | | | | | | | | | | < < < | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | < < < < | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | < | < < | | | | | < | | 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 | (mutex-unlock! *launch-setup-mutex*) *toppath*) (let ((res (launch:setup-body force: force areapath: areapath))) (mutex-unlock! *launch-setup-mutex*) res))) (define (launch:setup-body #!key (force #f) (areapath #f)) (if (and (eq? *configstatus* 'fulldata) *toppath*) ;; no need to reprocess *toppath* ;; return toppath (let* ((use-cache (common:use-cache?)) (toppath (or *toppath* areapath (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath (runname (common:args-get-runname)) (target (common:args-get-target exit-if-bad: #t)) (linktree (common:get-linktree)) (contour #f) ;; NOT READY FOR THIS (args:get-arg "-contour")) (sections (if target (list "default" target) #f)) ;; for runconfigs (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config (rundir (if (and runname target linktree)(conc linktree "/" target "/" runname) #f)) (mtcachef (and rundir (conc rundir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) (rccachef (and rundir (conc rundir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) (cancreate (and rundir (common:file-exists? rundir)(file-write-access? rundir)))) ;; (cxt (hash-table-ref/default *contexts* toppath #f))) ;; create our cxt for this area if it doesn't already exist ;; (if (not cxt)(hash-table-set! *contexts* toppath (make-cxt))) ;; (print "runname: " runname " target: " target " mtcachef: " mtcachef " rccachef: " rccachef) (set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource (cond ;; if mtcachef exists just read it, however we need to assume toppath is available in $MT_RUN_AREA_HOME ((and mtcachef (common:file-exists? mtcachef) (get-environment-variable "MT_RUN_AREA_HOME") use-cache) (set! *configdat* (configf:read-alist mtcachef)) (set! *runconfigdat* (configf:read-alist rccachef)) (set! *configinfo* (list *configdat* (get-environment-variable "MT_RUN_AREA_HOME"))) (set! *configstatus* 'fulldata) (set! *toppath* (get-environment-variable "MT_RUN_AREA_HOME")) *toppath*) ;; we have all the info needed to fully process runconfigs and megatest.config (mtcachef (let* ((first-pass (find-and-read-config ;; NB// sets MT_RUN_AREA_HOME as side effect mtconfig environ-patt: "env-override" given-toppath: toppath pathenvvar: "MT_RUN_AREA_HOME")) (first-rundat (let ((toppath (if toppath toppath (car first-pass)))) (read-config ;; (conc toppath "/runconfigs.config") ;; this should be converted to runconfig:read but it is non-trivial, leaving it for now. (conc (if (string? toppath) toppath (get-environment-variable "MT_RUN_AREA_HOME")) "/runconfigs.config") *runconfigdat* #t sections: sections)))) (set! *runconfigdat* first-rundat) (if first-pass ;; (begin (set! *configdat* (car first-pass)) (set! *configinfo* first-pass) (set! *toppath* (or toppath (cadr first-pass))) ;; use the gathered data unless already have it (set! toppath *toppath*) (if (not *toppath*) (begin (debug:print-error 0 *default-log-port* "you are not in a megatest area!") (exit 1))) (setenv "MT_RUN_AREA_HOME" *toppath*) ;; the seed read is done, now read runconfigs, cache it then read megatest.config one more time and cache it (let* ((keys (rmt:get-keys)) (key-vals (keys:target->keyval keys target)) (linktree (common:get-linktree)) ;; (or (getenv "MT_LINKTREE")(if *configdat* (configf:lookup *configdat* "setup" "linktree") #f))) (second-pass (find-and-read-config mtconfig environ-patt: "env-override" given-toppath: toppath pathenvvar: "MT_RUN_AREA_HOME")) (runconfigdat (begin ;; this read of the runconfigs will see any adjustments made by re-reading megatest.config (for-each (lambda (kt) (setenv (car kt) (cadr kt))) key-vals) (read-config (conc toppath "/runconfigs.config") *runconfigdat* #t ;; consider using runconfig:read some day ... sections: sections)))) (if cancreate (configf:write-alist runconfigdat rccachef)) (set! *runconfigdat* runconfigdat) (if cancreate (configf:write-alist *configdat* mtcachef)) (if cancreate (set! *configstatus* 'fulldata)))) ;; no configs found? should not happen but let's try to recover gracefully, return an empty hash-table (set! *configdat* (make-hash-table)) ))) ;; end of mtcachef ;; else read what you can and set the flag accordingly (else ;;; else clause for cond (let* ((cfgdat (find-and-read-config (or (args:get-arg "-config") "megatest.config") environ-patt: "env-override" given-toppath: (get-environment-variable "MT_RUN_AREA_HOME") pathenvvar: "MT_RUN_AREA_HOME"))) (if cfgdat (let* ((toppath (or (get-environment-variable "MT_RUN_AREA_HOME")(cadr cfgdat))) (rdat (read-config (conc toppath ;; convert this to use runconfig:read! "/runconfigs.config") *runconfigdat* #t sections: sections))) (set! *configinfo* cfgdat) (set! *configdat* (car cfgdat)) (set! *runconfigdat* rdat) (set! *toppath* toppath) (set! *configstatus* 'partial)) (begin (debug:print-error 0 *default-log-port* "No " mtconfig " file found. Giving up.") (exit 2)))))) ;;; cond ends here ;; additional house keeping (let* ((linktree (common:get-linktree))) (if linktree (begin (if (not (common:file-exists? linktree)) (begin (handle-exceptions exn (begin (debug:print-error 0 *default-log-port* "Something went wrong when trying to create linktree dir at " linktree) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (exit 1)) (create-directory linktree #t)))) (handle-exceptions exn (begin (debug:print-error 0 *default-log-port* "Something went wrong when trying to create link to linktree at " *toppath*) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))) (let ((tlink (conc *toppath* "/lt"))) (if (not (file-exists? tlink)) (create-symbolic-link linktree tlink))))) (begin ;; no link tree spec? (debug:print-error 0 *default-log-port* "linktree not defined in [setup] section of megatest.config") ))) (if (and *toppath* (directory-exists? *toppath*)) (begin (setenv "MT_RUN_AREA_HOME" *toppath*) (setenv "MT_TESTSUITENAME" (common:get-testsuite-name))) (begin (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area.") (set! *toppath* #f))) ;; force it to be false so we return #f ;; if have -append-config then read and append here (let ((cfname (args:get-arg "-append-config"))) (if (and cfname (file-read-access? cfname)) (read-config cfname *configdat* #t)))))) ;; values are added to the hash, no need to do anything special. (define (get-best-disk confdat testconfig) (let* ((disks (or (and testconfig (hash-table-ref/default testconfig "disks" #f)) (hash-table-ref/default confdat "disks" #f))) (minspace (let ((m (configf:lookup confdat "setup" "minspace"))) (string->number (or m "10000"))))) (if disks (let ((res (common:get-disk-with-most-free-space disks minspace))) ;; min size of 1000, seems tad dumb |
︙ | ︙ | |||
992 993 994 995 996 997 998 | (lnktarget (conc lnkpath "/" item-path))) ;; Update the rundir path in the test record for all, rundir=physical, shortdir=logical ;; rundir shortdir (rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path run-id) (debug:print 2 *default-log-port* "INFO:\n lnkbase=" lnkbase "\n lnkpath=" lnkpath "\n toptest-path=" toptest-path "\n test-path=" test-path) | | | | | 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 | (lnktarget (conc lnkpath "/" item-path))) ;; Update the rundir path in the test record for all, rundir=physical, shortdir=logical ;; rundir shortdir (rmt:general-call 'test-set-rundir-shortdir run-id lnkpathf test-path testname item-path run-id) (debug:print 2 *default-log-port* "INFO:\n lnkbase=" lnkbase "\n lnkpath=" lnkpath "\n toptest-path=" toptest-path "\n test-path=" test-path) (if (not (common:file-exists? linktree)) (begin (debug:print 0 *default-log-port* "WARNING: linktree did not exist! Creating it now at " linktree) (create-directory linktree #t))) ;; (system (conc "mkdir -p " linktree)))) ;; create the directory for the tests dir links, this is needed no matter what... (if (and (not (common:directory-exists? lnkbase)) (not (common:file-exists? lnkbase))) (handle-exceptions exn (begin (debug:print-error 0 *default-log-port* "Problem creating linktree base at " lnkbase) (print-error-message exn (current-error-port))) (create-directory lnkbase #t))) |
︙ | ︙ | |||
1240 1241 1242 1243 1244 1245 1246 | ;; (list 'serverinf *server-info*) (list 'homehost (let* ((hhdat (common:get-homehost))) (if hhdat (car hhdat) #f))) (list 'serverurl (if *runremote* (remote-server-url *runremote*) | | > | 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 | ;; (list 'serverinf *server-info*) (list 'homehost (let* ((hhdat (common:get-homehost))) (if hhdat (car hhdat) #f))) (list 'serverurl (if *runremote* (remote-server-url *runremote*) #f)) ;; (list 'areaname (common:get-testsuite-name)) (list 'toppath *toppath*) (list 'work-area work-area) (list 'test-name test-name) (list 'runscript runscript) (list 'run-id run-id ) (list 'test-id test-id ) ;; (list 'item-path item-path ) |
︙ | ︙ |
Modified rmt.scm from [5167aff935] to [f051a84a44].
︙ | ︙ | |||
53 54 55 56 57 58 59 | ;; do all the prep locked under the rmt-mutex (mutex-lock! *rmt-mutex*) ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; | | | > > > > > | | | > > > > > > | 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | ;; do all the prep locked under the rmt-mutex (mutex-lock! *rmt-mutex*) ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas (runremote (or area-dat *runremote*)) (readonly-mode (if (and runremote (remote-ro-mode-checked runremote)) (remote-ro-mode runremote) (let* ((dbfile (conc *toppath* "/megatest.db")) (ro-mode (not (file-write-access? dbfile)))) ;; TODO: use dbstruct or runremote to figure this out in future (if runremote (begin (remote-ro-mode-set! runremote ro-mode) (remote-ro-mode-checked-set! runremote #t) ro-mode) ro-mode))))) ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) (cond ;; give up if more than 15 attempts ((> attemptnum 15) (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.") (exit 1)) |
︙ | ︙ | |||
91 92 93 94 95 96 97 | (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 30)))) ;; add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") (remote-conndat-set! runremote #f) (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a record for our connection for given area | | | > | > | | > | > > | > > > | | > | > > | | 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 | (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 30)))) ;; add 30 seconds of noise so that not all running tests expire at the same time causing a storm of server starts (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") (remote-conndat-set! runremote #f) (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a record for our connection for given area ((not runremote) ;; can remove this one. should never get here. (set! *runremote* (make-remote)) ;; new runremote will come from this on next iteration (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; ensure we have a homehost record ((not (pair? (remote-hh-dat runremote))) ;; not on homehost (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little (remote-hh-dat-set! runremote (common:get-homehost)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a read ((and (not (remote-force-server runremote)) ;; honor forced use of server (cdr (remote-hh-dat runremote)) ;; on homehost (member cmd api:read-only-queries)) ;; this is a read (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 3") (rmt:open-qry-close-locally cmd 0 params)) ;; on homehost and this is a write, we already have a server, but server has died ((and (cdr (remote-hh-dat runremote)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url runremote) ;; have a server (not (server:check-if-running *toppath*))) ;; server has died. (set! *runremote* (make-remote)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a write, we already have a server ((and (not (remote-force-server runremote)) ;; honor forced use of server (cdr (remote-hh-dat runremote)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url runremote)) ;; have a server (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4") (rmt:open-qry-close-locally cmd 0 params)) ;; on homehost, no server contact made and this is a write, passively start a server ((and (not (remote-force-server runremote)) ;; honor forced use of server (cdr (remote-hh-dat runremote)) ;; new (not (remote-server-url runremote)) (not (member cmd api:read-only-queries))) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") (let ((server-url (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call (if server-url (remote-server-url-set! runremote server-url) ;; the string can be consumed by the client setup if needed (if (common:force-server?) (server:start-and-wait *toppath*) (server:kind-run *toppath*)))) (remote-force-server-set! runremote (common:force-server?)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") (rmt:open-qry-close-locally cmd 0 params)) ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one (not (remote-conndat runremote))) (and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost (not (remote-conndat runremote)))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) (mutex-unlock! *rmt-mutex*) (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up? (server:start-and-wait *toppath*)) (remote-force-server-set! runremote (common:force-server?)) (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as ;; all set up if get this far, dispatch the query ((and (not (remote-force-server runremote)) (cdr (remote-hh-dat runremote))) ;; we are on homehost (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 7") (rmt:open-qry-close-locally cmd (if rid rid 0) params)) ;; not on homehost, do server query (else (mutex-unlock! *rmt-mutex*) |
︙ | ︙ | |||
172 173 174 175 176 177 178 | (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported") (exit)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time ;; (mutex-unlock! *rmt-mutex*) | | < < < | > | | > | < > | > | | | | 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 | (else (debug:print 0 *default-log-port* "ERROR: transport " (remote-transport runremote) " not supported") (exit)))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time ;; (mutex-unlock! *rmt-mutex*) (debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote) (mutex-unlock! *rmt-mutex*) (if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end (if (and (symbol? res) (eq? res 'overloaded)) (let ((wait-delay (+ attemptnum (* attemptnum 10)))) (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") (thread-sleep! wait-delay) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) res) ;; All good, return res (begin (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) (remote-conndat-set! runremote #f) (remote-server-url-set! runremote #f) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") (if (not (server:check-if-running *toppath*)) (server:start-and-wait *toppath*)) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))) ;; (define (rmt:update-db-stats run-id rawcmd params duration) ;; (mutex-lock! *db-stats-mutex*) ;; (handle-exceptions ;; exn ;; (begin |
︙ | ︙ |
Modified server.scm from [a31b48f74d] to [2e218ee65f].
︙ | ︙ | |||
138 139 140 141 142 143 144 | (thread-join! log-rotate) (pop-directory))) ;; given a path to a server log return: host port startseconds ;; (define (server:logf-get-start-info logf) (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs | > > > | | | | | | | | | | | | | | | | < > | | < > | | 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 | (thread-join! log-rotate) (pop-directory))) ;; given a path to a server log return: host port startseconds ;; (define (server:logf-get-start-info logf) (let ((rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+)"))) ;; SERVER STARTED: host:port AT timesecs (handle-exceptions exn (list #f #f #f) ;; no idea what went wrong, call it a bad server (with-input-from-file logf (lambda () (let loop ((inl (read-line)) (lnum 0)) (if (not (eof-object? inl)) (let ((mlst (string-match rx inl))) (if (not mlst) (if (< lnum 500) ;; give up if more than 500 lines of server log read (loop (read-line)(+ lnum 1)) (list #f #f #f)) (let ((dat (cdr mlst))) (list (car dat) ;; host (string->number (cadr dat)) ;; port (string->number (caddr dat)))))) (list #f #f #f)))))))) ;; get a list of servers with all relevant data ;; ( mod-time host port start-time pid ) ;; (define (server:get-list areapath #!key (limit #f)) (let ((fname-rx (regexp "^(|.*/)server-(\\d+)-(\\S+).log$")) (day-seconds (* 24 60 60))) ;; if the directory exists continue to get the list ;; otherwise attempt to create the logs dir and then ;; continue (if (if (directory-exists? (conc areapath "/logs")) '() (if (file-write-access? areapath) (begin (condition-case (create-directory (conc areapath "/logs") #t) (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs"))) (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list."))) (directory-exists? (conc areapath "/logs"))) '())) (let* ((server-logs (glob (conc areapath "/logs/server-*.log"))) (num-serv-logs (length server-logs))) (if (null? server-logs) '() (let loop ((hed (car server-logs)) (tal (cdr server-logs)) (res '())) (let* ((mod-time (handle-exceptions exn (current-seconds) ;; 0 (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted (down-time (- (current-seconds) mod-time)) (serv-dat (if (or (< num-serv-logs 10) (< down-time day-seconds)) (server:logf-get-start-info hed) '())) ;; don't waste time processing server files not touched in the past day if there are more than ten servers to look at (serv-rec (cons mod-time serv-dat)) (fmatch (string-match fname-rx hed)) |
︙ | ︙ | |||
216 217 218 219 220 221 222 | ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off ;; and servers should stick around for about two hours or so. ;; (define (server:get-best srvlst) (let ((now (current-seconds))) (sort (filter (lambda (rec) | > > | | | | | | | | | | | > > > > > > > > > > > > | | | | | | | | | > | > > | > > | < < < < < < < < < | > > | > > | > > > | 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 | ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off ;; and servers should stick around for about two hours or so. ;; (define (server:get-best srvlst) (let ((now (current-seconds))) (sort (filter (lambda (rec) (if (and (list? rec) (> (length rec) 2)) (let ((start-time (list-ref rec 3)) (mod-time (list-ref rec 0))) ;; (print "start-time: " start-time " mod-time: " mod-time) (and start-time mod-time (> (- now start-time) 0) ;; been running at least 0 seconds (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds (< (- now start-time) (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) 180) (random 360))) ;; under one hour running time +/- 180 )) #f)) srvlst) (lambda (a b) (< (list-ref a 3) (list-ref b 3)))))) (define (server:get-first-best areapath) (let ((srvrs (server:get-best (server:get-list areapath)))) (if (and srvrs (not (null? srvrs))) (car srvrs) #f))) (define (server:get-rand-best areapath) (let ((srvrs (server:get-best (server:get-list areapath)))) (if (and (list? srvrs) (not (null? srvrs))) (let* ((len (length srvrs)) (idx (random len))) (list-ref srvrs idx)) #f))) (define (server:record->url servr) (match-let (((mod-time host port start-time pid) servr)) (if (and host port) (conc host ":" port) #f))) (define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) (set! *my-client-signature* sig) *my-client-signature*))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched (define (server:kind-run areapath) (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun (call-num (car last-run-dat)) (when-run (cadr last-run-dat)) (run-delay (+ (case call-num ((0) 0) ((1) 20) ((2) 300) (else 600)) (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously (lock-file (conc areapath "/logs/server-start.lock"))) (if (> (- (current-seconds) when-run) run-delay) (begin (common:simple-file-lock-and-wait lock-file expire-time: 15) (server:run areapath) (thread-sleep! 5) ;; don't release the lock for at least a few seconds (common:simple-file-release-lock lock-file))) (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) (let loop ((server-url (server:check-if-running areapath))) (if (or server-url (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. server-url (let ((num-ok (length (server:get-best (server:get-list areapath))))) (if (< num-ok 1) ;; if there are no decent candidates for servers then try starting a new one (server:kind-run areapath)) (thread-sleep! 5) (loop (server:check-if-running areapath))))))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath #!key (numservers "2")) (let* ((ns (string->number (or (configf:lookup *configdat* "server" "numservers") numservers))) (servers (server:get-best (server:get-list areapath)))) ;; (print "servers: " servers " ns: " ns) (if (or (and servers (null? servers)) (not servers) (and (list? servers) (< (length servers) (random ns)))) ;; somewhere between 0 and numservers #f (let loop ((hed (car servers)) (tal (cdr servers))) (let ((res (server:check-server hed))) (if res res (if (null? tal) |
︙ | ︙ |
Modified tests.scm from [bccd138868] to [897e386321].
︙ | ︙ | |||
484 485 486 487 488 489 490 | (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename)) (common:simple-file-release-lock lockf) (change-directory orig-dir) ;; NB// tests:test-set-toplog! is remote internal... (tests:test-set-toplog! run-id test-name outputfilename)) ;; didn't get the lock, check to see if current update started later than this ;; update, if so we can exit without doing any work | > > > | | 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 | (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename)) (common:simple-file-release-lock lockf) (change-directory orig-dir) ;; NB// tests:test-set-toplog! is remote internal... (tests:test-set-toplog! run-id test-name outputfilename)) ;; didn't get the lock, check to see if current update started later than this ;; update, if so we can exit without doing any work (if (> my-start-time (handle-exceptions exn 0 (file-modification-time lockf))) ;; we started since current re-gen in flight, delay a little and try again (begin (debug:print-info 1 *default-log-port* "Waiting to update " outputfilename ", another test currently updating it") (thread-sleep! (+ 5 (random 5))) ;; delay between 5 and 10 seconds (loop (common:simple-file-lock lockf)))))))))) (define (tests:generate-html-summary-for-iterated-test run-id test-id test-name outputfilename) |
︙ | ︙ |