Overview
Comment: | Reworked load handling and added fast gate on server starts |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65-cleanup |
Files: | files | file ages | folders |
SHA1: |
4556f5829a05a745de075aaf8703a852 |
User & Date: | matt on 2020-08-25 23:36:51 |
Other Links: | branch diff | manifest | tags |
Context
2020-08-26
| ||
15:56 | Tweaked server throttle. Bumped version. check-in: fc0f14e78b user: mrwellan tags: v1.65-cleanup, v1.6565 | |
2020-08-25
| ||
23:36 | Reworked load handling and added fast gate on server starts check-in: 4556f5829a user: matt tags: v1.65-cleanup | |
15:54 | Merged v1.65 with get-intercept and get-delay ==/8.7/1.0/WARN/1201/mars/== check-in: b0705815be user: mrwellan tags: v1.65-cleanup | |
Changes
Modified common.scm from [b89a44911d] to [cbb700a53d].
︙ | ︙ | |||
1748 1749 1750 1751 1752 1753 1754 1755 | (debug:print-error 0 *default-log-port* "command \"/bin/readlink -f " path "\" failed. exn=" exn) path) ;; just give up (with-input-from-pipe (conc "/bin/readlink -f " path) (lambda () (read-line))))) (define (common:get-intercept onemin fivemin) | > > > > | | | | | > > > > | < | 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 | (debug:print-error 0 *default-log-port* "command \"/bin/readlink -f " path "\" failed. exn=" exn) path) ;; just give up (with-input-from-pipe (conc "/bin/readlink -f " path) (lambda () (read-line))))) ;; returns *effective load* (not normalized) ;; (define (common:get-intercept onemin fivemin) (if (< onemin fivemin) ;; load is decreasing, just use the onemin load onemin (let* ((load-change (- onemin fivemin)) (tchange (- 300 60))) (max (+ onemin (* 60 (/ load-change tchange))) 0)))) ;; calculate a delay number based on a droop curve ;; inputs are: ;; - load-in, load as from uptime, NOT normalized ;; - numcpus, number of cpus, ideally use the real cpus, not threads ;; (define (common:get-delay load-in numcpus) (max (/ (expt 5 (* 4 (/ load-in numcpus))) 10) 0)) (define (get-cpu-load #!key (remote-host #f)) (car (common:get-cpu-load remote-host))) ;; (let* ((load-res (process:cmd-run->list "uptime")) ;; (load-rx (regexp "load average:\\s+(\\d+)")) ;; (cpu-load #f)) |
︙ | ︙ | |||
2060 2061 2062 2063 2064 2065 2066 | (if new-best (begin ;; found a host, return it (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) (host-last-used-set! rec curr-time) new-best) (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) | | | | 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 | (if new-best (begin ;; found a host, return it (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate) (host-last-used-set! rec curr-time) new-best) (if (null? tal) #f (loop (car tal)(cdr tal) best-host))))))))) (define (common:wait-for-homehost-load maxnormload msg) (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. #f (common:get-homehost))) (hh (if hh-dat (car hh-dat) #f))) (common:wait-for-normalized-load maxnormload msg hh))) (define (common:get-num-cpus remote-host) (let* ((actual-host (or remote-host (get-host-name)))) (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) ;; hosts had better not be changing the number of cpus too often! (let* ((proc (lambda () (let loop ((numcpu 0) (inl (read-line))) |
︙ | ︙ | |||
2093 2094 2095 2096 2097 2098 2099 | (if (and (number? result) (> result 0)) (common:write-cached-info actual-host "num-cpus" result)) result)))) ;; wait for normalized cpu load to drop below maxload ;; | | | | > > | > > | | > | | < < < < < | | | > | | > | | < < < | < < < < | | < < < > | > > > > | > | > > > > | > | | > | | | | | | > > > | < | < > | | < < < < | | > > | | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 | (if (and (number? result) (> result 0)) (common:write-cached-info actual-host "num-cpus" result)) result)))) ;; wait for normalized cpu load to drop below maxload ;; (define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5)) (let ((num-cpus (common:get-num-cpus remote-host))) (if num-cpus (common:wait-for-cpuload maxnormload num-cpus 15 msg: msg remote-host: remote-host) (begin (thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again (if (> rem-tries 0) (common:wait-for-normalized-load maxnormload msg remote-host (- rem-tries 1)) #f))))) ;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load ;; count - count down to zero, at some point we'd give up if the load never drops ;; num-tries - count down to zero number tries to get numcpus ;; (define (common:wait-for-cpuload maxnormload numcpus-in #!key (count 1000) (msg #f)(remote-host #f)(num-tries 5)) (let* ((loadavg (common:get-cpu-load remote-host)) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again (numcpus (if (<= 1 numcpus-in) (common:get-num-cpus remote-host) numcpus-in)) (first (car loadavg)) (next (cadr loadavg)) (adjmaxload (* maxnormload (max 1 numcpus))) ;; possible bug ;; where numcpus ;; (or could be ;; maxload) is ;; zero, crude ;; fallback is to ;; at least use 1 ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit ;; etc. (effective-load (common:get-intercept first next)) (recommended-delay (common:get-delay effective-load numcpus)) (effective-host (or remote-host "localhost")) (normalized-effective-load (/ effective-load numcpus)) (will-wait (> normalized-effective-load maxnormload))) (if (> recommended-delay 0) (let* ((actual-delay (min recommended-delay 30))) (debug:print-info 0 *default-log-port* "Load is high, delaying " actual-delay " seconds.") (thread-sleep! actual-delay))) (cond ;; bad data, try again to get the data ((not will-wait) (if (common:low-noise-print 30 (conc (round normalized-effective-load) "-load-acceptable-" effective-host)) (debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing."))) ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable (> num-tries 0)) (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.") (thread-sleep! 10) (common:wait-for-cpuload maxnormload numcpus-in count: count remote-host: remote-host num-tries: (- num-tries 1))) ;; need to wait for load to drop ((and will-wait ;; (> first adjmaxload) (> count 0)) (debug:print-info 0 *default-log-port* "Delaying 15" ;; adjwait " seconds due to normalized effective load " normalized-effective-load ;; first " exceeding max of " adjmaxload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxnormload ") " (if msg msg "")) (thread-sleep! 15) ;; adjwait) (common:wait-for-cpuload maxnormload numcpus count: (- count 1) msg: msg remote-host: remote-host) ;; put the message here to indicate came out of waiting (debug:print-info 1 *default-log-port* "On host: " effective-host ", effective load: " effective-load ", numcpus: " numcpus ", normalized effective load: " normalized-effective-load )) ;; overloaded and count expired (i.e. went to zero) (else (if (> num-tries 0) ;; should be "num-tries-left". (if (common:low-noise-print 30 (conc (round effective-load) "-load-acceptable-" effective-host)) (debug:print 0 *default-log-port* "Load on " effective-host " is acceptable at effective normalized load of " effective-normalized-load " continuing.")) (debug:print 0 *default-log-port* "Load on " effective-host ", " first" could not be retrieved. Giving up and continuing.")))))) ;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load ;; ;; (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5)) ;; (let* ((loadavg (common:get-cpu-load remote-host)) ;; (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again ;; (common:get-num-cpus remote-host) ;; numcpus-in)) ;; (maxload (if force-maxload ;; maxload-in ;; (if (number? maxload-in) ;; (max maxload-in 0.5) ;; 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? ;; (first (car loadavg)) ;; (next (cadr loadavg)) ;; (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where ;; ;; numcpus (or could be ;; ;; maxload) is zero, ;; ;; crude fallback is to ;; ;; at least use 1 ;; (loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next? ;; 0 ;; next))) ;; we will force a conservative calculation any time next is large. ;; (first-next-avg (/ (+ first next) 2)) ;; ;; add some randomness to the time to break any alignment ;; ;; where netbatch dumps many jobs to machines simultaneously ;; (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10) ;; (/ (- 1000 count) 10) ;; waitdelay) ;; (- first adjmaxload) )))) ;; (load-jump-limit (configf:lookup-number *configdat* "setup" "load-jump-limit")) ;; ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit ;; ;; etc. ;; (effective-load (common:get-intercept first next)) ;; (effective-host (or remote-host "localhost")) ;; (normalized-effective-load (/ effective-load numcpus)) ;; (will-wait (> normalized-effective-load maxload))) ;; ;; ;; let's let the user know once in a long while that load checking ;; ;; is happening but not constantly report it ;; #;(if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time ;; (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload ;; ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp)) ;; ;; (debug:print-info 1 *default-log-port* ;; "On host: " effective-host ;; ", effective load: " effective-load ;; ", numcpus: " numcpus ;; ", normalized effective load: " normalized-effective-load ;; ) ;; ;; (cond ;; ;; bad data, try again to get the data ;; ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable ;; (> num-tries 0)) ;; (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.") ;; (thread-sleep! 10) ;; (common:wait-for-cpuload maxload-in numcpus-in waitdelay ;; count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1))) ;; ;; need to wait for load to drop ;; ((and will-wait ;; (> first adjmaxload) ;; (> count 0)) ;; (debug:print-info 0 *default-log-port* ;; "Delaying " 15 ;; adjwait ;; " seconds due to normalized effective load " normalized-effective-load ;; first ;; " exceeding max of " adjmaxload ;; " on server " (or remote-host (get-host-name)) ;; " (normalized load-limit: " maxload ") " (if msg msg "")) ;; (thread-sleep! 15) ;; adjwait) ;; (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) ;; ((and (> loadjmp (cond ;; (load-jump-limit load-jump-limit) ;; ((> numcpus 8)(/ numcpus 2)) ;; ((> numcpus 4)(/ numcpus 1.2)) ;; (else 0.5))) ;; (> count 0)) ;; (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". " ;; (if msg msg "")) ;; (thread-sleep! adjwait) ;; (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) ;; (else ;; (if (> num-tries 0) ;; (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost"))) ;; (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing.")) ;; (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing.")))))) ;; (define (get-uname . params) (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params))))) (uname #f)) (if (null? (car uname-res)) "unknown" (caar uname-res)))) |
︙ | ︙ |
Modified runs.scm from [95c765665a] to [96a5410a91].
︙ | ︙ | |||
1219 1220 1221 1222 1223 1224 1225 | ;; jobtools maxload is useful for where the full Megatest run is done on one machine (if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues (if maxhomehostload | > | | 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 | ;; jobtools maxload is useful for where the full Megatest run is done on one machine (if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f)) ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues (if maxhomehostload (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))) (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry) (runs:incremental-print-results run-id) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) (if (or (not (null? tal))(not (null? reg))) |
︙ | ︙ | |||
1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 | ((not items) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-2") (debug:print-info 4 *default-log-port* "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running (let ((loop-list (runs:process-expanded-tests runsdat testdat))) (if loop-list (apply loop loop-list)))) ;; items processed into a list but not came in as a list been processed ;; ((and (list? items) ;; thus we know our items are already calculated | > > > > > > | 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 | ((not items) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-2") (debug:print-info 4 *default-log-port* "OUTER COND: (not items)") (if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) (not (null? tal))) (loop (car tal)(cdr tal) reg reruns)) (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; This would be a good place to block on homehost load (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running (let ((loop-list (runs:process-expanded-tests runsdat testdat))) (if loop-list (apply loop loop-list)))) ;; items processed into a list but not came in as a list been processed ;; ((and (list? items) ;; thus we know our items are already calculated |
︙ | ︙ |
Modified server.scm from [8058946e40] to [80f5c92415].
︙ | ︙ | |||
133 134 135 136 137 138 139 | " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") " -daemonize " "") ;; " -log " logfile " -m testsuite:" testsuite " " profile-mode )) ;; (conc " >> " logfile " 2>&1 &"))))) | | | | 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 | " -server " (or target-host "-") (if (equal? (configf:lookup *configdat* "server" "daemonize") "yes") " -daemonize " "") ;; " -log " logfile " -m testsuite:" testsuite " " profile-mode )) ;; (conc " >> " logfile " 2>&1 &"))))) (log-rotate (make-thread common:rotate-logs "server run, rotate logs thread")) ;; why are we rotating logs here? This is a sensitive location with a lot going on!? (load-limit (configf:lookup-number *configdat* "jobtools" "max-server-start-load" default: 3.0))) ;; we want the remote server to start in *toppath* so push there (push-directory areapath) (debug:print 0 *default-log-port* "INFO: Trying to start server (" cmdln ") ...") (thread-start! log-rotate) ;; host.domain.tld match host? (if (and target-host ;; look at target host, is it host.domain.tld or ip address and does it ;; match current ip or hostname (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) (not (equal? curr-ip target-host))) (begin (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) (setenv "TARGETHOST" target-host))) (setenv "TARGETHOST_LOGF" logfile) (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever #;(common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit)) (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory))) ;; given a path to a server log return: host port startseconds |
︙ | ︙ | |||
315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 | #f))) (define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) (set! *my-client-signature* sig) *my-client-signature*))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched (define (server:kind-run areapath) (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun (call-num (car last-run-dat)) (when-run (cadr last-run-dat)) (run-delay (+ (case call-num ((0) 0) ((1) 20) ((2) 300) (else 600)) (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously (lock-file (conc areapath "/logs/server-start.lock"))) (if (> (- (current-seconds) when-run) run-delay) (begin (common:simple-file-lock-and-wait lock-file expire-time: 15) (server:run areapath) (thread-sleep! 2) ;; don't release the lock for at least a few seconds (common:simple-file-release-lock lock-file))) (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) (let loop ((server-url (server:check-if-running areapath)) (try-num 0)) (if (or server-url (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. server-url | > > > > > > > > > > > > > > > > > > > > | 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 | #f))) (define (server:get-client-signature) ;; BB> why is this proc named "get-"? it returns nothing -- set! has not return value. (if *my-client-signature* *my-client-signature* (let ((sig (server:mk-signature))) (set! *my-client-signature* sig) *my-client-signature*))) ;; wait for server=start-last to be three seconds old ;; (define (server:wait-for-server-start-last-flag areapath) (let* ((start-flag (conc areapath "/logs/server-start-last"))) (if (file-exists? start-flag) (let* ((fmodtime (file-modification-time start-flag))) (if (> (- (current-seconds) fmodtime) 3) ;; good enough (system (conc "touch " start-flag)) ;; lazy but safe (begin (thread-sleep! 5) (server:wait-for-server-start-last-flag areapath)))) (system (conc "touch " start-flag))))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least 3 seconds old (server:wait-for-server-start-last-flag areapath) (if (not (server:check-if-running areapath)) ;; why try if there is already a server running? (let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun (call-num (car last-run-dat)) (when-run (cadr last-run-dat)) (run-delay (+ (case call-num ((0) 0) ((1) 20) ((2) 300) (else 600)) (random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously (lock-file (conc areapath "/logs/server-start.lock"))) (if (> (- (current-seconds) when-run) run-delay) (begin (common:simple-file-lock-and-wait lock-file expire-time: 15) (server:run areapath) (thread-sleep! 2) ;; don't release the lock for at least a few seconds (common:simple-file-release-lock lock-file))) (hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds)))))) ;; this one seems to be the general entry point ;; (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) (let loop ((server-url (server:check-if-running areapath)) (try-num 0)) (if (or server-url (> (current-seconds) give-up-time)) ;; server-url will be #f if no server available. server-url |
︙ | ︙ |