1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
|
(debug:print-error 0 *default-log-port* "command \"/bin/readlink -f " path "\" failed. exn=" exn)
path) ;; just give up
(with-input-from-pipe
(conc "/bin/readlink -f " path)
(lambda ()
(read-line)))))
(define (common:get-intercept onemin fivemin)
(let* ((load-change (- onemin fivemin))
(tchange (- 300 60)))
(max (+ onemin (* 60 (/ load-change tchange)))0))
)
(define (common:get-delay load-in numcpus)
(max (/ (expt 5 (* 4 (/ load-in numcpus))) 10) 0)
)
(define (get-cpu-load #!key (remote-host #f))
(car (common:get-cpu-load remote-host)))
;; (let* ((load-res (process:cmd-run->list "uptime"))
;; (load-rx (regexp "load average:\\s+(\\d+)"))
;; (cpu-load #f))
|
>
>
>
>
|
|
|
|
|
>
>
>
>
|
<
|
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
|
(debug:print-error 0 *default-log-port* "command \"/bin/readlink -f " path "\" failed. exn=" exn)
path) ;; just give up
(with-input-from-pipe
(conc "/bin/readlink -f " path)
(lambda ()
(read-line)))))
;; returns *effective load* (not normalized)
;;
(define (common:get-intercept onemin fivemin)
(if (< onemin fivemin) ;; load is decreasing, just use the onemin load
onemin
(let* ((load-change (- onemin fivemin))
(tchange (- 300 60)))
(max (+ onemin (* 60 (/ load-change tchange))) 0))))
;; calculate a delay number based on a droop curve
;; inputs are:
;; - load-in, load as from uptime, NOT normalized
;; - numcpus, number of cpus, ideally use the real cpus, not threads
;;
(define (common:get-delay load-in numcpus)
(max (/ (expt 5 (* 4 (/ load-in numcpus))) 10) 0))
(define (get-cpu-load #!key (remote-host #f))
(car (common:get-cpu-load remote-host)))
;; (let* ((load-res (process:cmd-run->list "uptime"))
;; (load-rx (regexp "load average:\\s+(\\d+)"))
;; (cpu-load #f))
|
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
|
(if new-best
(begin ;; found a host, return it
(debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
(host-last-used-set! rec curr-time)
new-best)
(if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))
(define (common:wait-for-homehost-load maxload msg)
(let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
#f
(common:get-homehost)))
(hh (if hh-dat (car hh-dat) #f)))
(common:wait-for-normalized-load maxload msg hh)))
(define (common:get-num-cpus remote-host)
(let* ((actual-host (or remote-host (get-host-name))))
(or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) ;; hosts had better not be changing the number of cpus too often!
(let* ((proc (lambda ()
(let loop ((numcpu 0)
(inl (read-line)))
|
|
|
|
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
|
(if new-best
(begin ;; found a host, return it
(debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
(host-last-used-set! rec curr-time)
new-best)
(if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))
(define (common:wait-for-homehost-load maxnormload msg)
(let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
#f
(common:get-homehost)))
(hh (if hh-dat (car hh-dat) #f)))
(common:wait-for-normalized-load maxnormload msg hh)))
(define (common:get-num-cpus remote-host)
(let* ((actual-host (or remote-host (get-host-name))))
(or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) ;; hosts had better not be changing the number of cpus too often!
(let* ((proc (lambda ()
(let loop ((numcpu 0)
(inl (read-line)))
|
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
|
(if (and (number? result)
(> result 0))
(common:write-cached-info actual-host "num-cpus" result))
result))))
;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxload msg remote-host #!optional (rem-tries 5))
(let ((num-cpus (common:get-num-cpus remote-host)))
(if num-cpus
(common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host)
(begin
(thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again
(if (> rem-tries 0)
(common:wait-for-normalized-load maxload msg remote-host (- rem-tries 1))
#f)))))
;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
;;
(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5))
(let* ((loadavg (common:get-cpu-load remote-host))
(numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again
(common:get-num-cpus remote-host)
numcpus-in))
(maxload (if force-maxload
maxload-in
(if (number? maxload-in)
(max maxload-in 0.5)
0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
(first (car loadavg))
(next (cadr loadavg))
(adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where
;; numcpus (or could be
;; maxload) is zero,
;; crude fallback is to
;; at least use 1
(loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next?
0
next))) ;; we will force a conservative calculation any time next is large.
(first-next-avg (/ (+ first next) 2))
;; add some randomness to the time to break any alignment
;; where netbatch dumps many jobs to machines simultaneously
(adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)
(/ (- 1000 count) 10)
waitdelay)
(- first adjmaxload) ))))
(load-jump-limit (configf:lookup-number *configdat* "setup" "load-jump-limit")))
;; let's let the user know once in a long while that load checking
;; is happening but not constantly report it
(if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time
(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp))
(cond
((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
(> num-tries 0))
(debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.")
(thread-sleep! 10)
(common:wait-for-cpuload maxload-in numcpus-in waitdelay
count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1)))
((and (> first adjmaxload)
(> count 0))
(debug:print-info 0 *default-log-port*
"Delaying " adjwait
" seconds due to load " first
" exceeding max of " adjmaxload
" on server " (or remote-host (get-host-name))
" (normalized load-limit: " maxload ") " (if msg msg ""))
(thread-sleep! adjwait)
(common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
((and (> loadjmp (cond
(load-jump-limit load-jump-limit)
((> numcpus 8)(/ numcpus 2))
((> numcpus 4)(/ numcpus 1.2))
(else 0.5)))
(> count 0))
(debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". "
(if msg msg ""))
(thread-sleep! adjwait)
(common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
(else
(if (> num-tries 0)
(if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost")))
(debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing."))
(debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing."))))))
(define (get-uname . params)
(let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
(uname #f))
(if (null? (car uname-res))
"unknown"
(caar uname-res))))
|
|
|
|
>
>
|
>
>
|
|
>
|
|
<
<
<
<
<
|
|
|
>
|
|
>
|
|
<
<
<
|
<
<
<
<
|
|
<
<
<
>
|
>
>
>
>
|
>
|
>
>
>
>
|
>
|
|
>
|
|
|
|
|
|
>
>
>
|
<
|
<
>
|
|
<
<
<
<
|
|
>
>
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
|
(if (and (number? result)
(> result 0))
(common:write-cached-info actual-host "num-cpus" result))
result))))
;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5))
(let ((num-cpus (common:get-num-cpus remote-host)))
(if num-cpus
(common:wait-for-cpuload maxnormload num-cpus 15 msg: msg remote-host: remote-host)
(begin
(thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again
(if (> rem-tries 0)
(common:wait-for-normalized-load maxnormload msg remote-host (- rem-tries 1))
#f)))))
;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
;; count - count down to zero, at some point we'd give up if the load never drops
;; num-tries - count down to zero number tries to get numcpus
;;
(define (common:wait-for-cpuload maxnormload numcpus-in
#!key (count 1000)
(msg #f)(remote-host #f)(num-tries 5))
(let* ((loadavg (common:get-cpu-load remote-host))
;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again
(numcpus (if (<= 1 numcpus-in)
(common:get-num-cpus remote-host)
numcpus-in))
(first (car loadavg))
(next (cadr loadavg))
(adjmaxload (* maxnormload (max 1 numcpus))) ;; possible bug
;; where numcpus
;; (or could be
;; maxload) is
;; zero, crude
;; fallback is to
;; at least use 1
;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit
;; etc.
(effective-load (common:get-intercept first next))
(recommended-delay (common:get-delay effective-load numcpus))
(effective-host (or remote-host "localhost"))
(normalized-effective-load (/ effective-load numcpus))
(will-wait (> normalized-effective-load maxnormload)))
(if (> recommended-delay 0)
(let* ((actual-delay (min recommended-delay 30)))
(debug:print-info 0 *default-log-port* "Load is high, delaying " actual-delay " seconds.")
(thread-sleep! actual-delay)))
(cond
;; bad data, try again to get the data
((not will-wait)
(if (common:low-noise-print 30 (conc (round normalized-effective-load) "-load-acceptable-" effective-host))
(debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing.")))
((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
(> num-tries 0))
(debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load "
first ", we'll sleep 10s and try " num-tries " more times.")
(thread-sleep! 10)
(common:wait-for-cpuload maxnormload numcpus-in
count: count remote-host: remote-host num-tries: (- num-tries 1)))
;; need to wait for load to drop
((and will-wait ;; (> first adjmaxload)
(> count 0))
(debug:print-info 0 *default-log-port*
"Delaying 15" ;; adjwait
" seconds due to normalized effective load " normalized-effective-load ;; first
" exceeding max of " adjmaxload
" on server " (or remote-host (get-host-name))
" (normalized load-limit: " maxnormload ") " (if msg msg ""))
(thread-sleep! 15) ;; adjwait)
(common:wait-for-cpuload maxnormload numcpus count: (- count 1) msg: msg remote-host: remote-host)
;; put the message here to indicate came out of waiting
(debug:print-info 1 *default-log-port*
"On host: " effective-host
", effective load: " effective-load
", numcpus: " numcpus
", normalized effective load: " normalized-effective-load
))
;; overloaded and count expired (i.e. went to zero)
(else
(if (> num-tries 0) ;; should be "num-tries-left".
(if (common:low-noise-print 30 (conc (round effective-load) "-load-acceptable-" effective-host))
(debug:print 0 *default-log-port* "Load on " effective-host " is acceptable at effective normalized load of "
effective-normalized-load " continuing."))
(debug:print 0 *default-log-port* "Load on " effective-host ", "
first" could not be retrieved. Giving up and continuing."))))))
;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
;;
;; (define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5))
;; (let* ((loadavg (common:get-cpu-load remote-host))
;; (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again
;; (common:get-num-cpus remote-host)
;; numcpus-in))
;; (maxload (if force-maxload
;; maxload-in
;; (if (number? maxload-in)
;; (max maxload-in 0.5)
;; 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
;; (first (car loadavg))
;; (next (cadr loadavg))
;; (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where
;; ;; numcpus (or could be
;; ;; maxload) is zero,
;; ;; crude fallback is to
;; ;; at least use 1
;; (loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next?
;; 0
;; next))) ;; we will force a conservative calculation any time next is large.
;; (first-next-avg (/ (+ first next) 2))
;; ;; add some randomness to the time to break any alignment
;; ;; where netbatch dumps many jobs to machines simultaneously
;; (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)
;; (/ (- 1000 count) 10)
;; waitdelay)
;; (- first adjmaxload) ))))
;; (load-jump-limit (configf:lookup-number *configdat* "setup" "load-jump-limit"))
;; ;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit
;; ;; etc.
;; (effective-load (common:get-intercept first next))
;; (effective-host (or remote-host "localhost"))
;; (normalized-effective-load (/ effective-load numcpus))
;; (will-wait (> normalized-effective-load maxload)))
;;
;; ;; let's let the user know once in a long while that load checking
;; ;; is happening but not constantly report it
;; #;(if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time
;; (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
;; ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp))
;;
;; (debug:print-info 1 *default-log-port*
;; "On host: " effective-host
;; ", effective load: " effective-load
;; ", numcpus: " numcpus
;; ", normalized effective load: " normalized-effective-load
;; )
;;
;; (cond
;; ;; bad data, try again to get the data
;; ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
;; (> num-tries 0))
;; (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.")
;; (thread-sleep! 10)
;; (common:wait-for-cpuload maxload-in numcpus-in waitdelay
;; count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1)))
;; ;; need to wait for load to drop
;; ((and will-wait ;; (> first adjmaxload)
;; (> count 0))
;; (debug:print-info 0 *default-log-port*
;; "Delaying " 15 ;; adjwait
;; " seconds due to normalized effective load " normalized-effective-load ;; first
;; " exceeding max of " adjmaxload
;; " on server " (or remote-host (get-host-name))
;; " (normalized load-limit: " maxload ") " (if msg msg ""))
;; (thread-sleep! 15) ;; adjwait)
;; (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
;; ((and (> loadjmp (cond
;; (load-jump-limit load-jump-limit)
;; ((> numcpus 8)(/ numcpus 2))
;; ((> numcpus 4)(/ numcpus 1.2))
;; (else 0.5)))
;; (> count 0))
;; (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". "
;; (if msg msg ""))
;; (thread-sleep! adjwait)
;; (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
;; (else
;; (if (> num-tries 0)
;; (if (common:low-noise-print 30 (conc (round first) "-load-acceptable-" (or remote-host "localhost")))
;; (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing."))
;; (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing."))))))
;;
(define (get-uname . params)
(let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
(uname #f))
(if (null? (car uname-res))
"unknown"
(caar uname-res))))
|