2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
|
;; at least use 1
(loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next?
0
next))) ;; we will force a conservative calculation any time next is large.
(first-next-avg (/ (+ first next) 2))
;; add some randomness to the time to break any alignment
;; where netbatch dumps many jobs to machines simultaneously
(adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)
(/ (- 1000 count) 10)
waitdelay)
(- first adjmaxload) )) )))
;; let's let the user know once in a long while that load checking
;; is happening but not constantly report it
(if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time
(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp))
(cond
((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
|
|
|
|
|
>
|
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
|
;; at least use 1
(loadjmp (- first (if (> next (* numcpus 0.7)) ;; could do something with average of first and next?
0
next))) ;; we will force a conservative calculation any time next is large.
(first-next-avg (/ (+ first next) 2))
;; add some randomness to the time to break any alignment
;; where netbatch dumps many jobs to machines simultaneously
(adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)
(/ (- 1000 count) 10)
waitdelay)
(- first adjmaxload) ))))
(load-jump-limit (configf:lookup-number *configdat* "setup" "load-jump-limit")))
;; let's let the user know once in a long while that load checking
;; is happening but not constantly report it
(if (common:low-noise-print 30 (conc "cpuload" (or remote-host "localhost"))) ;; (> (random 100) 75) ;; about 25% of the time
(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp))
(cond
((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
|
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
|
" seconds due to load " first
" exceeding max of " adjmaxload
" on server " (or remote-host (get-host-name))
" (normalized load-limit: " maxload ") " (if msg msg ""))
(thread-sleep! adjwait)
(common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
((and (> loadjmp (cond
((> numcpus 8)(/ numcpus 4))
((> numcpus 4)(/ numcpus 2))
(else 0.5)))
(> count 0))
(debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". "
(if msg msg ""))
(thread-sleep! adjwait)
(common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
(else
|
>
|
|
|
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
|
" seconds due to load " first
" exceeding max of " adjmaxload
" on server " (or remote-host (get-host-name))
" (normalized load-limit: " maxload ") " (if msg msg ""))
(thread-sleep! adjwait)
(common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
((and (> loadjmp (cond
(load-jump-limit load-jump-limit)
((> numcpus 8)(/ numcpus 2))
((> numcpus 4)(/ numcpus 1.2))
(else 0.5)))
(> count 0))
(debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to possible load jump " loadjmp ". "
(if msg msg ""))
(thread-sleep! adjwait)
(common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
(else
|