1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
|
;; (car load-res))
;; cpu-load))
;; get values from cached info from dropping file in logs dir
;; e.g. key is host and dtype is normalized-load
;;
(define (common:get-cached-info key dtype #!key (age 5))
(let* ((fullpath (conc *toppath* "/logs/" key "-" dtype ".log")))
(if (and (file-exists? fullpath)
(file-read-access? fullpath))
(handle-exceptions
exn
#f
(debug:print 2 *default-log-port* "reading file " fullpath)
(let ((real-age (- (current-seconds)(file-change-time fullpath))))
(if (< real-age age)
(with-input-from-file fullpath read)
(begin
(debug:print 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds)to trust, skipping reading it")
#f))))
(begin
(debug:print 2 *default-log-port* "not reading file " fullpath)
#f))))
(define (common:write-cached-info key dtype dat)
(let* ((fullpath (conc *toppath* "/logs/" key "-" dtype ".log")))
(handle-exceptions
exn
#f
(with-output-to-file fullpath (lambda ()(pp dat))))))
;; get cpu load by reading from /proc/loadavg, return all three values
;;
|
|
|
>
|
>
|
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
|
;; (car load-res))
;; cpu-load))
;; get values from cached info from dropping file in logs dir
;; e.g. key is host and dtype is normalized-load
;;
(define (common:get-cached-info key dtype #!key (age 5))
(let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")))
(if (and (file-exists? fullpath)
(file-read-access? fullpath))
(handle-exceptions
exn
#f
(debug:print 2 *default-log-port* "reading file " fullpath)
(let ((real-age (- (current-seconds)(file-change-time fullpath))))
(if (< real-age age)
(with-input-from-file fullpath read)
(begin
(debug:print 1 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it")
#f))))
(begin
(debug:print 2 *default-log-port* "not reading file " fullpath)
#f))))
(define (common:write-cached-info key dtype dat)
(let* ((fulldir (conc *toppath* "/.sysdata"))
(fullpath (conc fulldir "/" key "-" dtype ".log")))
(if (not (file-exists? fulldir))(create-directory fulldir #t))
(handle-exceptions
exn
#f
(with-output-to-file fullpath (lambda ()(pp dat))))))
;; get cpu load by reading from /proc/loadavg, return all three values
;;
|
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
|
(else default))))
(define (common:get-normalized-cpu-load-raw remote-host)
(let* ((actual-host (or remote-host (get-host-name)))) ;; #f is localhost
(or (common:get-cached-info actual-host "normalized-load")
(let ((data (if remote-host
(with-input-from-pipe
(conc "ssh " remote-host " cat /proc/loadavg;cat /proc/cpuinfo;echo end")
read-lines)
(append
(with-input-from-file "/proc/loadavg"
read-lines)
(with-input-from-file "/proc/cpuinfo"
read-lines)
(list "end"))))
|
|
|
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
|
(else default))))
(define (common:get-normalized-cpu-load-raw remote-host)
(let* ((actual-host (or remote-host (get-host-name)))) ;; #f is localhost
(or (common:get-cached-info actual-host "normalized-load")
(let ((data (if remote-host
(with-input-from-pipe
(conc "ssh " remote-host " \"cat /proc/loadavg;cat /proc/cpuinfo;echo end\"")
read-lines)
(append
(with-input-from-file "/proc/loadavg"
read-lines)
(with-input-from-file "/proc/cpuinfo"
read-lines)
(list "end"))))
|
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
|
#f
(let loop ((hed (car data))
(tal (cdr data))
(loads #f)
(proc-num 0) ;; processor includes threads
(phys-num 0) ;; physical chip on motherboard
(core-num 0)) ;; core
;; (print hed ", " loads ", " proc-num ", " phys-num ", " core-num)
(if (null? tal) ;; have all our data, calculate normalized load and return result
(let* ((act-proc (+ proc-num 1))
(act-phys (+ phys-num 1))
(act-core (+ core-num 1))
(adj-proc-load (/ (car loads) act-proc))
(adj-core-load (/ (car loads) act-core))
(result
|
|
|
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
|
#f
(let loop ((hed (car data))
(tal (cdr data))
(loads #f)
(proc-num 0) ;; processor includes threads
(phys-num 0) ;; physical chip on motherboard
(core-num 0)) ;; core
;;; (print hed ", " loads ", " proc-num ", " phys-num ", " core-num)
(if (null? tal) ;; have all our data, calculate normalized load and return result
(let* ((act-proc (+ proc-num 1))
(act-phys (+ phys-num 1))
(act-core (+ core-num 1))
(adj-proc-load (/ (car loads) act-proc))
(adj-core-load (/ (car loads) act-core))
(result
|
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
|
(if new-best
(begin ;; found a host, return it
(debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
(host-last-used-set! rec curr-time)
new-best)
(if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))
(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f))
(let* ((loadavg (common:get-cpu-load remote-host))
(numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again
(common:get-num-cpus remote-host)
numcpus-in))
(maxload (if force-maxload
maxload-in
(max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
(first (car loadavg))
(next (cadr loadavg))
(adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
(loadjmp (- first next))
(adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)
(cond
((and (> first adjload)
(> count 0))
(debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
(thread-sleep! adjwait)
(common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
((and (> loadjmp numcpus)
(> count 0))
(debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg ""))
(thread-sleep! adjwait)
(common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)))))
(define (common:wait-for-homehost-load maxload msg)
(let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
#f
(common:get-homehost)))
(hh (if hh-dat (car hh-dat) #f))
(numcpus (common:get-num-cpus hh)))
(common:wait-for-normalized-load maxload msg hh)))
(define (common:get-num-cpus remote-host)
(let* ((actual-host (or remote-host (get-host-name))))
(or (common:get-cached-info actual-host "num-cpus" age: 86400) ;; hosts had better not be changing the number of cpus too often!
(let* ((proc (lambda ()
(let loop ((numcpu 0)
(inl (read-line)))
(if (eof-object? inl)
(begin
(common:write-cached-info remote-host "num-cpus" numcpu)
numcpu)
(loop (if (string-match "^processor\\s+:\\s+\\d+$" inl)
(+ numcpu 1)
numcpu)
(read-line))))))
(result (if remote-host
(with-input-from-pipe
(conc "ssh " remote-host " cat /proc/cpuinfo")
proc)
(with-input-from-file "/proc/cpuinfo" proc))))
(common:write-cached-info actual-host "num-cpus" result)
result))))
;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxload msg remote-host)
(let ((num-cpus (common:get-num-cpus remote-host)))
(common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host)))
(define (get-uname . params)
(let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
(uname #f))
(if (null? (car uname-res))
"unknown"
(caar uname-res))))
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
|
(if new-best
(begin ;; found a host, return it
(debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
(host-last-used-set! rec curr-time)
new-best)
(if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))
(define (common:wait-for-homehost-load maxload msg)
(let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
#f
(common:get-homehost)))
(hh (if hh-dat (car hh-dat) #f)))
(common:wait-for-normalized-load maxload msg hh)))
(define (common:get-num-cpus remote-host)
(let* ((actual-host (or remote-host (get-host-name))))
(or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) ;; hosts had better not be changing the number of cpus too often!
(let* ((proc (lambda ()
(let loop ((numcpu 0)
(inl (read-line)))
(if (eof-object? inl)
(begin
(common:write-cached-info actual-host "num-cpus" numcpu)
numcpu)
(loop (if (string-match "^processor\\s+:\\s+\\d+$" inl)
(+ numcpu 1)
numcpu)
(read-line))))))
(result (if remote-host
(with-input-from-pipe
(conc "ssh " remote-host " cat /proc/cpuinfo")
proc)
(with-input-from-file "/proc/cpuinfo" proc))))
(common:write-cached-info actual-host "num-cpus" result)
result))))
;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxload msg remote-host)
(let ((num-cpus (common:get-num-cpus remote-host)))
(common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host)))
;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
;;
(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f))
(let* ((loadavg (common:get-cpu-load remote-host))
(numcpus (if (<= 1 numcpus-in) ;; not possible to have zero. If we get 1, it's possible that we got the previous default, and we should check again
(common:get-num-cpus remote-host)
numcpus-in))
(maxload (if force-maxload
maxload-in
(max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
(first (car loadavg))
(next (cadr loadavg))
(adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
(loadjmp (- first next))
(adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)
(cond
((and (> first adjload)
(> count 0))
(debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
(thread-sleep! adjwait)
(common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
((and (> loadjmp numcpus)
(> count 0))
(debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg ""))
(thread-sleep! adjwait)
(common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)))))
(define (get-uname . params)
(let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
(uname #f))
(if (null? (car uname-res))
"unknown"
(caar uname-res))))
|