︙ | | | ︙ | |
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
|
(max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age
(if (not (directory-exists? "logs"))(create-directory "logs"))
(directory-fold
(lambda (file rem)
(handle-exceptions
exn
(begin
(debug:print-info 0 *default-log-port* "unable to rotate log " file ", probably handled by another process.")
(debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
(print-call-chain (current-error-port)))
(let* ((fullname (conc "logs/" file))
(mod-time (file-modification-time fullname))
(file-age (- (current-seconds) mod-time)))
(hash-table-set! all-files file mod-time)
(if (or (and (string-match "^.*.log" file)
(> (file-size fullname) 200000))
(and (string-match "^server-.*.log" file)
|
|
|
|
>
|
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
|
(max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age
(if (not (directory-exists? "logs"))(create-directory "logs"))
(directory-fold
(lambda (file rem)
(handle-exceptions
exn
(begin
(debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore.")
(debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
;; (print-call-chain (current-error-port)) ;;
)
(let* ((fullname (conc "logs/" file))
(mod-time (file-modification-time fullname))
(file-age (- (current-seconds) mod-time)))
(hash-table-set! all-files file mod-time)
(if (or (and (string-match "^.*.log" file)
(> (file-size fullname) 200000))
(and (string-match "^server-.*.log" file)
|
︙ | | | ︙ | |
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
|
;; (set! cpu-load newval))))))
;; (car load-res))
;; cpu-load))
;; get values from cached info from dropping file in logs dir
;; e.g. key is host and dtype is normalized-load
;;
(define (common:get-cached-info key dtype #!key (age 5))
(if *toppath*
(let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")))
(if (and (file-exists? fullpath)
(file-read-access? fullpath))
(handle-exceptions
exn
#f
(debug:print 2 *default-log-port* "reading file " fullpath)
(let ((real-age (- (current-seconds)(file-change-time fullpath))))
(if (< real-age age)
(with-input-from-file fullpath read)
(begin
(debug:print 1 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it")
#f))))
(begin
(debug:print 2 *default-log-port* "not reading file " fullpath)
#f)))
#f))
(define (common:write-cached-info key dtype dat)
|
|
|
|
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
|
;; (set! cpu-load newval))))))
;; (car load-res))
;; cpu-load))
;; get values from cached info from dropping file in logs dir
;; e.g. key is host and dtype is normalized-load
;;
(define (common:get-cached-info key dtype #!key (age 10))
(if *toppath*
(let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")))
(if (and (file-exists? fullpath)
(file-read-access? fullpath))
(handle-exceptions
exn
#f
(debug:print 2 *default-log-port* "reading file " fullpath)
(let ((real-age (- (current-seconds)(file-change-time fullpath))))
(if (< real-age age)
(with-input-from-file fullpath read)
(begin
(debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it")
#f))))
(begin
(debug:print 2 *default-log-port* "not reading file " fullpath)
#f)))
#f))
(define (common:write-cached-info key dtype dat)
|
︙ | | | ︙ | |
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
|
numcpu)
(read-line))))))
(result (if remote-host
(with-input-from-pipe
(conc "ssh " remote-host " cat /proc/cpuinfo")
proc)
(with-input-from-file "/proc/cpuinfo" proc))))
(if (> result 0)(common:write-cached-info actual-host "num-cpus" result))
result))))
;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxload msg remote-host #!optional (rem-tries 5))
(let ((num-cpus (common:get-num-cpus remote-host)))
(if num-cpus
|
>
>
|
|
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
|
numcpu)
(read-line))))))
(result (if remote-host
(with-input-from-pipe
(conc "ssh " remote-host " cat /proc/cpuinfo")
proc)
(with-input-from-file "/proc/cpuinfo" proc))))
(if (and (number? result)
(> result 0))
(common:write-cached-info actual-host "num-cpus" result))
result))))
;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxload msg remote-host #!optional (rem-tries 5))
(let ((num-cpus (common:get-num-cpus remote-host)))
(if num-cpus
|
︙ | | | ︙ | |
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
|
maxload-in
(max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
(first (car loadavg))
(next (cadr loadavg))
(adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
(loadjmp (- first next))
(adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)
(cond
((and (> first adjload)
(> count 0))
(debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
(thread-sleep! adjwait)
(common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
((and (> loadjmp numcpus)
|
>
>
|
|
|
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
|
maxload-in
(max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
(first (car loadavg))
(next (cadr loadavg))
(adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
(loadjmp (- first next))
(adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
;; let's let the user know once in a long while that load checking is happening but not constantly report it
(if (> (random 100) 75) ;; about 25% of the time
(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
", load: " first ", adjload: " adjload ", loadjmp: " loadjmp))
(cond
((and (> first adjload)
(> count 0))
(debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
(thread-sleep! adjwait)
(common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
((and (> loadjmp numcpus)
|
︙ | | | ︙ | |