Megatest

Diff
Login

Differences From Artifact [7eb16cae49]:

To Artifact [e484b86e7d]:


226
227
228
229
230
231
232






















233
234
235
236
237
238
239
            (caddr argv))
           (else (car argv))))
         (fullpath (realpath this-script)))
    fullpath))
(define *common:this-exe-fullpath* (common:get-this-exe-fullpath))
(define *common:this-exe-dir*      (pathname-directory *common:this-exe-fullpath*))
(define *common:this-exe-name*     (pathname-strip-directory *common:this-exe-fullpath*))























(define *common:logpro-exit-code->status-sym-alist*
  '( ( 0 . pass )
     ( 1 . fail )
     ( 2 . warn )
     ( 3 . check )
     ( 4 . waived )







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
            (caddr argv))
           (else (car argv))))
         (fullpath (realpath this-script)))
    fullpath))
(define *common:this-exe-fullpath* (common:get-this-exe-fullpath))
(define *common:this-exe-dir*      (pathname-directory *common:this-exe-fullpath*))
(define *common:this-exe-name*     (pathname-strip-directory *common:this-exe-fullpath*))

;; when called from a wrapper I need sometimes to find the calling
;; wrapper, this is for dashboard to find the correct megatest.
;;
(define (common:find-local-megatest #!optional (progname "megatest"))
  (let ((res (filter file-exists?
		     (map (lambda (updir)
			    (let* ((lm  (car (argv)))
				   (dir (pathname-directory lm))
				   (exe (pathname-strip-directory lm)))
			      (conc (if dir (conc dir "/") "")
				    (case (string->symbol exe)
				      ((dboard)    (conc updir progname))
				      ((mtest)     (conc updir progname))
				      ((dashboard) progname)
				      (else exe)))))
			  '("../../" "../")))))
    (if (null? res)
	(begin
	  (debug:print 0 *current-log-port* "Failed to find this executable! Using what can be found on the path")
	  progname)
	(car res))))

(define *common:logpro-exit-code->status-sym-alist*
  '( ( 0 . pass )
     ( 1 . fail )
     ( 2 . warn )
     ( 3 . check )
     ( 4 . waived )
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759







1760
1761


1762
1763
1764
1765
1766
1767
1768
    (lambda ()(list (read)(read)(read))))))

;; get cpu load by reading from /proc/loadavg, return all three values
;;
(define (common:get-cpu-load remote-host)
  (handle-exceptions
   exn
   '(99 99 99)
   (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
     (or (common:get-cached-info actual-hostname "cpu-load")
	 (let ((result (if remote-host
			   (map (lambda (res)
				  (if (eof-object? res) 9e99 res))
			        (with-input-from-pipe 
				 (conc "ssh " remote-host " cat /proc/loadavg")
				 (lambda ()(list (read)(read)(read)))))
			   (with-input-from-file "/proc/loadavg" 
			     (lambda ()(list (read)(read)(read)))))))







	   (common:write-cached-info actual-hostname "cpu-load" result)
	                     result)))))



;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads
;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc.
;;  keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load
;;
(define (common:get-normalized-cpu-load remote-host)
  (let ((res (common:get-normalized-cpu-load-raw remote-host))







|










>
>
>
>
>
>
>
|
|
>
>







1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
    (lambda ()(list (read)(read)(read))))))

;; get cpu load by reading from /proc/loadavg, return all three values
;;
(define (common:get-cpu-load remote-host)
  (handle-exceptions
   exn
   '(-99 -99 -99)
   (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
     (or (common:get-cached-info actual-hostname "cpu-load")
	 (let ((result (if remote-host
			   (map (lambda (res)
				  (if (eof-object? res) 9e99 res))
			        (with-input-from-pipe 
				 (conc "ssh " remote-host " cat /proc/loadavg")
				 (lambda ()(list (read)(read)(read)))))
			   (with-input-from-file "/proc/loadavg" 
			     (lambda ()(list (read)(read)(read)))))))
	   (match
	    result
	    ((l1 l2 l3)
	     (if (and (number? l1)
		      (number? l2)
		      (number? l3))
		 (begin
		   (common:write-cached-info actual-hostname "cpu-load" result)
		   result)
		 '(-1 -1 -1))) ;; -1 is bad result
	    (else '(-2 -2 -2))))))))

;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads
;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc.
;;  keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load
;;
(define (common:get-normalized-cpu-load remote-host)
  (let ((res (common:get-normalized-cpu-load-raw remote-host))
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021




2022


2023



2024

2025
2026
2027
2028






2029
2030
2031





2032
2033
2034
2035
2036
2037
2038




2039
2040
2041
2042
2043
2044
2045
	  (thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again
	  (if (> rem-tries 0)
	      (common:wait-for-normalized-load maxload msg remote-host (- rem-tries 1))
	      #f)))))

;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
;;
(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f))
  (let* ((loadavg (common:get-cpu-load remote-host))
	 (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
		      (common:get-num-cpus remote-host)
		      numcpus-in))
	 (maxload (if force-maxload
		      maxload-in
		      (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1




	 (loadjmp (- first next))


         (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) ))  )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously



    ;; let's let the user know once in a long while that load checking is happening but not constantly report it

    (if (> (random 100) 75) ;; about 25% of the time
	(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
			  ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp))
    (cond






     ((and (> first adjload)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))





      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp numcpus)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)))))





(define (get-uname . params)
  (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
	 (uname #f))
    (if (null? (car uname-res))
	"unknown"
	(caar uname-res))))







|









|
>
>
>
>

>
>
|
>
>
>
|
>


|

>
>
>
>
>
>
|

|
>
>
>
>
>






|
>
>
>
>







2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
	  (thread-sleep! (random 60)) ;; we failed to get num cpus. wait a bit and try again
	  (if (> rem-tries 0)
	      (common:wait-for-normalized-load maxload msg remote-host (- rem-tries 1))
	      #f)))))

;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
;;
(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f)(num-tries 5))
  (let* ((loadavg (common:get-cpu-load remote-host))
	 (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
		      (common:get-num-cpus remote-host)
		      numcpus-in))
	 (maxload (if force-maxload
		      maxload-in
		      (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjmaxload (* maxload (max 1 numcpus))) ;; possible bug where
					       ;; numcpus (or could be
					       ;; maxload) is zero,
					       ;; crude fallback is to
					       ;; at least use 1
	 (loadjmp (- first next))
	 ;; add some randomness to the time to break any alignment
	 ;; where netbatch dumps many jobs to machines simultaneously
         (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)
						      (/ (- 1000 count) 10)
						      waitdelay)
						   (- first adjmaxload) ))  )))
    ;; let's let the user know once in a long while that load checking
    ;; is happening but not constantly report it
    (if (> (random 100) 75) ;; about 25% of the time
	(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
			  ", load: " first ", adjmaxload: " adjmaxload ", loadjmp: " loadjmp))
    (cond
     ((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
	   (> num-tries 0))
      (debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load " first ", we'll sleep 10s and try " num-tries " more times.")
      (thread-sleep! 10)
      (common:wait-for-cpuload maxload-in numcpus-in waitdelay
			       count: count remote-host: remote-host force-maxload: force-maxload num-tries: (- num-tries 1)))
     ((and (> first adjmaxload)
	   (> count 0))
      (debug:print-info 0 *default-log-port*
			"server start delayed " adjwait
			" seconds due to load " first
			" exceeding max of " adjmaxload
			" on server " (or remote-host (get-host-name))
			" (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp numcpus)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     (else
      (if (> num-tries 0)
	  (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") " is acceptable at " first " continuing.")
	  (debug:print 0 *default-log-port* "Load on " (or remote-host "localhost") ", "first" could not be retrieved. Giving up and continuing."))))))

(define (get-uname . params)
  (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
	 (uname #f))
    (if (null? (car uname-res))
	"unknown"
	(caar uname-res))))