Megatest

Diff
Login

Differences From Artifact [4d4a2441c8]:

To Artifact [e0587bf118]:


482
483
484
485
486
487
488
489
490
491




492
493
494
495
496
497
498
482
483
484
485
486
487
488



489
490
491
492
493
494
495
496
497
498
499







-
-
-
+
+
+
+







	(max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age
    (if (not (directory-exists? "logs"))(create-directory "logs"))
    (directory-fold 
     (lambda (file rem)
       (handle-exceptions
	exn
	(begin
	  (debug:print-info 0 *default-log-port* "unable to rotate log " file ", probably handled by another process.")
	  (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
	  (print-call-chain (current-error-port)))
	  (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore.")
	  (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
	  ;; (print-call-chain (current-error-port)) ;; 
	  )
	(let* ((fullname  (conc "logs/" file))
	       (mod-time  (file-modification-time fullname))
	       (file-age  (- (current-seconds) mod-time)))
	  (hash-table-set! all-files file mod-time)
	  (if (or (and (string-match "^.*.log" file)
		       (> (file-size fullname) 200000))
		  (and (string-match "^server-.*.log" file)
1195
1196
1197
1198
1199
1200
1201
1202













1203
1204
1205
1206
1207
1208
1209
1196
1197
1198
1199
1200
1201
1202

1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222







-
+
+
+
+
+
+
+
+
+
+
+
+
+








;; use bash to expand a glob. Does NOT handle paths with spaces!
;;
(define (common:bash-glob instr)
  (string-split
   (with-input-from-pipe
       (conc "/bin/bash -c \"echo " instr "\"")
     read-line)))
       read-line)))

;;======================================================================
;; Some safety net stuff
;;======================================================================

;; return input if it is a list or return null
(define (common:list-or-null inlst #!key (ovrd #f)(message #f))
  (if (list? inlst)
      inlst
      (begin
	(if message (debug:print-error 0 *default-log-port* message))
	(or ovrd '()))))
  
;;======================================================================
;; T A R G E T S  ,   S T A T E ,   S T A T U S ,   
;;                    R U N N A M E    A N D   T E S T P A T T
;;======================================================================

;; (map print (map car (hash-table->alist (read-config "runconfigs.config" #f #t))))
1700
1701
1702
1703
1704
1705
1706
1707

1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720

1721
1722
1723
1724
1725
1726
1727
1713
1714
1715
1716
1717
1718
1719

1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732

1733
1734
1735
1736
1737
1738
1739
1740







-
+












-
+







;; 			    (set! cpu-load newval))))))
;; 	      (car load-res))
;;     cpu-load))

;; get values from cached info from dropping file in logs dir
;;  e.g. key is host and dtype is normalized-load
;;
(define (common:get-cached-info key dtype #!key (age 5))
(define (common:get-cached-info key dtype #!key (age 10))
  (if *toppath*
      (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")))
	(if (and (file-exists? fullpath)
		 (file-read-access? fullpath))
	    (handle-exceptions
	     exn
	     #f
	     (debug:print 2 *default-log-port* "reading file " fullpath)
	     (let ((real-age (- (current-seconds)(file-change-time fullpath)))) 
	       (if (< real-age age)
		   (with-input-from-file fullpath read)
		   (begin
		     (debug:print 1 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it")
		     (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it")
		     #f))))
	    (begin
	      (debug:print 2 *default-log-port* "not reading file " fullpath)
	      #f)))
      #f))
 
(define (common:write-cached-info key dtype dat)
1979
1980
1981
1982
1983
1984
1985


1986

1987
1988
1989
1990
1991
1992
1993
1992
1993
1994
1995
1996
1997
1998
1999
2000

2001
2002
2003
2004
2005
2006
2007
2008







+
+
-
+







					 numcpu)
				     (read-line))))))
	       (result (if remote-host
			   (with-input-from-pipe 
			       (conc "ssh " remote-host " cat /proc/cpuinfo")
			     proc)
			   (with-input-from-file "/proc/cpuinfo" proc))))
	  (if (and (number? result)
		   (> result 0))
	  (if (> result 0)(common:write-cached-info actual-host "num-cpus" result))
	      (common:write-cached-info actual-host "num-cpus" result))
	  result))))

;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxload msg remote-host #!optional (rem-tries 5))
  (let ((num-cpus (common:get-num-cpus remote-host)))
    (if num-cpus
2009
2010
2011
2012
2013
2014
2015


2016
2017


2018
2019
2020
2021
2022
2023
2024
2024
2025
2026
2027
2028
2029
2030
2031
2032


2033
2034
2035
2036
2037
2038
2039
2040
2041







+
+
-
-
+
+







		      maxload-in
		      (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
	 (loadjmp (- first next))
         (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) ))  )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
    ;; let's let the user know once in a long while that load checking is happening but not constantly report it
    (if (> (random 100) 75) ;; about 25% of the time
    (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
		      ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)
	(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
			  ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp))
    (cond
     ((and (> first adjload)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp numcpus)