Megatest

Diff
Login

Differences From Artifact [4d4a2441c8]:

To Artifact [7eb16cae49]:


18
19
20
21
22
23
24
25

26
27

28
29
30
31


32

33
34
35
36
37
38
39
18
19
20
21
22
23
24

25
26

27
28
29
30
31
32
33

34
35
36
37
38
39
40
41







-
+

-
+




+
+
-
+








;;======================================================================

(use srfi-1 data-structures posix regex-case (prefix base64 base64:)
     format dot-locking csv-xml z3 udp ;; sql-de-lite
     hostinfo md5 message-digest typed-records directory-utils stack
     matchable regex posix (srfi 18) extras ;; tcp 
     (prefix nanomsg nmsg:)
     ;; (prefix nanomsg nmsg:)
     (prefix sqlite3 sqlite3:)
     pkts (prefix dbi dbi:)
     (prefix dbi dbi:)
     )

(declare (unit common))
(declare (uses commonmod))
(import (prefix commonmod cmod:))

(import commonmod)
(import pkts)

(include "common_records.scm")


;; (require-library margs)
;; (include "margs.scm")

482
483
484
485
486
487
488
489
490
491




492
493
494
495
496
497
498
484
485
486
487
488
489
490



491
492
493
494
495
496
497
498
499
500
501







-
-
-
+
+
+
+







	(max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age
    (if (not (directory-exists? "logs"))(create-directory "logs"))
    (directory-fold 
     (lambda (file rem)
       (handle-exceptions
	exn
	(begin
	  (debug:print-info 0 *default-log-port* "unable to rotate log " file ", probably handled by another process.")
	  (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
	  (print-call-chain (current-error-port)))
	  (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore.")
	  (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
	  ;; (print-call-chain (current-error-port)) ;; 
	  )
	(let* ((fullname  (conc "logs/" file))
	       (mod-time  (file-modification-time fullname))
	       (file-age  (- (current-seconds) mod-time)))
	  (hash-table-set! all-files file mod-time)
	  (if (or (and (string-match "^.*.log" file)
		       (> (file-size fullname) 200000))
		  (and (string-match "^server-.*.log" file)
871
872
873
874
875
876
877
878
879
880
881

882
883
884
885
886
887
888
889
890
874
875
876
877
878
879
880




881


882
883
884
885
886
887
888







-
-
-
-
+
-
-







    (if val val default)))

(define (assoc/default key lst . default)
  (let ((res (assoc key lst)))
    (if res (cadr res)(if (null? default) #f (car default)))))

(define (common:get-testsuite-name)
  (or (configf:lookup *configdat* "setup" "area-name") ;; megatest is a flexible tool, testsuite is too limiting a description.
      (configf:lookup *configdat* "setup" "testsuite" )
      (getenv "MT_TESTSUITE_NAME")
      (if (string? *toppath* )
  (cmod:get-testsuite-name *toppath* *configdat*))
          (pathname-file *toppath*)
          #f))) ;; (pathname-file (current-directory)))))

;; safe getting of toppath
(define (common:get-toppath areapath)
  (or *toppath*
      (if areapath
	  (begin
	    (set! *toppath* areapath)
1195
1196
1197
1198
1199
1200
1201
1202













1203
1204
1205
1206
1207
1208
1209
1193
1194
1195
1196
1197
1198
1199

1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219







-
+
+
+
+
+
+
+
+
+
+
+
+
+








;; use bash to expand a glob. Does NOT handle paths with spaces!
;;
(define (common:bash-glob instr)
  (string-split
   (with-input-from-pipe
       (conc "/bin/bash -c \"echo " instr "\"")
     read-line)))
       read-line)))

;;======================================================================
;; Some safety net stuff
;;======================================================================

;; return input if it is a list or return null
(define (common:list-or-null inlst #!key (ovrd #f)(message #f))
  (if (list? inlst)
      inlst
      (begin
	(if message (debug:print-error 0 *default-log-port* message))
	(or ovrd '()))))
  
;;======================================================================
;; T A R G E T S  ,   S T A T E ,   S T A T U S ,   
;;                    R U N N A M E    A N D   T E S T P A T T
;;======================================================================

;; (map print (map car (hash-table->alist (read-config "runconfigs.config" #f #t))))
1614
1615
1616
1617
1618
1619
1620
1621
1622

1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636

1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1624
1625
1626
1627
1628
1629
1630


1631
1632
1633
1634
1635
1636









1637



1638
1639
1640
1641
1642
1643
1644







-
-
+





-
-
-
-
-
-
-
-
-
+
-
-
-







		    (if (> curr-rownum rownum) curr-rownum rownum)
		    (if (> curr-colnum colnum) curr-colnum colnum)
		    ))))))

;; if it looks like a number -> convert it to a number, else return it
;;
(define (common:lazy-convert inval)
  (let* ((as-num (if (string? inval)(string->number inval) #f)))
    (or as-num inval)))
  (cmod:lazy-convert inval))

;; convert string a=1; b=2; c=a silly thing; d=
;; to '((a . 1)(b . 2)(c . "a silly thing")(d . ""))
;;
(define (common:val->alist val #!key (convert #f))
  (let ((val-list (string-split-fields ";\\s*" val #:infix)))
    (if val-list
	(map (lambda (x)
	       (let ((f (string-split-fields "\\s*=\\s*" x #:infix)))
		 (case (length f)
		   ((0) `(,#f))  ;; null string case
		   ((1) `(,(string->symbol (car f))))
		   ((2) `(,(string->symbol (car f)) . ,(let ((inval (cadr f)))
							 (if convert (common:lazy-convert inval) inval))))
  (cmod:val->alist val #!key (convert #f)))
		   (else f))))
	     val-list)
	'())))

;;======================================================================
;; S Y S T E M   S T U F F
;;======================================================================

;; lazy-safe get file mod time. on any error (file not existing etc.) return 0
;;
1700
1701
1702
1703
1704
1705
1706
1707

1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720

1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737









1738
1739
1740
1741
1742
1743
1744
1698
1699
1700
1701
1702
1703
1704

1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717

1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734

1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750







-
+












-
+
















-
+
+
+
+
+
+
+
+
+







;; 			    (set! cpu-load newval))))))
;; 	      (car load-res))
;;     cpu-load))

;; get values from cached info from dropping file in logs dir
;;  e.g. key is host and dtype is normalized-load
;;
(define (common:get-cached-info key dtype #!key (age 5))
(define (common:get-cached-info key dtype #!key (age 10))
  (if *toppath*
      (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")))
	(if (and (file-exists? fullpath)
		 (file-read-access? fullpath))
	    (handle-exceptions
	     exn
	     #f
	     (debug:print 2 *default-log-port* "reading file " fullpath)
	     (let ((real-age (- (current-seconds)(file-change-time fullpath)))) 
	       (if (< real-age age)
		   (with-input-from-file fullpath read)
		   (begin
		     (debug:print 1 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it")
		     (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it")
		     #f))))
	    (begin
	      (debug:print 2 *default-log-port* "not reading file " fullpath)
	      #f)))
      #f))
 
(define (common:write-cached-info key dtype dat)
  (if *toppath*
      (let* ((fulldir  (conc *toppath* "/.sysdata"))
	     (fullpath (conc fulldir "/" key "-" dtype ".log")))
	(if (not (file-exists? fulldir))(create-directory fulldir #t))
	(handle-exceptions
	 exn
	 #f
	 (with-output-to-file fullpath (lambda ()(pp dat)))))
      #f))
  

(define (common:raw-get-remote-host-load remote-host)
  (handle-exceptions
   exn
   #f ;; more specific handling of errors needed
   (with-input-from-pipe 
    (conc "ssh " remote-host " cat /proc/loadavg")
    (lambda ()(list (read)(read)(read))))))

;; get cpu load by reading from /proc/loadavg, return all three values
;;
(define (common:get-cpu-load remote-host)
  (handle-exceptions
   exn
   '(99 99 99)
   (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
1979
1980
1981
1982
1983
1984
1985


1986

1987
1988
1989
1990
1991
1992
1993
1985
1986
1987
1988
1989
1990
1991
1992
1993

1994
1995
1996
1997
1998
1999
2000
2001







+
+
-
+







					 numcpu)
				     (read-line))))))
	       (result (if remote-host
			   (with-input-from-pipe 
			       (conc "ssh " remote-host " cat /proc/cpuinfo")
			     proc)
			   (with-input-from-file "/proc/cpuinfo" proc))))
	  (if (and (number? result)
		   (> result 0))
	  (if (> result 0)(common:write-cached-info actual-host "num-cpus" result))
	      (common:write-cached-info actual-host "num-cpus" result))
	  result))))

;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxload msg remote-host #!optional (rem-tries 5))
  (let ((num-cpus (common:get-num-cpus remote-host)))
    (if num-cpus
2009
2010
2011
2012
2013
2014
2015


2016
2017


2018
2019
2020
2021
2022
2023
2024
2017
2018
2019
2020
2021
2022
2023
2024
2025


2026
2027
2028
2029
2030
2031
2032
2033
2034







+
+
-
-
+
+







		      maxload-in
		      (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
	 (loadjmp (- first next))
         (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) ))  )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
    ;; let's let the user know once in a long while that load checking is happening but not constantly report it
    (if (> (random 100) 75) ;; about 25% of the time
    (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
		      ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)
	(debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
			  ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp))
    (cond
     ((and (> first adjload)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp numcpus)