Megatest

Diff
Login

Differences From Artifact [7382d07655]:

To Artifact [8b5ebebcbe]:


57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
                                              (print-error-message exn) ))))
     (debug:print-info 0 *default-log-port* "    -- continuing after nonfatal condition...")
     #f)
   (thunk)))

(define getenv get-environment-variable)
(define (safe-setenv key val)
  (if (substring-index ":" key) ;; variables containing : are for internal use and cannot be environment variables.
      (debug:print-error 4 *default-log-port* "skip setting internal use only variables containing \":\"")
      (if (and (string? val)
	       (string? key))
	  (handle-exceptions
	      exn
	      (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val)
	    (setenv key val))
	  (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val))))







|
|







57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
                                              (print-error-message exn) ))))
     (debug:print-info 0 *default-log-port* "    -- continuing after nonfatal condition...")
     #f)
   (thunk)))

(define getenv get-environment-variable)
(define (safe-setenv key val)
  (if (or (substring-index "!" key) (substring-index ":" key)) ;; variables containing : are for internal use and cannot be environment variables.
      (debug:print-error 4 *default-log-port* "skip setting internal use only variables containing \":\" or starting with \"!\"")
      (if (and (string? val)
	       (string? key))
	  (handle-exceptions
	      exn
	      (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val)
	    (setenv key val))
	  (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val))))
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527



















1528
1529
1530
1531
1532
1533
1534
1535
     exn
     #f
     (with-output-to-file fullpath (lambda ()(pp dat))))))

;; get cpu load by reading from /proc/loadavg, return all three values
;;
(define (common:get-cpu-load remote-host)
  (let* ((actual-hostname (or remote-host (get-host-name))))
    (or (common:get-cached-info actual-hostname "cpu-load")
	(let ((result (if remote-host
			  (map (lambda (res)
				 (if (eof-object? res) 9e99 res))
			       (with-input-from-pipe 
				   (conc "ssh " remote-host " cat /proc/loadavg")
				 (lambda ()(list (read)(read)(read)))))
			  (with-input-from-file "/proc/loadavg" 
			    (lambda ()(list (read)(read)(read)))))))
	  (common:write-cached-info actual-hostname "cpu-load" result)
	  result))))

;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads
;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc.
;;  keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load
;;
(define (common:get-normalized-cpu-load remote-host)



















  (let ((actual-host (or remote-host (get-host-name)))) ;; #f is localhost
    (or (common:get-cached-info actual-host "normalized-load")
	(let ((data (if remote-host
			(with-input-from-pipe 
			    (conc "ssh " remote-host " cat /proc/loadavg;cat /proc/cpuinfo;echo end")
			  read-lines)
			(append 
			 (with-input-from-file "/proc/loadavg" 







|

















>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|







1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
     exn
     #f
     (with-output-to-file fullpath (lambda ()(pp dat))))))

;; get cpu load by reading from /proc/loadavg, return all three values
;;
(define (common:get-cpu-load remote-host)
  (let* ((actual-hostname (or remote-host (get-host-name) "localhost")))
    (or (common:get-cached-info actual-hostname "cpu-load")
	(let ((result (if remote-host
			  (map (lambda (res)
				 (if (eof-object? res) 9e99 res))
			       (with-input-from-pipe 
				   (conc "ssh " remote-host " cat /proc/loadavg")
				 (lambda ()(list (read)(read)(read)))))
			  (with-input-from-file "/proc/loadavg" 
			    (lambda ()(list (read)(read)(read)))))))
	  (common:write-cached-info actual-hostname "cpu-load" result)
	  result))))

;; get normalized cpu load by reading from /proc/loadavg and /proc/cpuinfo return all three values and the number of real cpus and the number of threads
;; returns alist '((adj-cpu-load . normalized-proc-load) ... etc.
;;  keys: adj-proc-load, adj-core-load, 1m-load, 5m-load, 15m-load
;;
(define (common:get-normalized-cpu-load remote-host)
  (let ((res (common:get-normalized-cpu-load-raw remote-host))
	(default `((adj-proc-load . 2) ;; there is no right answer
		   (adj-core-load . 2)
		   (1m-load       . 2)
		   (5m-load       . 0) ;; causes a large delta - thus causing default of throttling if stuff goes wrong
		   (15m-load      . 0)
		   (proc          . 1)
		   (core          . 1)
		   (phys          . 1)
		   (error         . #t))))
    (cond
     ((and (list? res)
	   (> (length res) 2))
      res)
     ((eq? res #f)   default) ;; add messages?
     ((eq? res #f) default)   ;; this would be the #eof
     (else default))))

(define (common:get-normalized-cpu-load-raw remote-host)
  (let* ((actual-host (or remote-host (get-host-name)))) ;; #f is localhost
    (or (common:get-cached-info actual-host "normalized-load")
	(let ((data (if remote-host
			(with-input-from-pipe 
			    (conc "ssh " remote-host " cat /proc/loadavg;cat /proc/cpuinfo;echo end")
			  read-lines)
			(append 
			 (with-input-from-file "/proc/loadavg" 
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711


1712
1713
1714
1715
1716
1717


1718
1719
1720
1721
1722
1723
1724
	      (if new-best
		  (begin ;; found a host, return it
		    (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
		    (host-last-used-set! rec curr-time)
		    new-best)
		  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))

(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f))
  (let* ((loadavg (common:get-cpu-load remote-host))
	 (numcpus (if (< 1 numcpus-in) ;; not possible
		      (common:get-num-cpus remote-host)
		      numcpus-in))


	 (maxload (max maxload-in 0.5)) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
	 (loadjmp (- first next))
         (adjwait (+ (random 10)(/ (- 1000 count) 10) waitdelay))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously


    (cond
     ((and (> first adjload)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp numcpus)







|




>
>
|




|
>
>







1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
	      (if new-best
		  (begin ;; found a host, return it
		    (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
		    (host-last-used-set! rec curr-time)
		    new-best)
		  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))

(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f))
  (let* ((loadavg (common:get-cpu-load remote-host))
	 (numcpus (if (< 1 numcpus-in) ;; not possible
		      (common:get-num-cpus remote-host)
		      numcpus-in))
	 (maxload (if force-maxload
		      maxload-in
		      (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
	 (loadjmp (- first next))
         (adjwait (min (+ 300 (random 10)) (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )  ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
    (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
		      ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)
    (cond
     ((and (> first adjload)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp numcpus)
1886
1887
1888
1889
1890
1891
1892












































































1893
1894
1895
1896
1897
1898
1899
	     (begin
	       (set! best     (cons disk-num dirpath))
	       (set! bestsize freespc)))))
     (map car disks))
    (if (and best (> bestsize minsize))
	best
	#f))) ;; #f means no disk candidate found













































































;;======================================================================
;; E N V I R O N M E N T   V A R S
;;======================================================================
(define (bb-check-path #!key (msg "check-path: "))
  (let ((path (or (get-environment-variable "PATH") "none")))
    (debug:print-info 0 *default-log-port* (conc msg" : $PATH="path))







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
	     (begin
	       (set! best     (cons disk-num dirpath))
	       (set! bestsize freespc)))))
     (map car disks))
    (if (and best (> bestsize minsize))
	best
	#f))) ;; #f means no disk candidate found

;; convert a spec string to a list of vectors #( rx  action rx-string )
(define (common:spec-string->list-of-specs spec-string actions)
  (let ((spec-strings  (string-split-fields "\\s*;\\s*" spec-string #:infix))
	(actions-regex (regexp (conc "^(.*)\\s+(" (string-intersperse (map conc actions) "|") ")"))))
    (filter
     (lambda (x) x)
     (map (lambda (s)
	    (let ((m (string-match actions-regex s)))
	      (if m
		  (vector (regexp (cadr m))(string->symbol (caddr m))(cadr m))
		  (begin
		    (debug:print 0 *default-log-port* "WARNING: Unrecognised rule \"" s "\" in clean-up specification.")
		    #f))))
	  spec-strings))))
  
;; given a list of specs rx . rule and a file return the first matching rule
;; 
(define (common:file-find-rule fname rules) ;; rule is vector #( rx action rx-string)
  (let loop ((rule   (car rules))
	     (tail   (cdr rules)))
    (let ((rx (vector-ref rule 0))
	  (rn (vector-ref rule 1))) ;; rule name
      (if (string-match rx fname)
	  rule ;; return the whole rule so regex can be printed etc.
	  (if (null? tail)
	      #f
	      (loop (car tail)(cdr tail)))))))

;; given a spec apply some rules to a directory
;;
;;  WARNING: This function will REMOVE files - be sure your spec and path is correct!
;;
;; spec format:
;;   file-regex1 action; file-regex2 action; ...
;; e.g.
;;   .*\.log$ keep; .* remove
;;   --> keep all .log files, remove everything else
;; limitations:
;;   cannot have a rule with ; as part of the spec
;;   not very flexible, would be nice to return binned file names?
;; supported rules:
;;   keep     - keep this file
;;   remove   - remove this file
;;   compress - compress this file
;;
(define (common:dir-clean-up path spec-string #!key (compress "gzip")(actions '(keep remove compress))(remove-empty #f))
  (let* ((specs         (common:spec-string->list-of-specs spec-string actions))
	 (keepers       (make-hash-table))
	 (directories   (make-hash-table)))
    (find-files 
     path
     action: (lambda (p res)
	       (let ((rule  (common:file-find-rule p specs)))
		 (cond
		  ((directory? p)(hash-table-set! directories p #t))
		  (else
		   (case (vector-ref rule 1)
		     ((keep)(hash-table-set! keepers p rule))
		     ((remove)
		      (print "Removing file " p)
		      (delete-file p))
		     ((compress)
		      (print "Compressing file " p)
		      (system (conc compress " " p)))
		     (else
		      (print "No match for file " p))))))))
    (if remove-empty
	(for-each
	 (lambda (d)
	   (if (null? (glob (conc d "/.*")(conc d "/*")))
	       (begin
		 (print "Removing empty directory " d)
		 (delete-directory d))))
	 (sort (hash-table-keys directories) (lambda (a b)(> (string-length a)(string-length b))))))
    ))

;;======================================================================
;; E N V I R O N M E N T   V A R S
;;======================================================================
(define (bb-check-path #!key (msg "check-path: "))
  (let ((path (or (get-environment-variable "PATH") "none")))
    (debug:print-info 0 *default-log-port* (conc msg" : $PATH="path))
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
	 (pktsdir  (if pktsdirs (car pktsdirs) #f))
	 (toppath  (or (configf:lookup mtconf "scratchdat" "toppath")
		       toppath-in))
	 (pdbpath  (or (configf:lookup mtconf "setup"  "pdbpath") pktsdir)))
    (cond
     ((not (and  pktsdir toppath pdbpath))
      (debug:print 0 *default-log-port* "ERROR: settings are missing in your megatest.config for area management.")
      (debug:print  0 *default-log-port* "  you need to have pktsdir in the [setup] section."))
     ((not (common:file-exists? pktsdir))
      (debug:print 0 *default-log-port* "ERROR: pkts directory not found " pktsdir))
     ((not (equal? (file-owner pktsdir)(current-effective-user-id)))
      (debug:print 0 *default-log-port* "ERROR: directory " pktsdir " is not owned by " (current-effective-user-name)))
     (else
	(let* ((pdb  (open-queue-db pdbpath "pkts.db"
				    schema: '("CREATE TABLE groups (id INTEGER PRIMARY KEY,groupname TEXT, CONSTRAINT group_constraint UNIQUE (groupname));"))))







|







2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
	 (pktsdir  (if pktsdirs (car pktsdirs) #f))
	 (toppath  (or (configf:lookup mtconf "scratchdat" "toppath")
		       toppath-in))
	 (pdbpath  (or (configf:lookup mtconf "setup"  "pdbpath") pktsdir)))
    (cond
     ((not (and  pktsdir toppath pdbpath))
      (debug:print 0 *default-log-port* "ERROR: settings are missing in your megatest.config for area management.")
      (debug:print  0 *default-log-port* "  you need to have pktsdirs in the [setup] section."))
     ((not (common:file-exists? pktsdir))
      (debug:print 0 *default-log-port* "ERROR: pkts directory not found " pktsdir))
     ((not (equal? (file-owner pktsdir)(current-effective-user-id)))
      (debug:print 0 *default-log-port* "ERROR: directory " pktsdir " is not owned by " (current-effective-user-name)))
     (else
	(let* ((pdb  (open-queue-db pdbpath "pkts.db"
				    schema: '("CREATE TABLE groups (id INTEGER PRIMARY KEY,groupname TEXT, CONSTRAINT group_constraint UNIQUE (groupname));"))))