Megatest

Check-in [e13fd02294]
Login
Overview
Comment:sync the trunk with 1.65 latest
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: e13fd02294b69a14a650d1e69467ba2a2080821a
User & Date: matt on 2020-02-19 21:16:05
Other Links: manifest | tags
Context
2020-08-03
14:40
Merging v1.65 to trunk check-in: 94cb260871 user: mrwellan tags: trunk
2020-05-31
22:34
Merged v1.66 to trunk - it is now the stable leading edge. check-in: ea3cea484e user: mrwellan tags: trunk-v1.66
2020-02-19
21:16
sync the trunk with 1.65 latest check-in: e13fd02294 user: matt tags: trunk
2020-02-12
12:40
Fixed modulo typo check-in: 7458dc962f user: jmoon18 tags: v1.65, v.16543
2019-12-08
16:46
Merged long-running v1.65 branch to trunk check-in: a11b626717 user: matt tags: trunk
Changes

Modified common.scm from [4250e720f5] to [22542b9de5].

1671
1672
1673
1674
1675
1676
1677
1678

1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689

1690
1691
1692
1693
1694
1695

1696


1697
1698
1699
1700
1701
1702
1703
1671
1672
1673
1674
1675
1676
1677

1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688

1689
1690
1691
1692
1693
1694
1695
1696

1697
1698
1699
1700
1701
1702
1703
1704
1705







-
+










-
+






+
-
+
+







;; 	      (car load-res))
;;     cpu-load))

;; get values from cached info from dropping file in logs dir
;;  e.g. key is host and dtype is normalized-load
;;
(define (common:get-cached-info key dtype #!key (age 5))
  (let* ((fullpath (conc *toppath* "/logs/" key "-" dtype ".log")))
  (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")))
    (if (and (file-exists? fullpath)
	     (file-read-access? fullpath))
	(handle-exceptions
	 exn
	 #f
	 (debug:print 2 *default-log-port* "reading file " fullpath)
	 (let ((real-age (- (current-seconds)(file-change-time fullpath)))) 
	   (if (< real-age age)
	       (with-input-from-file fullpath read)
	       (begin
		 (debug:print 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds)to trust, skipping reading it")
		 (debug:print 1 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it")
		 #f))))
	(begin
	  (debug:print 2 *default-log-port* "not reading file " fullpath)
	  #f))))
 
(define (common:write-cached-info key dtype dat)
  (let* ((fulldir  (conc *toppath* "/.sysdata"))
  (let* ((fullpath (conc *toppath* "/logs/" key "-" dtype ".log")))
	 (fullpath (conc fulldir "/" key "-" dtype ".log")))
    (if (not (file-exists? fulldir))(create-directory fulldir #t))
    (handle-exceptions
     exn
     #f
     (with-output-to-file fullpath (lambda ()(pp dat))))))

;; get cpu load by reading from /proc/loadavg, return all three values
;;
1742
1743
1744
1745
1746
1747
1748
1749

1750
1751
1752
1753
1754
1755
1756
1744
1745
1746
1747
1748
1749
1750

1751
1752
1753
1754
1755
1756
1757
1758







-
+







     (else default))))

(define (common:get-normalized-cpu-load-raw remote-host)
  (let* ((actual-host (or remote-host (get-host-name)))) ;; #f is localhost
    (or (common:get-cached-info actual-host "normalized-load")
	(let ((data (if remote-host
			(with-input-from-pipe 
			    (conc "ssh " remote-host " cat /proc/loadavg;cat /proc/cpuinfo;echo end")
			    (conc "ssh " remote-host " \"cat /proc/loadavg;cat /proc/cpuinfo;echo end\"")
			  read-lines)
			(append 
			 (with-input-from-file "/proc/loadavg" 
			   read-lines)
			 (with-input-from-file "/proc/cpuinfo"
			   read-lines)
			 (list "end"))))
1764
1765
1766
1767
1768
1769
1770
1771

1772
1773
1774
1775
1776
1777
1778
1766
1767
1768
1769
1770
1771
1772

1773
1774
1775
1776
1777
1778
1779
1780







-
+







	      #f
	      (let loop ((hed      (car data))
			 (tal      (cdr data))
			 (loads    #f)
			 (proc-num 0)  ;; processor includes threads
			 (phys-num 0)  ;; physical chip on motherboard
			 (core-num 0)) ;; core
		;; (print hed ", " loads ", " proc-num ", " phys-num ", " core-num)
		;;; (print hed ", " loads ", " proc-num ", " phys-num ", " core-num)
		(if (null? tal) ;; have all our data, calculate normalized load and return result
		    (let* ((act-proc (+ proc-num 1))
			   (act-phys (+ phys-num 1))
			   (act-core (+ core-num 1))
			   (adj-proc-load (/ (car loads) act-proc))
			   (adj-core-load (/ (car loads) act-core))
			   (result
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955

1956
1957
1958
1959
1960
1961

1962
1963
1964
1965
1966
1967

1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986





























1987
1988
1989
1990
1991
1992
1993
1919
1920
1921
1922
1923
1924
1925



























1926
1927
1928
1929

1930

1931
1932
1933
1934

1935
1936
1937
1938
1939
1940

1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996







-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-




-
+
-




-
+





-
+



















+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







	      (if new-best
		  (begin ;; found a host, return it
		    (debug:print 0 *default-log-port* "INFO: Found host: " new-best " load: " load " last-used: " delta " seconds ago, with job-rate: " job-rate)
		    (host-last-used-set! rec curr-time)
		    new-best)
		  (if (null? tal) #f (loop (car tal)(cdr tal) best-host)))))))))

(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f))
  (let* ((loadavg (common:get-cpu-load remote-host))
	 (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
		      (common:get-num-cpus remote-host)
		      numcpus-in))
	 (maxload (if force-maxload
		      maxload-in
		      (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
	 (loadjmp (- first next))
         (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) ))  ))) ;; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
    (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
		      ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)
    (cond
     ((and (> first adjload)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp numcpus)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)))))

(define (common:wait-for-homehost-load maxload msg)
  (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
                     #f
                     (common:get-homehost)))
         (hh     (if hh-dat (car hh-dat) #f))
         (hh     (if hh-dat (car hh-dat) #f)))
         (numcpus (common:get-num-cpus hh)))
    (common:wait-for-normalized-load maxload msg hh)))

(define (common:get-num-cpus remote-host)
  (let* ((actual-host (or remote-host (get-host-name))))
    (or (common:get-cached-info actual-host "num-cpus" age: 86400) ;; hosts had better not be changing the number of cpus too often!
    (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) ;; hosts had better not be changing the number of cpus too often!
	(let* ((proc   (lambda ()
			 (let loop ((numcpu 0)
				    (inl    (read-line)))
			   (if (eof-object? inl)
			       (begin
				 (common:write-cached-info remote-host "num-cpus" numcpu)
				 (common:write-cached-info actual-host "num-cpus" numcpu)
				 numcpu)
			       (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl)
					 (+ numcpu 1)
					 numcpu)
				     (read-line))))))
	       (result (if remote-host
			   (with-input-from-pipe 
			       (conc "ssh " remote-host " cat /proc/cpuinfo")
			     proc)
			   (with-input-from-file "/proc/cpuinfo" proc))))
	  (common:write-cached-info actual-host "num-cpus" result)
	  result))))

;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxload msg remote-host)
  (let ((num-cpus (common:get-num-cpus remote-host)))
    (common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host)))

;; DO NOT CALL THIS DIRECTLY. It is called from common:wait-for-normalized-load
;;
(define (common:wait-for-cpuload maxload-in numcpus-in waitdelay #!key (count 1000) (msg #f)(remote-host #f)(force-maxload #f))
  (let* ((loadavg (common:get-cpu-load remote-host))
	 (numcpus (if (<= 1 numcpus-in) ;; not possible to have zero.  If we get 1, it's possible that we got the previous default, and we should check again
		      (common:get-num-cpus remote-host)
		      numcpus-in))
	 (maxload (if force-maxload
		      maxload-in
		      (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME?
	 (first   (car loadavg))
	 (next    (cadr loadavg))
	 (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1
	 (loadjmp (- first next))
         (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) ))  )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously
    (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload
		      ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)
    (cond
     ((and (> first adjload)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host))
     ((and (> loadjmp numcpus)
	   (> count 0))
      (debug:print-info 0 *default-log-port* "waiting " adjwait " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg ""))
      (thread-sleep! adjwait)
      (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)))))

(define (get-uname . params)
  (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params)))))
	 (uname #f))
    (if (null? (car uname-res))
	"unknown"
	(caar uname-res))))

Modified megatest-version.scm from [3dd6b10f75] to [8e7922af4c].

16
17
18
19
20
21
22
23

16
17
18
19
20
21
22

23







-
+
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.

;; Always use two or four digit decimal
;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..

(declare (unit megatest-version))

(define megatest-version 1.6538)
(define megatest-version 1.6543)

Modified rmt.scm from [fff4f6b45b] to [8c67a7dcdf].

20
21
22
23
24
25
26
27

28
29

30
31
32
33
34
35
36
20
21
22
23
24
25
26

27
28

29
30
31
32
33
34
35
36







-
+

-
+








(use format typed-records) ;; RADT => purpose of json format??

(declare (unit rmt))
(declare (uses api))
(declare (uses http-transport))
(include "common_records.scm")
(declare (uses rmtmod))
;; (declare (uses rmtmod))

(import rmtmod)
;; (import rmtmod)

;;
;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!!
;;

;; generate entries for ~/.megatestrc with the following
;;
56
57
58
59
60
61
62
63

64
65
66










67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84

85
86
87
88
89
90
91
56
57
58
59
60
61
62

63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102







-
+



+
+
+
+
+
+
+
+
+
+


















+








(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id

;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected

  (common:telemetry-log (conc "rmt:"(->string cmd))
  #;(common:telemetry-log (conc "rmt:"(->string cmd))
                        payload: `((rid . ,rid)
                                   (params . ,params)))
                          
  (if (> attemptnum 2)
      (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum))
    
  (cond
   ((> attemptnum 2) (thread-sleep! 0.05))
   ((> attemptnum 10) (thread-sleep! 0.5))
   ((> attemptnum 20) (thread-sleep! 1)))
  (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15)))  
    (begin (rmt:start-server rid) (thread-sleep! 3))) 
  
  
  ;;DOT digraph megatest_state_status {
  ;;DOT   ranksep=0;
  ;;DOT   // rankdir=LR;
  ;;DOT   node [shape="box"];
  ;;DOT "rmt:send-receive" -> MUTEXLOCK;
  ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; }
  ;; do all the prep locked under the rmt-mutex
  (mutex-lock! *rmt-mutex*)
  
  ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote
  ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds.
  ;; 3. do the query, if on homehost use local access
  ;;
  (let* ((start-time    (current-seconds)) ;; snapshot time so all use cases get same value
         (areapath      *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
	 (runremote     (or area-dat
			    *runremote*))
         (attemptnum    (+ 1 attemptnum))
	 (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*)))

    ;; DOT INIT_RUNREMOTE; // leaving off - doesn't really add to the clarity
    ;; DOT MUTEXLOCK -> INIT_RUNREMOTE [label="no remote?"];
    ;; DOT INIT_RUNREMOTE -> MUTEXLOCK;
    ;; ensure we have a record for our connection for given area
    (if (not runremote)                   ;; can remove this one. should never get here.         
101
102
103
104
105
106
107
108
109
110



111
112
113
114
115
116
117
112
113
114
115
116
117
118



119
120
121
122
123
124
125
126
127
128







-
-
-
+
+
+







	(thread-sleep! 0.1) ;; since we shouldn't get here, delay a little
	(remote-hh-dat-set! runremote (common:get-homehost)))
    
    ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile)
    (cond
     ;;DOT EXIT;
     ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" }
     ;; give up if more than 15 attempts
     ((> attemptnum 15)
      (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.")
     ;; give up if more than 150 attempts
     ((> attemptnum 150)
      (debug:print 0 *default-log-port* "ERROR: 150 tries to start/connect to server. Giving up.")
      (exit 1))

     ;;DOT CASE2 [label="local\nreadonly\nquery"];
     ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2}
     ;;DOT CASE2 -> "rmt:open-qry-close-locally";
     ;; readonly mode, read request-  handle it - case 2
     ((and readonly-mode
929
930
931
932
933
934
935



























































936

937
938
939
940
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005

1006
1007
1008
1009
1010







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+





(define (rmt:test-set-archive-block-id run-id test-id archive-block-id)
  (rmt:send-receive 'test-set-archive-block-id run-id (list run-id test-id archive-block-id)))

(define (rmt:test-get-archive-block-info archive-block-id)
  (rmt:send-receive 'test-get-archive-block-info #f (list archive-block-id)))


(define (rmtmod:calc-ro-mode runremote *toppath*)
  (if (and runremote
	   (remote-ro-mode-checked runremote))
      (remote-ro-mode runremote)
      (let* ((dbfile  (conc *toppath* "/megatest.db"))
	     (ro-mode (not (file-write-access? dbfile)))) ;; TODO: use dbstruct or runremote to figure this out in future
	(if runremote
	    (begin
	      (remote-ro-mode-set! runremote ro-mode)
	      (remote-ro-mode-checked-set! runremote #t)
	      ro-mode)
	    ro-mode))))

(define (extras-readonly-mode rmt-mutex log-port cmd params)
  (mutex-unlock! rmt-mutex)
  (debug:print-info 12 log-port "rmt:send-receive, case 3")
  (debug:print 0 log-port "WARNING: write transaction requested on a readonly area.  cmd="cmd" params="params)
  #f)

(define (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params)
  (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
  (mutex-lock! *rmt-mutex*)
  (remote-conndat-set!    runremote #f)
  (http-transport:close-connections area-dat: runremote)
  (remote-server-url-set! runremote #f)
  (mutex-unlock! *rmt-mutex*)
  (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9.1")
  (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
  
(define (extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd)
  (if (and (vector? res)
	   (eq? (vector-length res) 2)
	   (eq? (vector-ref res 1) 'overloaded)) ;; since we are
						 ;; looking at the
						 ;; data to carry the
						 ;; error we'll use a
						 ;; fairly obtuse
						 ;; combo to minimise
						 ;; the chances of
						 ;; some sort of
						 ;; collision.  this
						 ;; is the case where
						 ;; the returned data
						 ;; is bad or the
						 ;; server is
						 ;; overloaded and we
						 ;; want to ease off
						 ;; the queries
      (let ((wait-delay (+ attemptnum (* attemptnum 10))))
	(debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.")
	(mutex-lock! *rmt-mutex*)
	(http-transport:close-connections area-dat: runremote)
	(set! *runremote* #f) ;; force starting over
	(mutex-unlock! *rmt-mutex*)
	(thread-sleep! wait-delay)
	(rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
      res)) ;; All good, return res

(set-functions rmt:send-receive                       remote-server-url-set!
#;(set-functions rmt:send-receive                       remote-server-url-set!
	       http-transport:close-connections	      remote-conndat-set!
	       debug:print                            debug:print-info
	       remote-ro-mode                         remote-ro-mode-set!
	       remote-ro-mode-checked-set!            remote-ro-mode-checked)

Modified rmtmod.scm from [8f5d01960e] to [7098937896].

54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
54
55
56
57
58
59
60


























































61







-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-

  (set! debug:print                      dbgp)
  (set! debug:print-info                 dbgpinfo)
  (set! remote-ro-mode                   ro-mode)
  (set! remote-ro-mode-set!              ro-mode-set)
  (set! remote-ro-mode-checked-set!      ro-mode-checked-set)
  (set! remote-ro-mode-checked           ro-mode-checked))

(define (rmtmod:calc-ro-mode runremote *toppath*)
  (if (and runremote
	   (remote-ro-mode-checked runremote))
      (remote-ro-mode runremote)
      (let* ((dbfile  (conc *toppath* "/megatest.db"))
	     (ro-mode (not (file-write-access? dbfile)))) ;; TODO: use dbstruct or runremote to figure this out in future
	(if runremote
	    (begin
	      (remote-ro-mode-set! runremote ro-mode)
	      (remote-ro-mode-checked-set! runremote #t)
	      ro-mode)
	    ro-mode))))

(define (extras-readonly-mode rmt-mutex log-port cmd params)
  (mutex-unlock! rmt-mutex)
  (debug:print-info 12 log-port "rmt:send-receive, case 3")
  (debug:print 0 log-port "WARNING: write transaction requested on a readonly area.  cmd="cmd" params="params)
  #f)

(define (extras-transport-failed *default-log-port* *rmt-mutex* attemptnum runremote cmd rid params)
  (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum)
  (mutex-lock! *rmt-mutex*)
  (remote-conndat-set!    runremote #f)
  (http-transport:close-connections area-dat: runremote)
  (remote-server-url-set! runremote #f)
  (mutex-unlock! *rmt-mutex*)
  (debug:print-info 12 *default-log-port* "rmt:send-receive, case  9.1")
  (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
  
(define (extras-transport-succeded *default-log-port* *rmt-mutex* attemptnum runremote res params rid cmd)
  (if (and (vector? res)
	   (eq? (vector-length res) 2)
	   (eq? (vector-ref res 1) 'overloaded)) ;; since we are
						 ;; looking at the
						 ;; data to carry the
						 ;; error we'll use a
						 ;; fairly obtuse
						 ;; combo to minimise
						 ;; the chances of
						 ;; some sort of
						 ;; collision.  this
						 ;; is the case where
						 ;; the returned data
						 ;; is bad or the
						 ;; server is
						 ;; overloaded and we
						 ;; want to ease off
						 ;; the queries
      (let ((wait-delay (+ attemptnum (* attemptnum 10))))
	(debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.")
	(mutex-lock! *rmt-mutex*)
	(http-transport:close-connections area-dat: runremote)
	(set! *runremote* #f) ;; force starting over
	(mutex-unlock! *rmt-mutex*)
	(thread-sleep! wait-delay)
	(rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))
      res)) ;; All good, return res

)

Modified runs.scm from [81684437ca] to [650378342c].

1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1015
1016
1017
1018
1019
1020
1021

1022
1023
1024
1025
1026
1027
1028







-







					(debug:print-error 0 *default-log-port* "prereqs-not-met is not a list! " prereqs-not-met)
					'())))
	 (non-completed           (filter (lambda (x)             ;; remove hed from not completed list, duh, of course it is not completed!
					    (not (equal? x hed)))
					  (runs:calc-not-completed prereqs-not-met)))
	 (loop-list               (list hed tal reg reruns))
	 ;; configure the load runner
	 (numcpus                 (common:get-num-cpus #f))
	 (maxload                 (string->number (or (configf:lookup *configdat* "jobtools" "maxload") "3.0")))         ;; use a non-number string to disable
         (maxhomehostload         (string->number (or (configf:lookup *configdat* "jobtools" "maxhomehostload") "2.0"))) ;; use a non-number string to disable
         (waitdelay               (string->number (or (configf:lookup *configdat* "jobtools" "waitdelay") "60"))))
    (debug:print-info 4 *default-log-port* "have-resources: " have-resources " prereqs-not-met: (" 
		      (string-intersperse 
		       (map (lambda (t)
			      (if (vector? t)
1117
1118
1119
1120
1121
1122
1123

1124
1125


1126
1127




1128
1129
1130
1131
1132
1133
1134
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127


1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138







+


+
+
-
-
+
+
+
+







	       (and (member 'toplevel testmode) ;;  'toplevel)
		    (null? non-completed)
		    (not (member 'exclusive testmode)))))
      ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path))
      ;; we are going to reset all the counters for test retries by setting a new hash table
      ;; this means they will increment only when nothing can be run
      (set! *max-tries-hash* (make-hash-table))
      
      ;; well, first lets see if cpu load throttling is enabled. If so wait around until the
      ;; average cpu load is under the threshold before continuing

      ;; jobtools maxload is useful for where the full Megatest run is done on one machine
      (if maxload ;; only gate if maxload is specified
          (common:wait-for-cpuload maxload numcpus waitdelay))
      (if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
          (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))

      ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
      (if maxhomehostload
          (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))
      
      (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry)
      (runs:incremental-print-results run-id)
      (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running)
      (runs:shrink-can-run-more-tests-count runsdat)  ;; DELAY TWEAKER (still needed?)
1741
1742
1743
1744
1745
1746
1747





1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764













1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758

1759





1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789

1790
1791
1792
1793
1794
1795
1796







+
+
+
+
+


-

-
-
-
-
-








+
+
+
+
+
+
+
+
+
+
+
+
+









-







	     (conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t))))
       lst))

;; parent-test is there as a placeholder for when parent-tests can be run as a setup step
;;
(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry)
  ;; All these vars might be referenced by the testconfig file reader
  ;; 
  ;; NEED to reprocess testconfig here, ensuring that item variables are available.
  ;; This is for Tal's issue with item-specific env vars not being set for use in skip.
  ;; HSD https://hsdes.intel.com/appstore/icf/index.html#/article?articleId=1408763273
  ;;
  (let* ((test-name    (tests:testqueue-get-testname   test-record))
	 (test-waitons (tests:testqueue-get-waitons    test-record))
	 (test-conf    (tests:testqueue-get-testconfig test-record))
	 (itemdat      (tests:testqueue-get-itemdat    test-record))
	 (test-path    (hash-table-ref all-tests-registry test-name)) ;; (conc *toppath* "/tests/" test-name)) ;; could use tests:get-testconfig here ...
	 (force        (hash-table-ref/default flags "-force" #f))
	 (rerun        (hash-table-ref/default flags "-rerun" #f))
	 (keepgoing    (hash-table-ref/default flags "-keepgoing" #f))
	 (incomplete-timeout (string->number (or (configf:lookup *configdat* "setup" "incomplete-timeout") "x")))
	 (item-path     "")
	 (db           #f)
	 (full-test-name #f))

    ;; setting itemdat to a list if it is #f
    (if (not itemdat)(set! itemdat '()))
    (set! item-path (item-list->path itemdat))
    (set! full-test-name (db:test-make-full-name test-name item-path))
    (runs:set-megatest-env-vars run-id inrunname: runname testname: test-name itempath: item-path) ;; these may be needed by the launching process

   (let* ((test-conf ;; re-instate the tests:get-testconfig once the kinks are worked out. FIXME!!!
	    ;; (tests:get-testconfig test-name item-path all-tests-registry #t force-create: #t))
	    (tests:testqueue-get-testconfig test-record )) 
	   (test-path    (hash-table-ref all-tests-registry test-name)) ;; (conc *toppath* "/tests/" test-name)) ;; could use tests:get-testconfig here ...

	   (force        (hash-table-ref/default flags "-force" #f))
	   (rerun        (hash-table-ref/default flags "-rerun" #f))
	   (keepgoing    (hash-table-ref/default flags "-keepgoing" #f))
	   (incomplete-timeout (string->number (or (configf:lookup *configdat* "setup" "incomplete-timeout") "x")))
	   )

    (debug:print-info 4 *default-log-port*
		      "\nTESTNAME: " full-test-name 
		      "\n   test-config: " (hash-table->alist test-conf)
		      "\n   itemdat: " itemdat
		      )
    (debug:print 2 *default-log-port* "Attempting to launch test " full-test-name)
    ;; (setenv "MT_TEST_NAME" test-name) ;; 
    ;; (setenv "MT_ITEMPATH"  item-path)
    ;; (setenv "MT_RUNNAME"   runname)
    (runs:set-megatest-env-vars run-id inrunname: runname testname: test-name itempath: item-path) ;; these may be needed by the launching process
    (change-directory *toppath*)

    ;; Here is where the test_meta table is best updated
    ;; Yes, another use of a global for caching. Need a better way?
    ;;
    ;; There is now a single call to runs:update-all-test_meta and this 
    ;; per-test call is not needed. Given the delicacy of the move to 
1879
1880
1881
1882
1883
1884
1885

1886
1887
1888
1889






1890
1891
1892
1893
1894






1895
1896

1897
1898
1899
1900
1901
1902
1903
1894
1895
1896
1897
1898
1899
1900
1901




1902
1903
1904
1905
1906
1907
1908




1909
1910
1911
1912
1913
1914
1915

1916
1917
1918
1919
1920
1921
1922
1923







+
-
-
-
-
+
+
+
+
+
+

-
-
-
-
+
+
+
+
+
+

-
+







		  ((and skip-check
			(configf:lookup test-conf "skip" "prevrunning"))
		   ;; run-ids = #f means *all* runs
		   (let ((running-tests (rmt:get-tests-for-runs-mindata #f full-test-name '("RUNNING" "REMOTEHOSTSTART" "LAUNCHED") '() #f)))
		     (if (not (null? running-tests)) ;; have to skip 
			 (set! skip-test "Skipping due to previous tests running"))))

 		    ;; split the string and OR of file-exists?
		  ((and skip-check
			(configf:lookup test-conf "skip" "fileexists"))
		   (if (common:file-exists? (configf:lookup test-conf "skip" "fileexists"))
		       (set! skip-test (conc "Skipping due to existance of file " (configf:lookup test-conf "skip" "fileexists")))))
		    ((and skip-check
			  (configf:lookup test-conf "skip" "fileexists"))
		     (let* ((files    (string-split (configf:lookup test-conf "skip" "fileexists")))
			    (existing (filter common:file-exists? files)))
		       (if (not (null? existing)) ;; (common:file-exists? (configf:lookup test-conf "skip" "fileexists"))
			   (set! skip-test (conc "Skipping due to existance of file(s) " (string-intersperse existing ", ")))))) ;; (configf:lookup test-conf "skip" "fileexists")))))

		  ((and skip-check
			(configf:lookup test-conf "skip" "filenotexists"))
		   (if (not (common:file-exists? (configf:lookup test-conf "skip" "filenotexists")))
		       (set! skip-test (conc "Skipping due to non existance of file " (configf:lookup test-conf "skip" "filenotexists")))))
 		    ((and skip-check
			  (configf:lookup test-conf "skip" "filenotexists"))
		      (let* ((files    (string-split (configf:lookup test-conf "skip" "filenotexists")))
			      (existing (filter common:file-exists? files)))
			      (if (null? existing) ;;  (common:file-exists? (configf:lookup test-conf "skip" "filenotexists")))
			        (set! skip-test (conc "Skipping due to non existance of files " (string-intersperse files ", ")))))) ;; (configf:lookup test-conf "skip" "filenotexists")))))        
		 
                  ((and skip-check
       ((and skip-check
			(configf:lookup test-conf "skip" "script"))
		   (if (= (system (configf:lookup test-conf "skip" "script")) 0)
                        (set! skip-test (conc "Skipping due to zero return value of script " (configf:lookup test-conf "skip" "script")))))
      
                  ((and skip-check
			(configf:lookup test-conf "skip" "rundelay"))
		   ;; run-ids = #f means *all* runs
1938
1939
1940
1941
1942
1943
1944
1945

1946
1947
1948
1949
1950
1951
1952
1958
1959
1960
1961
1962
1963
1964

1965
1966
1967
1968
1969
1970
1971
1972







-
+







	;;      (debug:print 2 *default-log-port* "NOTE: " test-name " is already running")))
	(else      
	 (debug:print-error 0 *default-log-port* "Failed to launch test " full-test-name ". Unrecognised state " (test:get-state testdat))
	 (case (string->symbol (test:get-state testdat)) 
	   ((COMPLETED INCOMPLETE)
	    (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN))
	   (else
	    (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN))))))))
	    (hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)))))))))

;;======================================================================
;; END OF NEW STUFF
;;======================================================================

(define (get-dir-up-n dir . params) 
  (let ((dparts  (string-split dir "/"))

Modified server.scm from [78810e8804] to [b1a1de6e53].

1
2
3
4
5
6
7
8

1
2
3
4
5
6
7
-








;; Copyright 2006-2017, Matthew Welland.
;; 
;; This file is part of Megatest.
;; 
;;     Megatest is free software: you can redistribute it and/or modify
;;     it under the terms of the GNU General Public License as published by
;;     the Free Software Foundation, either version 3 of the License, or
149
150
151
152
153
154
155
156


157
158
159
160
161
162
163
148
149
150
151
152
153
154

155
156
157
158
159
160
161
162
163







-
+
+







	     (not (equal? curr-ip target-host)))
	(begin
	  (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile)
	  (setenv "TARGETHOST" target-host)))
      
    (setenv "TARGETHOST_LOGF" logfile)
    (thread-sleep! (/ (random 5000) 1000)) ;; add about a random (up to 5 seconds) initial delay. It seems pretty common that many running tests request a server at the same time
    (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
    ;; (common:wait-for-normalized-load load-limit " delaying server start due to load" target-host) ;; do not try starting servers on an already overloaded machine, just wait forever
    (common:wait-for-homehost-load load-limit (conc " delaying server start due to load on homehost. limit is " load-limit))
    (system (conc "nbfake " cmdln))
    (unsetenv "TARGETHOST_LOGF")
    (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST"))
    (thread-join! log-rotate)
    (pop-directory)))

;; given a path to a server log return: host port startseconds