Megatest

Check-in [6e33de13e0]
Login
Overview
Comment:Clean up that broke stuff :( - reapply needed
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | broken-fixes
Files: files | file ages | folders
SHA1: 6e33de13e069e95b680ee5b8430bb718797fe4f4
User & Date: mrwellan on 2014-02-18 13:28:29
Other Links: branch diff | manifest | tags
Context
2014-03-03
08:56
Bringing these changes forward to verify they were accounted for Closed-Leaf check-in: b137ace97d user: mrwellan tags: broken-fixes
2014-02-18
13:28
Clean up that broke stuff :( - reapply needed check-in: 6e33de13e0 user: mrwellan tags: broken-fixes
06:48
Merging in old v1.60 branch to create new v1.60 branch check-in: 7d4d4f4f88 user: mrwellan tags: v1.60
Changes

Modified client.scm from [5cb1c0c7dc] to [f6d1b77f60].

52
53
54
55
56
57
58
59
60
61
62
63
64

65
66
67
68
69
70





71
72
73

74
75
76
77
78
79


80
81
82
83
84
85
86
87
88







89
90
91

92
93
94
95
96
97
98

99
100
101
102
103

104
105
106
107
108
109
110
52
53
54
55
56
57
58

59
60
61
62
63
64
65





66
67
68
69
70


71
72
73
74
75
76


77
78
79
80







81
82
83
84
85
86
87


88
89
90
91
92
93
94
95

96
97
98
99
100

101
102
103
104
105
106
107
108







-





+

-
-
-
-
-
+
+
+
+
+
-
-

+




-
-
+
+


-
-
-
-
-
-
-
+
+
+
+
+
+
+
-
-

+






-
+




-
+







;;      *transport-type* and *runremote* from the monitor.db
;;
;; client:setup
;;
;; lookup_server, need to remove *runremote* stuff
;;
(define (client:setup run-id #!key (remaining-tries 10))
  (debug:print 0 "INFO: client:setup remaining-tries=" remaining-tries)
  (if (<= remaining-tries 0)
      (begin
	(debug:print 0 "ERROR: failed to start or connect to server for run-id " run-id)
	(exit 1))
      (let ((server-dat (and run-id (hash-table-ref/default *runremote* run-id #f))))
	(thread-sleep! 1) ;; try to avoid race conditons
	(if server-dat
	    (let ((start-res (http-transport:client-connect run-id ;; NB// confusion over server-dat and connection result!
							    (tasks:hostinfo-get-interface server-dat)
							    (tasks:hostinfo-get-port      server-dat))))
	      (if start-res ;; sucessful login?
		  (begin
	    (let ((new-dat (http-transport:client-connect run-id ;; NB// confusion over server-dat and connection result!
							    (car  server-dat)
							    (cadr server-dat))))
	      (if new-dat ;; sucessful login?
		  new-dat
		    (hash-table-set! *runremote* run-id start-res)
		    start-res)
		  (begin    ;; login failed
		    (debug:print 0 "INFO: login failed in client:setup with existing server-dat: " server-dat ", new-dat: " new-dat ", cleaning out records and then trying again")
		    (hash-table-delete! *runremote* run-id)
		    (open-run-close tasks:server-force-clean-run-record
				    tasks:open-db
				    run-id 
				    (tasks:hostinfo-get-interface server-dat)
				    (tasks:hostinfo-get-port      server-dat))
				    (car  server-dat)
				    (cadr server-dat))
		    (thread-sleep! 5)
		    (client:setup run-id remaining-tries: (- remaining-tries 1)))))
	    (let* ((server-dat (open-run-close tasks:get-server tasks:open-db run-id)))
	      (if server-dat
		  (let ((start-res (http-transport:client-connect run-id
								  (tasks:hostinfo-get-interface server-dat)
								  (tasks:hostinfo-get-port      server-dat))))
		    (if start-res
			(begin
	    (let* ((server-info (open-run-close tasks:get-server tasks:open-db run-id)))
	      (if server-info
		  (let ((new-dat (http-transport:client-connect run-id
								  (tasks:hostinfo-get-interface server-info)
								  (tasks:hostinfo-get-port      server-info))))
		    (if new-dat
			new-dat
			  (hash-table-set! *runremote* run-id start-res)
			  start-res)
			(begin    ;; login failed
			  (debug:print 0 "INFO: login failed in client:setup with no existing server-dat: " server-dat ", new-dat: " new-dat ", and server-info: " server-info ", cleaning out records and then trying again")
			  (hash-table-delete! *runremote* run-id)
			  (open-run-close tasks:server-force-clean-run-record
					  tasks:open-db
					  run-id 
					  (tasks:hostinfo-get-interface server-dat)
					  (tasks:hostinfo-get-port      server-dat))
			  (thread-sleep! 2)
			  ;; (thread-sleep! 2)
			  (server:try-running run-id)
			  (thread-sleep! 5) ;; give server a little time to start up
			  (client:setup run-id remaining-tries: (- remaining-tries 1)))))
		  (begin    ;; no server registered
		    (thread-sleep! 2)
		    ;; (thread-sleep! 2)
		    (server:try-running run-id)
		    (thread-sleep! 5) ;; give server a little time to start up
		    (client:setup run-id remaining-tries: (- remaining-tries 1)))))))))

;; keep this as a function to ease future 
(define (client:start run-id server-info)
  (http-transport:client-connect run-id 

Modified http-transport.scm from [fdad451b60] to [38152c3968].

265
266
267
268
269
270
271
272
273
274



275
276
277
278
279
280


281
282
283
284
285
286
287
265
266
267
268
269
270
271



272
273
274
275
276
277
278


279
280
281
282
283
284
285
286
287







-
-
-
+
+
+




-
-
+
+








;;
;; connect
;;
(define (http-transport:client-connect run-id iface port)
  (let* ((uri-dat     (make-request method: 'POST uri: (uri-reference (conc "http://" iface ":" port "/ctrl"))))
	 (uri-api-dat (make-request method: 'POST uri: (uri-reference (conc "http://" iface ":" port "/api"))))
	 (serverdat   (list iface port uri-dat uri-api-dat))
	 (login-res   (rmt:login-no-auto-client-setup serverdat run-id)))
    (hash-table-set! *runremote* run-id serverdat) ;; may or may not be good ...
	 (server-dat  (list iface port uri-dat uri-api-dat))
	 (login-res   (rmt:login-no-auto-client-setup server-dat run-id)))
    ;; (hash-table-set! *runremote* run-id serverdat) ;; may or may not be good ...
    (if (and (list? login-res)
	     (car login-res))
	(begin
	  (debug:print-info 2 "Logged in and connected to " iface ":" port)
	  (hash-table-set! *runremote* run-id serverdat)
	  serverdat)
	  (hash-table-set! *runremote* run-id server-dat)
	  server-dat)
	(begin
	  (debug:print-info 0 "ERROR: Failed to login or connect to " iface ":" port)
	  #f))))

;; run http-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;

Modified rmt.scm from [a5c523b577] to [2624718f57].

42
43
44
45
46
47
48
49

50
51
52
53
54
55
56
42
43
44
45
46
47
48

49
50
51
52
53
54
55
56







-
+







	 (connection-info (let ((cinfo (hash-table-ref/default *runremote* run-id #f)))
			    (if cinfo
				cinfo
				(let loop ((numtries 100))
				  (thread-sleep! 1)
				  (let ((res (client:setup run-id)))
				    (if res 
					(hash-table-ref/default *runremote* run-id #f) ;; client:setup filled this in (hopefully)
					(hash-table-ref *runremote* run-id) ;; client:setup filled this in (hopefully)
					(if (> numtries 0)
					    (loop (- numtries 1))
					    (begin
					      (debug:print 0 "ERROR: 100 tries and no server, giving up")
					      (exit 1)))))))))
	 (jparams         (db:obj->string params))
	 (res (http-transport:client-api-send-receive run-id connection-info cmd jparams)))

Modified server.scm from [8eb4730569] to [b8b02ced57].

105
106
107
108
109
110
111
112



113
114
115
116
117
118
119
120


121
122

123
124
125
126
127
128




129
130
131
132

133
134
135
105
106
107
108
109
110
111

112
113
114
115
116
117
118
119
120


121
122
123

124
125
126
127
128
129

130
131
132
133
134
135
136
137
138
139
140
141







-
+
+
+






-
-
+
+

-
+





-
+
+
+
+




+



(define (server:reply return-addr query-sig success/fail result)
  (db:obj->string (vector success/fail query-sig result)))

;; > file 2>&1 
(define (server:try-running run-id)
  (let* ((rand-name (random 100))
	 (cmdln (conc (if (getenv "MT_MEGATEST") (getenv "MT_MEGATEST") "megatest")
		     " -server - -run-id " run-id " name=" rand-name " > " *toppath* "/db/" run-id "-" rand-name ".log 2>&1 &")))
		     " -server - -run-id " run-id " name=" rand-name " > " *toppath* "/db/" run-id
		     ".log 2>&1 &")))
		     ;; ".log &" )))
    (debug:print 0 "INFO: Starting server (" cmdln ") as none running ...")
    (push-directory *toppath*)
    (system cmdln)
    (pop-directory)))

(define (server:check-if-running run-id)
  (let loop ((server (open-run-close tasks:get-server tasks:open-db run-id))
	     (trycount 0))
  (let loop ((server-info (open-run-close tasks:get-server tasks:open-db run-id))
	     (trycount    0))
    (thread-sleep! 2)
    (if server
    (if server-info
	;; note: client:start will set *runremote*. this needs to be changed
	;;       also, client:start will login to the server, also need to change that.
	;;
	;; client:start returns #t if login was successful.
	;;
	(let ((res (client:start run-id server)))
	(let ((res (http-transport:client-connect
		    run-id 
		    (tasks:hostinfo-get-interface server-info)
		    (tasks:hostinfo-get-port server-info))))
	  ;; if the server didn't respond we must remove the record
	  (if res
	      res
	      (begin
		(debug:print 0 "WARNING: running server not reachable, removing record: " server-info)
		(open-run-close tasks:server-force-clean-running-records-for-run-id tasks:open-db run-id)
		res)))
	#f)))

Modified tasks.scm from [19f1225d86] to [6da0e18c74].

93
94
95
96
97
98
99
100

101
102
103
104
105
106
107
93
94
95
96
97
98
99

100
101
102
103
104
105
106
107







-
+







(define (tasks:hostinfo-get-hostname    vec)    (vector-ref  vec 6))

(define (tasks:server-lock-slot mdb run-id)
  (tasks:server-clean-out-old-records-for-run-id mdb run-id)
  (if (< (tasks:num-in-available-state mdb run-id) 4)
      (begin 
	(tasks:server-set-available mdb run-id)
	(thread-sleep! 2) ;; Try removing this. It may not be needed.
	(thread-sleep! 0.2) ;; Try removing this. It may not be needed.
	(tasks:server-am-i-the-server? mdb run-id))
      #f))
	
;; register that this server may come online (first to register goes though with the process)
(define (tasks:server-set-available mdb run-id)
  (sqlite3:execute 
   mdb 
126
127
128
129
130
131
132
133

134
135
136
137
138
139
140
126
127
128
129
130
131
132

133
134
135
136
137
138
139
140







-
+







       (set! res num-in-queue))
     mdb
     "SELECT count(id) FROM servers WHERE run_id=?;"
     run-id)
    res))

(define (tasks:server-clean-out-old-records-for-run-id mdb run-id)
  (sqlite3:execute mdb "DELETE FROM servers WHERE state in ('available','shutting-down') AND (strftime('%s','now') - start_time) > 10 AND run_id=?;" run-id))
  (sqlite3:execute mdb "DELETE FROM servers WHERE state in ('available','shutting-down') AND (strftime('%s','now') - start_time) > 100 AND run_id=?;" run-id))

(define (tasks:server-force-clean-running-records-for-run-id mdb run-id)
  (sqlite3:execute mdb "DELETE FROM servers WHERE state = 'running' AND run_id=?;" run-id))

(define (tasks:server-force-clean-run-record mdb run-id iface port)
  (sqlite3:execute mdb "DELETE FROM servers WHERE state = 'running' AND run_id=? AND interface=? AND port=?;"
		   run-id iface port))
182
183
184
185
186
187
188
189
190

191
192
193
194







195
196
197
198
199
200
201
182
183
184
185
186
187
188

189
190




191
192
193
194
195
196
197
198
199
200
201
202
203
204







-

+
-
-
-
-
+
+
+
+
+
+
+







			    (exit 1))
		     (car (db:get-rows all))))
	 (header   (db:get-header all))
	 (id       (db:get-value-by-header first header "id"))
	 (hostname (db:get-value-by-header first header "hostname"))
	 (pid      (db:get-value-by-header first header "pid"))
	 (priority (db:get-value-by-header first header "priority")))
    (debug:print 0 "INFO: am-i-the-server got record " first)
    ;; for now a basic check. add tiebreaking by priority later
    (let* ((my-pid (current-process-id))
    (if (and (equal? hostname (get-host-name))
	     (equal? pid      (current-process-id)))
	id
	#f)))
	   (res (if (and (equal? hostname (get-host-name))
			 (equal? pid       my-pid))
		    id
		    #f)))
      (debug:print 0 "INFO: am-i-the-server got record " first ", my-pid: " my-pid ", pid: " pid ", result: " res)
      res)))
      
	     
;; Use: (db:get-value-by-header (car (db:get-rows dat)) (db:get-header dat) "fieldname")
;;  to extract info from the structure returned
;;
(define (tasks:server-get-servers-vying-for-run-id mdb run-id)
   (let* ((header (list "id" "hostname" "pid" "interface" "port" "pubport" "state" "run_id" "priority" "start_time"))
	  (selstr (string-intersperse header ","))

Modified tests/fullrun/megatest.config from [c76d4b28b2] to [bc391c991d].

75
76
77
78
79
80
81

82
83
84
85
86
87
88
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89







+







[validvalues]
state start end 0 1 - 2
status pass fail n/a 0 1 running - 2

# These are set before all tests, override them 
# in the testconfig [pre-launch-env-overrides] section
[env-override]

# This variable is honored by the loadrunner script. The value is in percent
# a value of 200 will stop new jobs from starting.
MAX_ALLOWED_LOAD 200

# MT_XTERM_CMD overrides the terminal command
# MT_XTERM_CMD xterm -bg lightgreen -fg black

101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
102
103
104
105
106
107
108




109
110
111
112
113
114
115







-
-
-
-







# The empty var should have a definition with null string
EMPTY_VAR

WRAPPEDVAR This var should have the work blah thrice: \
blah \
blah

# Set MAX_ALLOWED_LOAD for nbload. 150 percent is a good value.

MAX_ALLOWED_LOAD 150

# XTERM   [system xterm]
# RUNDEAD [system exit 56]

[server]

# If the server can't be started on this port it will try the next port until
# it succeeds