Megatest

Check-in [7dcbd017e3]
Login
Overview
Comment:Added better fallback on communication failure.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.80
Files: files | file ages | folders
SHA1: 7dcbd017e3cc12525fbffde21829cf66bd3339d4
User & Date: matt on 2023-04-08 11:48:18
Other Links: branch diff | manifest | tags
Context
2023-04-09
14:45
Clean up uses of tcp. tcp6 did not seem to work. Increased tcp backlog (didn't seem to help) and improved backoff check-in: 4860a4e6aa user: matt tags: v1.80
2023-04-08
11:48
Added better fallback on communication failure. check-in: 7dcbd017e3 user: matt tags: v1.80
02:45
Removed assert from simple lock. Removed use of statement cache which fixed bind issues check-in: 532f050f93 user: matt tags: v1.80
Changes

Modified tcp-transportmod.scm from [a71da4bf27] to [32789e6d1a].

179
180
181
182
183
184
185
186
187








188
189
190
191
192
193
194
195
196
197
198
199
200
201
202

203
204
205
206
207
208
209
179
180
181
182
183
184
185


186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207

208
209
210
211
212
213
214
215







-
-
+
+
+
+
+
+
+
+














-
+







		 (begin
		   (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname)
		   (server-start-proc)
		   (tt-last-serv-start-set! ttdat (current-seconds))))
	     (thread-sleep! 1)
	     (tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))

(define (tt:ping host port server-id)
  (let*  ((res (tt:send-receive-direct host port `(ping #f #f #f)))) ;; please send me your server-id
(define (tt:ping host port server-id #!optional (tries-left 5))
  (let*  ((res      (tt:send-receive-direct host port `(ping #f #f #f))) ;; please send me your server-id
	  (try-again (lambda ()
		       (if (> tries-left 0)
			   (begin
			     (thread-sleep! 1)
			     (tt:ping host port server-id (- tries-left 1)))
			   #f))))
    ;;
    ;; need two threads, one a 5 second timer
    ;;
    (match res
      ((status errmsg result meta)
       (if (equal? result server-id)
	   (let* ((server-state (alist-ref 'sstate meta)))
	     ;; (debug:print 0 *default-log-port* "Ping to "host":"port" successful.")
	     (or server-state 'unk)) ;; then we are good
	   (begin
	     (debug:print 0 *default-log-port* "WARNING: server-id does not match, expected: "server-id", got: "result)
	     #f)))
      (else
       ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res)
       #f))))
       (try-again)))))

;; client side handler
;;
;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
;;
(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)
  ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now.
228
229
230
231
232
233
234
235
236


237
238
239
240
241
242
243
244





245
246
247
248
249
250
251
252
253
254
255
256



















257
258

259
260
261
262
263
264
265
234
235
236
237
238
239
240


241
242
243
244
245
246
247
248
249
250
251
252
253
254
255












256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275

276
277
278
279
280
281
282
283







-
-
+
+








+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

-
+







		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       ((loaded)
		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, will try again in a 1/4 second.")
		(thread-sleep! 0.25)
		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       (else
		result)))
	    (else
	     (if (not res)
	    (else ;; did not receive properly formated result
	     (if (not res) ;; tt:handler is telling us that communication failed
		 (let* ((host    (tt-conn-host conn))
			(port    (tt-conn-port conn))
			;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db
			(pid     (tt-conn-pid  conn))
			(servinf (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname)))
		   (hash-table-set! (tt-conns ttdat) dbfname #f)
		   (if (file-exists? servinf)
		       (begin
			 (if (< attemptnum 5)
			     (begin
			       (thread-sleep! 1)
			       (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
			     (begin
			 (debug:print 0 *default-log-port* "INFO: no ping response from server "host":"port" for "dbfname)
			 (if (and (file-exists? servinf)
				  (> (- (current-seconds)(file-modification-time servinf)) 60))
			     (begin
			       (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.")
			       (handle-exceptions
				   exn
				 #f
				 (delete-file* servinf)))))
		       (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf))
		   (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
		 (assert #f "FATAL: tt:handler received bad data "res)))))
			       (debug:print 0 *default-log-port* "INFO: no response from server "host":"port" for "dbfname)
			       (if (and (file-exists? servinf)
					(> (- (current-seconds)(file-modification-time servinf)) 60))
				   (begin
				     (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.")
				     (handle-exceptions
					 exn
				       #f
				       (delete-file* servinf))
				     (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))))))
		       (begin ;; no server file, delay and try again
			 (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf)
			 (thread-sleep! 1)
			 (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))))
		 (begin ;; this case is where res is malformed. Probably should abort
		   (assert #f "FATAL: tt:handler received bad data "res)
		   ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.")
		   ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)
		   )))))
	(begin
	  (thread-sleep! 1) ;; give it a rest and try again
	  (thread-sleep! 1) ;; no conn yet set up, give it a rest and try again
	  (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe)))))

(define (tt:bid-for-servership run-id)
  #f)

;; gets server info and appends path to server file
;; sorts by age, oldest first

Added utils/load-the-db.scm version [46853b5895].































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
;; start the repl and then load this file

(define start-time (current-seconds))

(let loop ((last-print 0)
	   (num-calls  0))
  (let ((all-run-ids (rmt:get-all-run-ids))
	(do-print    (> (- (current-seconds) last-print) 2))
	(max-query   0))
    (for-each
     (lambda (run-id)
       ;; (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)
       (let* ((all-run-data (rmt:get-tests-for-run run-id "%" '() '() #f #f #f #f #f #f #f #f)))
	 (for-each
	  (lambda (testdat)
	    (let* ((test-id (vector-ref testdat 0))
		   (start-at (current-milliseconds))
		   (testinfo (rmt:get-test-info-by-id run-id test-id))
		   (query-time (- (current-milliseconds) start-at)))
	      (if (> query-time max-query)
		  (set! max-query query-time))))
	  all-run-data)
	 (if do-print
	     (print "Running "(- (current-seconds) start-time)"s, run "run-id" has "(length all-run-data)" tests, max query "max-query))))
     all-run-ids)
    (loop (if do-print
	      (current-seconds)
	      last-print)
	  (+ num-calls (length all-run-ids)))))