Megatest

Check-in [9771b5d5a9]
Login
Overview
Comment:load control is working but servers are still getting wedged over time
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.80
Files: files | file ages | folders
SHA1: 9771b5d5a97110ffe8a2cc963916648db3c2dd35
User & Date: matt on 2023-04-10 06:09:34
Other Links: branch diff | manifest | tags
Context
2023-04-10
07:58
fixed start up wedging check-in: ce4cc8997a user: matt tags: v1.80
06:09
load control is working but servers are still getting wedged over time check-in: 9771b5d5a9 user: matt tags: v1.80
2023-04-09
22:14
flag loaded at 50 threads. check-in: 4c1e85ecfb user: matt tags: v1.80
Changes

Modified tcp-transportmod.scm from [bee5aadbcc] to [f7ad6026cc].

229
230
231
232
233
234
235
236
237


238
239
240
241
242
243
244
229
230
231
232
233
234
235


236
237
238
239
240
241
242
243
244







-
-
+
+







			 (thread-sleep! delay-wait)))))
	     (case status
	       ((busy) ;; result will be how long the server wants you to delay
		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is overloaded, will try again in "result" seconds.")
		(thread-sleep! (if (number? result) result 2))
		(tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       ((loaded)
		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, will try again in a 1/4 second.")
		(tt:backoff-incr (tt-host conn)(tt-port conn))
		(debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.")
		(tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn))
		result) ;; (tt:handler  ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))
	       (else
		result)))
	    (else ;; did not receive properly formated result
	     (if (not res) ;; tt:handler is telling us that communication failed
		 (let* ((host    (tt-conn-host conn))
			(port    (tt-conn-port conn))
345
346
347
348
349
350
351
352

353
354
355
356
357
358
359

360

361
362
363
364
365
366
367
345
346
347
348
349
350
351

352
353
354
355
356
357
358
359
360

361
362
363
364
365
366
367
368







-
+







+
-
+







  (let* ((host-port (conc host":"port))
	 (bkoff     (hash-table-ref/default *tt:backoff-smoothing* host-port #f)))
    (if bkoff
	(let* ((wait-delay (tt:backoff-wait-delay bkoff))
	       (last-ioerr (tt:backoff-last-ioerr bkoff))
	       (last-adj-t (tt:backoff-last-adj-t bkoff))
	       (delta      (- (current-seconds) last-adj-t))
	       (adj        (* delta 0.01)) ;; it takes ten seconds to recover from hitting an io err
	       (adj        (* delta 0.001)) ;; it takes 100 seconds to recover from hitting an io err
	       (new-wait   (if (> wait-delay 0)
			       (if (> adj wait-delay)
				   0
				   (- wait-delay adj))
			       0)))
	  (if (> new-wait 0)
	      (begin
		(if (common:low-noise-print 10 "delay wait message")
		(debug:print-info 0 *default-log-port* "Server loaded, DelayWait: "new-wait)
		    (debug:print-info 0 *default-log-port* "Server loaded, DelayWait: "new-wait))
		(tt:backoff-wait-delay-set! bkoff new-wait)
		(tt:backoff-last-adj-t-set! bkoff (current-seconds))
		(thread-sleep! new-wait))
	      (hash-table-delete! *tt:backoff-smoothing* host-port))))))

(define (tt:send-receive-direct host port dat #!key (ping-mode #f)(tries-remaining 25))
  (assert (number? port) "FATAL: tt:send-receive-direct called with port not a number "port)
399
400
401
402
403
404
405
406

407
408
409
410
411
412
413
400
401
402
403
404
405
406

407
408
409
410
411
412
413
414







-
+







	  (full-err-print exn  "ERROR: i/o error")
	  (tt:backoff-incr host port)
	  #f)
     (exn (i/o net)
	  (if ping-mode
	      #f
	      (if (>= tries-remaining 0)
		  (let* ((backoff-delay (* (- 26 tries-remaining) 0.5)))
		  (let* ((backoff-delay (* (- 26 tries-remaining) 0.1)))
		    (debug:print 0 *default-log-port* "WARNING: TCP overload, trying again in "backoff-delay"s.")
		    (thread-sleep! backoff-delay)
		    (tt:backoff-incr host port)
		    (retry))
		  (assert #f "FATAL: Too many retries in tt:send-receive-direct"))))
     (exn ()
	  (full-err-print exn "Unhandled exception from client side.")

Modified utils/load-the-db.scm from [46853b5895] to [92b9fb2b93].

1
2
3
4
5
6
7
8
9







10
11
12
13

14
15
16
17
18
19
20
21
22
23





24


25
26
27
28
29

30
1
2
3
4
5
6



7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33

34
35
36
37
38
39

40
41






-
-
-
+
+
+
+
+
+
+




+










+
+
+
+
+
-
+
+




-
+

;; start the repl and then load this file

(define start-time (current-seconds))

(let loop ((last-print 0)
	   (num-calls  0))
  (let ((all-run-ids (rmt:get-all-run-ids))
	(do-print    (> (- (current-seconds) last-print) 2))
	(max-query   0))
  (let* ((all-run-ids (rmt:get-all-run-ids))
	 (do-print    (> (- (current-seconds) last-print) 2))
	 (max-query   0)
	 (num-calls   (+ num-calls
			 1                    ;; account for call above
			 (length all-run-ids) ;; account for the get-tests-for-run in the for-each below
			 )))
    (for-each
     (lambda (run-id)
       ;; (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)
       (let* ((all-run-data (rmt:get-tests-for-run run-id "%" '() '() #f #f #f #f #f #f #f #f)))
	 (set! num-calls (+ num-calls (length all-run-data)))
	 (for-each
	  (lambda (testdat)
	    (let* ((test-id (vector-ref testdat 0))
		   (start-at (current-milliseconds))
		   (testinfo (rmt:get-test-info-by-id run-id test-id))
		   (query-time (- (current-milliseconds) start-at)))
	      (if (> query-time max-query)
		  (set! max-query query-time))))
	  all-run-data)
	 (if do-print
	     (let* ((run-time (- (current-seconds) start-time))
		    (qry-rate (if (> run-time 0)
				  (inexact->exact (round (/  num-calls run-time)))
				  -1)))
		(print "Running "run-time"s, run "run-id
	     (print "Running "(- (current-seconds) start-time)"s, run "run-id" has "(length all-run-data)" tests, max query "max-query))))
		    " has "(length all-run-data)" tests, max query "max-query
		    "ms with avg query rate "qry-rate" qry/s")))))
     all-run-ids)
    (loop (if do-print
	      (current-seconds)
	      last-print)
	  (+ num-calls (length all-run-ids)))))
	  num-calls)))