Megatest

Changes On Branch e05028a28e535a07
Login

Changes In Branch v1.80-revolution-multi-server Through [e05028a28e] Excluding Merge-Ins

This is equivalent to a diff from d4f2f2c1ef to e05028a28e

2023-12-16
19:50
Merged 7372 check-in: f605e2b0d5 user: matt tags: v1.80-revolution-multi-server
2023-12-04
05:44
Bumped version check-in: f603771e69 user: mrwellan tags: v1.80-revolution, v1.8023
2023-12-03
22:12
more prep for multiple servers per db check-in: e05028a28e user: matt tags: v1.80-revolution-multi-server
17:49
Added todo items check-in: d4f2f2c1ef user: matt tags: v1.80-revolution, v1.8023
02:16
Proper calling of exit cleanup for servers. Go back to nbfake for running servers (proper logs kept). Remove .servinfo file for a server that does not respond to ping (returns #f). check-in: 2725343c92 user: matt tags: v1.80-revolution

Modified rmt.scm from [c3f010a183] to [478ec22375].

96
97
98
99
100
101
102
103













104
105
106
107
108
109
110
96
97
98
99
100
101
102

103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122







-
+
+
+
+
+
+
+
+
+
+
+
+
+







				#f
				(lambda ()
				  ;; (debug:print-info 0 *default-log-port* "starting server for dbfname: "dbfname)
				  (rmt:start-server ;; tt:server-process-run
				   areapath
				   testsuite ;; (dbfile:testsuite-name)
				   mtexe
				   run-id)))))
				   run-id))))
	 ;; current method does not take advantage of simply getting the list of
	 ;; servers from no-sync db. srv-get-proc would be a first step but is not used yet
	 (srv-get-proc (lambda ()
			 (let* ((candidates (rmt:get-process-options "server" dbfname))
				(ccount     (length candidates)))
			   (case ccount
			     ((0) #f) ;; need to call rmt:start-server
			     ((1) (car candidates))
			     (else
			      (if (> (random 100) 50)
				  (car candidates)
				  (cadr candidates))))))))
    ;; here we look at ttdat, if dbfname is NOT main.db we check that a conn exists for it
    ;; and if there is no conn we first send a request to the main.db server to start a
    ;; server for the dbfname.
    #;(if (and (not is-main)(not conn)) ;; no existing connection to non-main server, call in a start up request
	(begin
	  (server-start-proc)
	  (thread-sleep! 1)))

Modified tcp-transportmod.scm from [5cef09f36d] to [108f76cedd].

149
150
151
152
153
154
155
156
157
158
159





160
161
162
163
164
165
166
149
150
151
152
153
154
155




156
157
158
159
160
161
162
163
164
165
166
167







-
-
-
-
+
+
+
+
+







				   testsuite ;; (dbfile:testsuite-name)
				   (common:find-local-megatest)
				   run-id)))))
    (if conn
	(begin 
          (debug:print-info 2 *default-log-port* "already connected to a server")
           conn) ;; we are already connected to the server
	(let* ((sdats (tt:get-server-info-sorted ttdat dbfname))
	       (sdat  (if (null? sdats)
			  #f
			  (car sdats))))
	(let* (;; (sdats (tt:get-server-info-sorted ttdat dbfname))
	       ;; (sdat  (if (null? sdats)
	       ;;	  #f
	       ;;	  (car sdats))))
	       (sdat (tt:get-valid-server-random ttdat dbfname)))
	  (match sdat
	    ((host port start-time server-id pid dbfname2 servinffile)
	     (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.")
             (debug:print-info 2 *default-log-port* "no conn - in match servinffile:" servinffile)
	     (let* ((host-port (conc host":"port))
		    (conn (make-tt-conn
			   host: host
347
348
349
350
351
352
353
354




















355
356
357
358
359
360
361
348
349
350
351
352
353
354

355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381







-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







     (lambda (rec)
       (if (or (> (length sorted) 1)
	       (common:low-noise-print 120 "server info sorted"))
	   (debug:print 2 *default-log-port* "SERVER #"count": "(string-intersperse (map conc sorted) ", ")))
       (set! count (+ count 1)))
     sorted)
    sorted))
    

(define (tt:get-valid-server-random ttdat dbfname)
  (let* ((candidates (tt:get-server-info-sorted ttdat dbfname))
	 (numc       (length candidates)))
    (case numc
      ((0) #f)
      ((1) (car candidates))
      (else
       (let* ((firsthost        (caar candidates))
	      (valid-candidates (filter (lambda (x)(equal? (car x) firsthost)) candidates))
	      (numvalid         (length valid-candidates)))
	 (case numvalid
	   ((0) (debug:print 0 *default-log-port* "ERROR: code issue, filter broke?") #f)
	   ((1) (car valid-candidates))
	   (else
	    ;; expand logic here to support more than two servers
	    (if (> (random 100) 50)
		(car valid-candidates)
		(cadr valid-candidates)))))))))
   
(define (tt:send-receive ttdat conn cmd run-id params)
  (let* ((host-port (tt-conn-host-port conn)) ;; (conc (tt-conn-host conn)":"(tt-conn-port conn)))
	 (host      (tt-conn-host conn))
	 (port      (tt-conn-port conn))
	 (dat       (list cmd run-id params #f))) ;; no meta data yet
    (tt:send-receive-direct host port dat)))

500
501
502
503
504
505
506



507

508

509
510
511
512
513
514
515
520
521
522
523
524
525
526
527
528
529
530
531

532
533
534
535
536
537
538
539







+
+
+

+
-
+







		;; contact servers via ping, if no response remove the .servinfo file
		(let loop ((servrs     servers)
			   (prime-host #f)
			   (result    '()))
		  (if (null? servrs)
		      (reverse result)
		      (let* ((servdat (car servrs)))
	;;; INFO: (0) 23:08:10 ERROR: bad servinfo record
	;;; "(127.0.1.1 36797 1701662813.0 88fff570fa3996d6082df8a1875e6cb1 15462 6.db /home/matt/data/megatest/ext-tests/sixtyfivek/.servinfo/127.0.1.1:36797-15462:6.db)"
			     
			(match servdat
			  ;; host port startt server-id pid dbfname servinffilr
			     ((host port startseconds server-id servinfofile)
			  ((host port startseconds server-id pid dbfname servinfofile)
			      (let* ((ping-res  (tt:timed-ping host port server-id))
				     (good-ping (match ping-res
						   ((result . ping-time)
						    (not result)) ;; we couldn't reach the server or it was not a megatest server
						   (else #f))) ;; the ping failed completely?
				     (same-host (or (not prime-host) ;; i.e. this is the first host
						    (equal? prime-host host)))