Megatest

Changes On Branch 66e4d2383a4e8dd4
Login

Changes In Branch v1.81-multi-server Through [66e4d2383a] Excluding Merge-Ins

This is equivalent to a diff from 6b8bacba76 to 66e4d2383a

2024-07-08
12:55
CI/CD: Automated commit after successful test, build, and deploy for v1.81-fix-api-changed check-in: 0249193b68 user: ramartin tags: v1.81
2024-07-03
19:06
Multi-servers working. Needs polish but the machine didn't overload on > 10 parallel runs of sixtyfivek check-in: 16d75bb8f1 user: matt tags: v1.81-multi-server
12:50
Fixed an unbound variable and made it handle 1.81. Leaf check-in: dc561cf357 user: mmgraham tags: v1.81-fix-api-changed
2024-07-02
13:55
Start on increasing allowed number of servers for a db file check-in: 66e4d2383a user: mrwellan tags: v1.81-multi-server
2024-07-01
17:27
Changed version to v1.8101 check-in: 6b8bacba76 user: mmgraham tags: v1.81, v1.8101
16:57
v1.81-test-control-panel branch into 1.81 check-in: a6b2094831 user: ramartin tags: v1.81

Modified tcp-transportmod.scm from [2d80f8e52f] to [f5588f1f60].

301
302
303
304
305
306
307
308

309
310
311
312
313
314
315
301
302
303
304
305
306
307

308
309
310
311
312
313
314
315







-
+







      (else
       ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res)
       (try-again)))))

;; client side handler
;;
;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
;;
;;g
(define (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc)
  ;; connect-to-server will start a server if needed.
  (let* ((areapath (tt-areapath ttdat))
	 (conn     (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))) ;; looks up conn keyed by dbfname
    (if conn
	;; have connection, call the server
	(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
592
593
594
595
596
597
598
599

600
601
602
603
604
605
606
592
593
594
595
596
597
598

599
600
601
602
603
604
605
606







-
+







			      #f
			      (caar good-srvrs))))
	  ;; by here we have a trustworthy list of servers and we have removed the .servinfo file for any unresponsive servers
	  ;; and the list is in good-srvrs
	  (cond
	   ((not home-host) ;; no servers yet, go ahead and start
	    (debug:print-info 0 *default-log-port* "No servers yet, starting on "(get-host-name)))
	   ((> (length good-srvrs) 2) ;; don't need more, just exit
	   ((> (length good-srvrs) 3) ;; don't need more, just exit
	    (debug:print-info 0 *default-log-port* "Have "(length good-srvrs)", no need for more, exiting.")
	    (exit))
	   ((not (equal? home-host (get-host-name))) ;; there is a home-host and we are not on it
	    (debug:print-info 0 *default-log-port* "Prime main server is on host "home-host", but we are on host "(get-host-name)", exiting.")
	    (exit))
	   (else
	    (debug:print-info 0 *default-log-port* "Starting on host "(get-host-name)", along with "(length good-srvrs)" other servers.")))
620
621
622
623
624
625
626


627
628
629
630
631
632
633
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635







+
+







		      (thread-sleep! 0.25)
		      (loop (+ count 1)))
		    (begin
		      (debug:print 0 *default-log-port* "ERROR: (tt-port ttdat) no port set! Exiting.")
		      (exit)))))
	  
	  ;; create a servinfo file start keep-running
	  ;; On WSL there seems to be a race condition where the .servinfo file
	  ;; is not created fast enough
          (debug:print 0 *default-log-port* "Creating servinfo file for " dbfname)
	  (tt:create-server-registration-file ttdat dbfname)
	  (procinf-status-set! *procinf* "running")
	  (tt-state-set! ttdat 'running)
	  (dbfile:with-no-sync-db
	   nosyncdbpath
	   (lambda (nsdb)
662
663
664
665
666
667
668



669
670
671
672
673
674
675
676


677
678
679
680
681
682
683
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679


680
681
682
683
684
685
686
687
688







+
+
+






-
-
+
+







			    #f
			    (caar servers)))
	     (my-index  (list-index (lambda (x)
				      (equal? (list-ref x 6)
					      (tt-servinf-file ttdat)))
				    servers))
	     (ok         (cond
			  ((not my-index)
			   (debug:print 0 *default-log-port* "WARNING: Apparently I don't exist.")
			   'not-yet) ;; keep trying ?
			  ((not *server-run*)
			   (debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.")
			   #f)
			  ((null? servers)
			   (debug:print 0 *default-log-port* "WARNING: no servinfo files found, this cannot be.")
			   #f) ;; not ok
			  ((> my-index 2)
			   (debug:print 0 *default-log-port* "WARNING: there are more than two servers ahead of me, I'm not needed, exiting.")
			  ((> my-index 3)
			   (debug:print 0 *default-log-port* "WARNING: there are more than three servers ahead of me, I'm not needed, exiting.")
			   #f) ;; not ok to not be in first three
			  ((eq? (tt-state ttdat) 'running) #t) ;; we are good to keep going
			  ((> (- (current-seconds) start-time) 30)
			   (debug:print 0 *default-log-port* "WARNING: over 30 seconds and not yet in runnning mode. Exiting.")
			   #f)
			  (else #t))))
	(if ok
726
727
728
729
730
731
732







733

734
735
736
737
738
739
740
731
732
733
734
735
736
737
738
739
740
741
742
743
744

745
746
747
748
749
750
751
752







+
+
+
+
+
+
+
-
+







	 (servinf (conc servdir"/"host":"port"-"(current-process-id)":"dbfname))
	 (serv-id (tt:mk-signature areapath)))
    (assert (and host port) "FATAL: tt:create-server-registration-file called with no conn, dbfname="dbfname)
    (tt-servinf-file-set! ttdat servinf)
    (with-output-to-file servinf
      (lambda ()
	(print "SERVER STARTED: "host":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)" dbfname: "dbfname)))
    (let loop ((count 0))
      (if (not (file-exists? servinf))
	  (begin
	    (debug:print 0 *default-log-port* "WARNING: file "servinf" was created but it doesn't show up on disk! We'll try again.")
	    (thread-sleep! 1)
	    (if (< count 10)
		(loop (+ count 1))))))
      serv-id))
    serv-id))

;; find valid server
;; get servers listed, last part of name must match :<dbfname>
;; if more than one, wait one second and look again
;; future: ping oldest, if alive remove other :<dbfname> files
;;
(define (tt:find-server areapath dbfname)
827
828
829
830
831
832
833
834

835
836
837
838
839
840
841
839
840
841
842
843
844
845

846
847
848
849
850
851
852
853







-
+







	    (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes")
	    (thread-sleep! 1)
	    #f)
	   ((> nrun 100)
	    (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.")
	    (thread-sleep! 1)
	    #f)
	   ((> trying 2)
	   ((> trying 3)
	    (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
	    (thread-sleep! 1)
	    #f)
	   (else
	    (if (not (file-exists? (conc areapath"/logs")))
		(create-directory (conc areapath"/logs") #t))
	    (let* ((logfile   (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))