Megatest

Check-in [0710176e69]
Login
Overview
Comment:No more starting servers for not main.db
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.80-revolution
Files: files | file ages | folders
SHA1: 0710176e691632fa3a64ae786e907aa72e245249
User & Date: matt on 2023-11-12 18:51:56
Other Links: branch diff | manifest | tags
Context
2023-11-13
14:55
Trimmed and cleaned up some rmt and tcp-transport code in prep for rip up and retry check-in: aa4d6364a1 user: mrwellan tags: v1.80-revolution
2023-11-12
18:51
No more starting servers for not main.db check-in: 0710176e69 user: matt tags: v1.80-revolution
2023-11-11
18:35
Cherrypicked cddccb0, 4310, (kinda) passes list-runs check-in: 0e0e9f5e2d user: matt tags: v1.80-revolution
Changes

Modified tcp-transportmod.scm from [f3861613c7] to [f8e0cc5988].

133
134
135
136
137
138
139



140
141
142
143
144

145
146
147
148
149
150
151
;; connecting to a server
;; 
(define (tt:client-connect-to-server ttdat dbfname run-id testsuite)
  (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id)
  (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id)
  (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f))
	 (server-start-proc (lambda ()



			      (tt:server-process-run
			       (tt-areapath ttdat)
			       testsuite ;; (dbfile:testsuite-name)
			       (common:find-local-megatest)
			       run-id))))

    (if conn
	(begin 
          (debug:print-info 2 *default-log-port* "already connected to a server")
           conn) ;; we are already connected to the server
	(let* ((sdat (tt:get-current-server-info ttdat dbfname)))
	  (match sdat
	    ((host port start-time server-id pid dbfname2 servinffile)







>
>
>




|
>







133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
;; connecting to a server
;; 
(define (tt:client-connect-to-server ttdat dbfname run-id testsuite)
  (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id)
  (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id)
  (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f))
	 (server-start-proc (lambda ()
			      (print "dbfname: "dbfname)
			      (assert (equal? dbfname "main.db") ;; only main.db is started here
				      "FATAL: called server-start-proc for db other than main.db")
			      (tt:server-process-run
			       (tt-areapath ttdat)
			       testsuite ;; (dbfile:testsuite-name)
			       (common:find-local-megatest)
			       run-id))
			    ))
    (if conn
	(begin 
          (debug:print-info 2 *default-log-port* "already connected to a server")
           conn) ;; we are already connected to the server
	(let* ((sdat (tt:get-current-server-info ttdat dbfname)))
	  (match sdat
	    ((host port start-time server-id pid dbfname2 servinffile)
714
715
716
717
718
719
720





721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736

737
738
739
740
741
742
743
;; get servers listed, last part of name must match :<dbfname>
;; if more than one, wait one second and look again
;; future: ping oldest, if alive remove other :<dbfname> files
;;
(define (tt:find-server areapath dbfname)
  (let* ((servdir  (tt:get-servinfo-dir areapath))
	 (sfiles   (glob (conc servdir"/*:"dbfname))))





    sfiles))

;; given a path to a server info file return: host port startseconds server-id pid dbfname logf
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
;;
(define (tt:server-get-info logf)
  (let ((server-rx    (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx    (regexp "^SERVER: dbprep"))
        (dbprep-found 0)
	(bad-dat      (list #f #f #f #f #f #f logf)))
     (let ((fdat     (handle-exceptions
			 exn
		       (begin
			 ;; WARNING: this is potentially dangerous to blanket ignore the errors
			 (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn="(condition->list exn))

			 '()) ;; no idea what went wrong, call it a bad server, return empty list
		       (with-input-from-file logf read-lines))))
       (if (null? fdat) ;; bad data, return bad-dat
	   bad-dat
	   (let loop ((inl  (car fdat))
		      (tail (cdr fdat))
		      (lnum 0))







>
>
>
>
>














|
|
>







718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
;; get servers listed, last part of name must match :<dbfname>
;; if more than one, wait one second and look again
;; future: ping oldest, if alive remove other :<dbfname> files
;;
(define (tt:find-server areapath dbfname)
  (let* ((servdir  (tt:get-servinfo-dir areapath))
	 (sfiles   (glob (conc servdir"/*:"dbfname))))
    
    ;; filter the files here by looking in processes table (if we are not main.db)
    ;; and or look at the time stamp on the servinfo file, a running server will
    ;; touch the file every minute (again, this will only apply for main.db)
    
    sfiles))

;; given a path to a server info file return: host port startseconds server-id pid dbfname logf
;; example of what it's looking for in the log file:
;;     SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 
;;
(define (tt:server-get-info logf)
  (let ((server-rx    (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
        (dbprep-rx    (regexp "^SERVER: dbprep"))
        (dbprep-found 0)
	(bad-dat      (list #f #f #f #f #f #f logf)))
     (let ((fdat     (handle-exceptions
			 exn
		       (begin
			 ;; BUG, TODO: add err checking, for now blanket ignore the errors?
			 (debug:print-info 0 *default-log-port* "Unable to get server info from "logf
					   ", exn="(condition->list exn))
			 '()) ;; no idea what went wrong, call it a bad server, return empty list
		       (with-input-from-file logf read-lines))))
       (if (null? fdat) ;; bad data, return bad-dat
	   bad-dat
	   (let loop ((inl  (car fdat))
		      (tail (cdr fdat))
		      (lnum 0))
771
772
773
774
775
776
777
778

779
780
781
782
783

784
785
786

787
788
789

790
791
792
793
794
795
796
(define  (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area
  (assert areapath  "FATAL: tt:server-process-run called without areapath defined.")
  (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.")
  (assert mtexe     "FATAL: tt:server-process-run called without mtexe defined.")
  ;; mtest -server - -m testsuite:ext-tests -db 6.db
  (let* ((dbfname  (dbmod:run-id->dbfname run-id))
	 (load     (get-normalized-cpu-load))
	 (trying   (length (tt:find-server areapath dbfname)))

	 (nrun     (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname))))
    (cond
     ((> load 2.0)
      (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server.")
      (thread-sleep! 1))

     ((> nrun 100)
      (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.")
      (thread-sleep! 1))

     ((> trying 4)
      (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
      (thread-sleep! 1))

     (else
      (if (not (file-exists? (conc areapath"/logs")))
	      (create-directory (conc areapath"/logs") #t))
	  (let* ((logfile   (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))
		 (cmdln     (conc
			     mtexe
			     " -startdir "areapath







|
>



|
|
>


|
>
|

|
>







781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
(define  (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area
  (assert areapath  "FATAL: tt:server-process-run called without areapath defined.")
  (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.")
  (assert mtexe     "FATAL: tt:server-process-run called without mtexe defined.")
  ;; mtest -server - -m testsuite:ext-tests -db 6.db
  (let* ((dbfname  (dbmod:run-id->dbfname run-id))
	 (load     (get-normalized-cpu-load))
	 (srvrs    (tt:find-server areapath dbfname))
	 (trying   (length srvrs))
	 (nrun     (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname))))
    (cond
     ((> load 2.0)
      (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes")
      (thread-sleep! 1)
      #f)
     ((> nrun 100)
      (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.")
      (thread-sleep! 1)
      #f)
     ((> trying 2)
      (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
      (thread-sleep! 1)
      #f)
     (else
      (if (not (file-exists? (conc areapath"/logs")))
	      (create-directory (conc areapath"/logs") #t))
	  (let* ((logfile   (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))
		 (cmdln     (conc
			     mtexe
			     " -startdir "areapath
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
	    ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
	    ;; (setenv "NBFAKE_LOG" logfile)
	    ;; (system (conc "cd "areapath" ; nbfake " cmdln))
	    ;; (unsetenv "NBFAKE_QUIET")
	    ;; (unsetenv "NBFAKE_LOG")
	    
	    ;;(pop-directory)
	    )))))

;;======================================================================
;; tcp connection stuff
;;======================================================================

;; find a port and start tcp-server. This only starts the tcp portion of
;; the server, look at (tt:start-server ...) above for the entry point







|







823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
	    ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
	    ;; (setenv "NBFAKE_LOG" logfile)
	    ;; (system (conc "cd "areapath" ; nbfake " cmdln))
	    ;; (unsetenv "NBFAKE_QUIET")
	    ;; (unsetenv "NBFAKE_LOG")
	    
	    ;;(pop-directory)
	    #t)))))

;;======================================================================
;; tcp connection stuff
;;======================================================================

;; find a port and start tcp-server. This only starts the tcp portion of
;; the server, look at (tt:start-server ...) above for the entry point