︙ | | |
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
|
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
-
+
+
-
+
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
|
;;======================================================================
(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id
;; how to make area-dat
(define (rmt:set-ttdat areapath ttdat)
(if (not ttdat)
(if ttdat
ttdat
(let* ((newremote (make-and-init-remote areapath)))
(set! *ttdat* newremote)
ttdat)))
newremote)))
;; NB// area-dat replaced by ttdat
;;
(define (rmt:send-receive cmd run-id params #!key (attemptnum 1)(ttdat #f))
(assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.")
(assert ttdat "FATAL: rmt:send-receive must receive initialized area-dat")
(let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value
(areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas
(attemptnum (+ 1 attemptnum))
(readonly-mode (rmtmod:calc-ro-mode ttdat *toppath*))
(testsuite (common:get-testsuite-name))
(mtexe (common:find-local-megatest))
(dbfname (conc (dbfile:run-id->dbnum run-id)".db")))
(rmt:set-ttdat areapath ttdat)
(tt:handler (rmt:set-ttdat ttdat) cmd run-id params
attemptnum readonly-mode dbfname
testsuite mtexe)))
(dbfname (conc (dbfile:run-id->dbnum run-id)".db"))
(ttdat (rmt:set-ttdat areapath ttdat))
(conn (tt:get-conn ttdat dbfname))
(is-main (equal? dbfname "main.db")) ;; why not (not run-id) ?
(server-start-proc (if is-main
#f
(lambda ()
;; (debug:print-info 0 *default-log-port* "starting server for dbfname: "dbfname)
(rmt:start-server ;; tt:server-process-run
areapath
testsuite ;; (dbfile:testsuite-name)
mtexe
run-id)))))
;; here we look at ttdat, if dbfname is NOT main.db we check that a conn exists for it
;; and if there is no conn we first send a request to the main.db server to start a
;; server for the dbfname.
#;(if (and (not is-main)(not conn)) ;; no existing connection to non-main server, call in a start up request
(begin
(server-start-proc)
(thread-sleep! 1)))
(tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc)))
;; KEEP THIS HERE FOR A WHILE, WE MAY WANT TO RESURECT IT
;; (define (nfs-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe)
;; (let* ((keys (common:get-fields *configdat*))
;; (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath tmpadj: "/dashboard")))
;; (api:dispatch-request dbstruct cmd run-id params)))
|
︙ | | |
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
|
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
|
-
-
+
+
-
+
-
+
|
;;======================================================================
;; S E R V E R
;;======================================================================
(define (rmt:kill-server run-id)
(rmt:send-receive 'kill-server run-id (list run-id)))
(define (rmt:start-server run-id)
(rmt:send-receive 'start-server 0 (list run-id)))
(define (rmt:start-server areapath testsuite mtexe run-id) ;; run on main.db server
(rmt:send-receive 'start-server #f (list areapath testsuite mtexe run-id)))
;;======================================================================
;; M I S C
;;======================================================================
(define (rmt:login run-id)
(rmt:send-receive 'login run-id (list *toppath* megatest-version (client:get-signature))))
;; This login does no retries under the hood - it acts a bit like a ping.
;; Deprecated for nmsg-transport.
;;
;; (define (rmt:login-no-auto-client-setup runremote)
;; (rmt:send-receive-no-auto-client-setup runremote 'login 0 (list *toppath* megatest-version (client:get-signature))))
;; (rmt:send-receive-no-auto-client-setup runremote 'login #f (list *toppath* megatest-version (client:get-signature))))
;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host
(define (rmt:get-latest-host-load hostname)
(rmt:send-receive 'get-latest-host-load 0 (list hostname)))
(rmt:send-receive 'get-latest-host-load #f (list hostname)))
(define (rmt:sdb-qry qry val run-id)
;; add caching if qry is 'getid or 'getstr
(rmt:send-receive 'sdb-qry run-id (list qry val)))
;; NOT COMPLETED
(define (rmt:runtests user run-id testpatt params)
|
︙ | | |
︙ | | |
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
|
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
|
+
-
+
+
+
+
-
+
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
-
|
;; parameters
;;
(define tt-server-timeout-param (make-parameter 600))
;; make ttdat visible
(define *server-info* #f)
(define *server-run* #t)
(define (tt:make-remote areapath)
(make-tt areapath: areapath))
;; 1 ... or #f
;; and check that dbfname matches. FIXME: the propagation of dbfname and run-id
;; might not make the best sense
;;
(define (tt:valid-run-id run-id dbfname)
(and (or (number? run-id)
(not run-id))
(equal? (dbfile:run-id->dbfname run-id) dbfname)))
(tcp-buffer-size 2048)
;; (max-connections 4096)
;; (max-connections 4096)
(define (tt:get-conn ttdat dbfname)
(hash-table-ref/default (tt-conns ttdat) dbfname #f))
;; do all the busy work of finding and setting up conn for
;; connecting to a server
;;
(define (tt:client-connect-to-server ttdat dbfname run-id testsuite)
(define (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)
(assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id)
(debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id)
(let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f))
(server-start-proc (lambda ()
(print "dbfname: "dbfname)
(assert (equal? dbfname "main.db") ;; only main.db is started here
"FATAL: called server-start-proc for db other than main.db")
(tt:server-process-run
(tt-areapath ttdat)
testsuite ;; (dbfile:testsuite-name)
(common:find-local-megatest)
run-id))
(let* ((conn (tt:get-conn ttdat dbfname))
(server-start-proc (or server-start-proc
(lambda ()
(assert (equal? dbfname "main.db") ;; only main.db is started here
"FATAL: called server-start-proc for db other than main.db")
(tt:server-process-run
(tt-areapath ttdat)
testsuite ;; (dbfile:testsuite-name)
(common:find-local-megatest)
run-id)))))
))
(if conn
(begin
(debug:print-info 2 *default-log-port* "already connected to a server")
conn) ;; we are already connected to the server
(let* ((sdat (tt:get-current-server-info ttdat dbfname)))
(match sdat
((host port start-time server-id pid dbfname2 servinffile)
|
︙ | | |
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
|
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
|
-
+
-
+
-
+
|
((running)
(debug:print-info 2 *default-log-port* "Setting conn = " conn " in hash table")
(hash-table-set! (tt-conns ttdat) dbfname conn) ;;; is this ok to save before validating that the connection is good?
conn)
((starting)
(thread-sleep! 0.5)
(debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect")
(tt:client-connect-to-server ttdat dbfname run-id testsuite))
(tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))
(else
(let* ((curr-secs (current-seconds)))
;; rm the (last server) would go here
(if (> (- curr-secs (tt-last-serv-start ttdat)) 10)
(begin
(tt-last-serv-start-set! ttdat curr-secs)
(server-start-proc))) ;; start server if 10 sec since last attempt
(thread-sleep! 1)
(debug:print-info 2 *default-log-port* "server ping result was neither running nor starting. Retrying connect")
(tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
(tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)))))))
(else ;; no good server found, if haven't started server in > 5 secs, start another
(if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers
(begin
(debug:print-info 0 *default-log-port* "Starting server for "dbfname)
(server-start-proc)
(tt-last-serv-start-set! ttdat (current-seconds))
(thread-sleep! 3)
))
(thread-sleep! 1)
(debug:print-info 0 *default-log-port* "Connect to server for " dbfname)
(tt:client-connect-to-server ttdat dbfname run-id testsuite)))))))
(tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)))))))
(define (tt:timed-ping host port server-id)
(let* ((start-time (current-milliseconds))
(result (tt:ping host port server-id)))
(cons result (- (current-milliseconds) start-time))))
|
︙ | | |
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
|
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
|
-
+
-
+
|
;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res)
(try-again)))))
;; client side handler
;;
;;(tt:handler #<tt> get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest")
;;
(define (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe)
(define (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc)
;; connect-to-server will start a server if needed.
(let* ((areapath (tt-areapath ttdat))
(conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; looks up conn keyed by dbfname
(conn (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))) ;; looks up conn keyed by dbfname
(if conn
;; have connection, call the server
(let* ((res (tt:send-receive ttdat conn cmd run-id params)))
;; res is (status errmsg result meta)
(match res
((status errmsg result meta)
(if (list? meta)
|
︙ | | |
558
559
560
561
562
563
564
565
566
567
568
569
570
571
|
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
|
+
+
+
|
(tt:create-server-registration-file ttdat dbfname)
;; now start watching the last-access, if it hasn't been touched
;; in over ten seconds we exit
(thread-sleep! 0.05) ;; any real need for delay here?
(let loop ()
(let* ((servers (tt:get-server-info-sorted ttdat dbfname))
(ok (cond
((not *server-run*)
(debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.")
#f)
((null? servers) #f) ;; not ok
((equal? (list-ref (car servers) 6) ;; compare the servinfofile
(tt-servinf-file ttdat))
(let* ((res (if db-locked-in
#t
(let* ((lock-result ;; this is the primary lock - need to double verify that got it
(dbfile:with-no-sync-db
|
︙ | | |
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
|
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
|
-
-
+
+
+
|
(cleanup)
(exit)))
(let* ((last-update (dbr:dbstruct-last-update dbstruct))
(curr-secs (current-seconds)))
(if (and (eq? (tt-state ttdat) 'running)
(> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db?
(begin
(set! (file-modification-time (tt-servinf-file ttdat)) (current-seconds))
(let* ((sinfo-file (tt-servinf-file ttdat)))
;; (debug:print 0 *default-log-port* "INFO: touching "sinfo-file)
(set! (file-modification-time sinfo-file) (current-seconds))
((dbr:dbstruct-sync-proc dbstruct) last-update)
(dbr:dbstruct-last-update-set! dbstruct curr-secs))))
(if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param))
(begin
(thread-sleep! 5)
(loop)))))
|
︙ | | |
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
|
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
|
-
-
+
+
+
-
-
+
+
+
+
+
+
+
+
+
|
;; find valid server
;; get servers listed, last part of name must match :<dbfname>
;; if more than one, wait one second and look again
;; future: ping oldest, if alive remove other :<dbfname> files
;;
(define (tt:find-server areapath dbfname)
(let* ((servdir (tt:get-servinfo-dir areapath))
(sfiles (glob (conc servdir"/*:"dbfname))))
(sfiles (glob (conc servdir"/*:"dbfname)))
(goodfiles '()))
;; filter the files here by looking in processes table (if we are not main.db)
;; and or look at the time stamp on the servinfo file, a running server will
;; touch the file every minute (again, this will only apply for main.db)
sfiles))
(for-each (lambda (fname)
(let* ((age (- (current-seconds)(file-modification-time fname))))
(if (> age 10) ;; can't trust it if over ten seconds old
(begin
(debug:print 0 *default-log-port* "WARNING: removing stale servinfo file "fname)
(delete-file fname))
(set! goodfiles (cons fname goodfiles)))))
sfiles)
goodfiles))
;; given a path to a server info file return: host port startseconds server-id pid dbfname logf
;; example of what it's looking for in the log file:
;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4
;;
(define (tt:server-get-info logf)
(let ((server-rx (regexp "^SERVER STARTED: (\\S+):(\\d+) AT ([\\d\\.]+) server-id: (\\S+) pid: (\\d+) dbfname: (\\S+)")) ;; SERVER STARTED: host:port AT timesecs server id
|
︙ | | |
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
|
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
|
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
|
(string->number pid)
dbfname
logf))
(else
(debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst)
bad-dat)))))))))
(define *last-server-start* (make-hash-table))
(define (tt:too-recent-server-start dbfname)
(let* ((last-run-time (hash-table-ref/default *last-server-start* dbfname #f)))
(and last-run-time
(< (- (current-seconds) last-run-time) 5))))
;; Given an area path, start a server process ### NOTE ### > file 2>&1
;; if the target-host is set
;; try running on that host
;; incidental: rotate logs in logs/ dir.
;;
(define (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area
(assert areapath "FATAL: tt:server-process-run called without areapath defined.")
(assert testsuite "FATAL: tt:server-process-run called without testsuite defined.")
(assert mtexe "FATAL: tt:server-process-run called without mtexe defined.")
;; mtest -server - -m testsuite:ext-tests -db 6.db
(let* ((dbfname (dbmod:run-id->dbfname run-id))
(load (get-normalized-cpu-load))
(srvrs (tt:find-server areapath dbfname))
(trying (length srvrs))
(nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname))))
(cond
((> load 2.0)
(debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes")
(thread-sleep! 1)
#f)
((> nrun 100)
(debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.")
(thread-sleep! 1)
#f)
((> trying 2)
(debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
(thread-sleep! 1)
#f)
(else
(if (not (file-exists? (conc areapath"/logs")))
(create-directory (conc areapath"/logs") #t))
(let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))
(cmdln (conc
mtexe
" -startdir "areapath
" -server - ";; (or target-host "-")
" -m testsuite:"testsuite
" -db "dbfname ;; (dbmod:run-id->dbfname run-id)
" " profile-mode
(conc " >> " logfile " 2>&1 &"))))
;; we want the remote server to start in *toppath* so push there
;; (push-directory areapath) ;; use cd in the command line instead
(debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath)
;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
(let* ((dbfname (dbmod:run-id->dbfname run-id)))
(if (tt:too-recent-server-start dbfname)
#f
(let* ((load (get-normalized-cpu-load))
(srvrs (tt:find-server areapath dbfname))
(trying (length srvrs))
(nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname))))
(cond
((> load 2.0)
(debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes")
(thread-sleep! 1)
#f)
((> nrun 100)
(debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.")
(thread-sleep! 1)
#f)
((> trying 2)
(debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.")
(thread-sleep! 1)
#f)
(else
(if (not (file-exists? (conc areapath"/logs")))
(create-directory (conc areapath"/logs") #t))
(let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))
(cmdln (conc
mtexe
" -startdir "areapath
" -server - ";; (or target-host "-")
" -m testsuite:"testsuite
" -db "dbfname ;; (dbmod:run-id->dbfname run-id)
" " profile-mode
(conc " >> " logfile " 2>&1 &"))))
;; we want the remote server to start in *toppath* so push there
;; (push-directory areapath) ;; use cd in the command line instead
(debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath)
;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time))
(system cmdln)
;; ;; use below to go back to nbfake - nbfake does cause trouble ...
;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
;; (setenv "NBFAKE_LOG" logfile)
;; (system (conc "cd "areapath" ; nbfake " cmdln))
;; (unsetenv "NBFAKE_QUIET")
;; (unsetenv "NBFAKE_LOG")
;;(pop-directory)
#t)))))
(system cmdln)
(hash-table-set! *last-server-start* dbfname (current-seconds))
;; ;; use below to go back to nbfake - nbfake does cause trouble ...
;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ...
;; (setenv "NBFAKE_LOG" logfile)
;; (system (conc "cd "areapath" ; nbfake " cmdln))
;; (unsetenv "NBFAKE_QUIET")
;; (unsetenv "NBFAKE_LOG")
;;(pop-directory)
#t)))))))
;;======================================================================
;; tcp connection stuff
;;======================================================================
;; find a port and start tcp-server. This only starts the tcp portion of
;; the server, look at (tt:start-server ...) above for the entry point
|
︙ | | |