Megatest

Diff
Login

Differences From Artifact [c28712df60]:

To Artifact [3c791622d1]:


60
61
62
63
64
65
66
67

68
69
70
71
72
73
74
60
61
62
63
64
65
66

67
68
69
70
71
72
73
74







-
+







(define *heartbeat-mutex* (make-mutex))

;;======================================================================
;; S E R V E R
;;======================================================================

(define (nmsg-transport:run dbstruct hostn run-id server-id #!key (retrynum 1000))
  (debug:print 2 "Attempting to start the server ...")
  (debug:print 2 #f "Attempting to start the server ...")
  (let* ((start-port      (portlogger:open-run-close portlogger:find-port))
	 (server-thread   (make-thread (lambda ()
					 (nmsg-transport:try-start-server dbstruct run-id start-port server-id))
				       "server thread"))
	 (tdbdat          (tasks:open-db)))
    (thread-start! server-thread)
    (thread-sleep! 0.1)
82
83
84
85
86
87
88
89

90
91
92
93
94

95
96
97
98
99
100
101
102

103
104

105
106
107
108
109
110
111
82
83
84
85
86
87
88

89
90
91
92
93

94
95
96
97
98
99
100
101

102
103

104
105
106
107
108
109
110
111







-
+




-
+







-
+

-
+







	  (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")
	  (thread-start! (make-thread
			  (lambda ()(nmsg-transport:keep-running server-id run-id))
			  "keep running"))
	  (thread-join! server-thread))
	(if (> retrynum 0)
	    (begin
	      (debug:print 0 "WARNING: Failed to connect to server (self) on host " hostn ":" start-port ", trying again.")
	      (debug:print 0 #f "WARNING: Failed to connect to server (self) on host " hostn ":" start-port ", trying again.")
	      (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature")
	      (portlogger:open-run-close portlogger:set-failed start-port)
	      (nmsg-transport:run dbstruct hostn run-id server-id))
	    (begin
	      (debug:print 0 "ERROR: could not find an open port to start server on. Giving up")
	      (debug:print 0 #f "ERROR: could not find an open port to start server on. Giving up")
	      (exit 1))))))

(define (nmsg-transport:try-start-server dbstruct run-id portnum server-id)
  (let ((repsoc (nn-socket 'rep)))
    (nn-bind repsoc (conc "tcp://*:" portnum))
    (let loop ((msg-in (nn-recv repsoc)))
      (let* ((dat    (db:string->obj msg-in transport: 'nmsg)))
	(debug:print 0 "server, received: " dat)
	(debug:print 0 #f "server, received: " dat)
	(let ((result (api:execute-requests dbstruct dat)))
	  (debug:print 0 "server, sending: " result)
	  (debug:print 0 #f "server, sending: " result)
	  (nn-send repsoc (db:obj->string result  transport: 'nmsg)))
	(loop (nn-recv repsoc))))))

;; all routes though here end in exit ...
;;
(define (nmsg-transport:launch run-id)
  (let* ((tdbdat   (tasks:open-db))
120
121
122
123
124
125
126
127

128
129
130
131
132
133
134
135
136
137
138
139

140
141
142
143

144
145
146
147
148
149
150
120
121
122
123
124
125
126

127
128
129
130
131
132
133
134
135
136
137
138

139
140
141
142

143
144
145
146
147
148
149
150







-
+











-
+



-
+







    ;;       (daemon:ize)
    ;;       (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
    ;;           (begin
    ;;     	(current-error-port *alt-log-file*)
    ;;     	(current-output-port *alt-log-file*)))))
    (if (server:check-if-running run-id)
	(begin
	  (debug:print-info 0 "Server for run-id " run-id " already running")
	  (debug:print-info 0 #f "Server for run-id " run-id " already running")
	  (exit 0)))
    (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id))
	       (remtries  4))
      (if (not server-id)
	  (if (> remtries 0)
	      (begin
		(thread-sleep! 2)
		(if (not (server:check-if-running run-id))
		    (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id)
			  (- remtries 1))
		    (begin
		      (debug:print-info 0 "Another server took the slot, exiting")
		      (debug:print-info 0 #f "Another server took the slot, exiting")
		      (exit 0))))
	      (begin
		;; since we didn't get the server lock we are going to clean up and bail out
		(debug:print-info 2 "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
		(debug:print-info 2 #f "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
		(tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch")
		))
	  ;; locked in a server id, try to start up
	  (nmsg-transport:run dbstruct hostn run-id server-id))
      (set! *didsomething* #t)
      (exit))))

182
183
184
185
186
187
188
189

190
191
192
193
194
195
196
182
183
184
185
186
187
188

189
190
191
192
193
194
195
196







-
+







	 (dat     (vector "ping" our-key))
	 (result  (condition-case 
		   (nmsg-transport:client-api-send-receive-raw req dat timeout: timeout)
		   ((timeout)(set! success #f) #f)))
	 (key     (if success 
		      (vector-ref result 1)
		      #f)))
    (debug:print 0 "success=" success ", key=" key ", expected-key=" expected-key ", equal? " (equal? key expected-key))
    (debug:print 0 #f "success=" success ", key=" key ", expected-key=" expected-key ", equal? " (equal? key expected-key))
    (if (and success
	     (or (not expected-key) ;; just getting a reply is good enough then
		 (equal? key expected-key)))
	(if return-socket
	    req
	    (begin
	      (if (not socket)(nn-close req)) ;; don't want a side effect of closing socket if handed it
216
217
218
219
220
221
222
223

224
225
226
227
228
229
230
216
217
218
219
220
221
222

223
224
225
226
227
228
229
230







-
+







			   (set! success #t)
			   (set! result (db:string->obj res transport: 'nmsg))))
		       "send-recv"))
	 (timeout     (make-thread
		       (lambda ()
			 (let loop ((count 0))
			   (thread-sleep! 1)
			   (debug:print-info 1 "send-receive-raw, still waiting after " count " seconds...")
			   (debug:print-info 1 #f "send-receive-raw, still waiting after " count " seconds...")
			   (if (and keepwaiting (< count timeout)) ;; yes, this is very aproximate
			       (loop (+ count 1))))
			 (if keepwaiting
			     (begin
			       (print "timeout waiting for ping")
			       (thread-terminate! send-recv))))
		       "timeout")))
238
239
240
241
242
243
244
245
246


247
248

249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268

269
270
271
272
273
274
275
238
239
240
241
242
243
244


245
246
247

248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267

268
269
270
271
272
273
274
275







-
-
+
+

-
+



















-
+







     (if success (thread-terminate! timeout)))
    ;; raise timeout error if timed out
    (if success
	(if (and (vector? result)
		 (vector-ref result 0)) ;; did it fail at the server?
	    result                ;; nope, all good
	    (begin
	      (debug:print 0 "ERROR: error occured at server, info=" (vector-ref result 2))
	      (debug:print 0 " client call chain:")
	      (debug:print 0 #f "ERROR: error occured at server, info=" (vector-ref result 2))
	      (debug:print 0 #f " client call chain:")
	      (print-call-chain (current-error-port))
	      (debug:print 0 " server call chain:")
	      (debug:print 0 #f " server call chain:")
	      (pp (vector-ref result 1) (current-error-port))
	      (signal (vector-ref result 0))))
	(signal (make-composite-condition
		 (make-property-condition 'timeout 'message "nmsg-transport:client-api-send-receive-raw timed out talking to server"))))))

;; run nmsg-transport:keep-running in a parallel thread to monitor that the db is being 
;; used and to shutdown after sometime if it is not.
;;
(define (nmsg-transport:keep-running server-id run-id)
  ;; if none running or if > 20 seconds since 
  ;; server last used then start shutdown
  ;; This thread waits for the server to come alive
  (let* ((server-info (let loop ()
                        (let ((sdat #f))
                          (mutex-lock! *heartbeat-mutex*)
                          (set! sdat *server-info*)
                          (mutex-unlock! *heartbeat-mutex*)
                          (if sdat 
			      (begin
				(debug:print-info 0 "keep-running got sdat=" sdat)
				(debug:print-info 0 #f "keep-running got sdat=" sdat)
				sdat)
                              (begin
                                (thread-sleep! 0.5)
                                (loop))))))
         (iface       (car server-info))
         (port        (cadr server-info))
         (last-access 0)
295
296
297
298
299
300
301
302

303
304
305

306
307
308
309

310
311
312
313
314
315
316
295
296
297
298
299
300
301

302
303
304

305
306
307
308

309
310
311
312
313
314
315
316







-
+


-
+



-
+







        (set! last-access *last-db-access*)
        (mutex-unlock! *heartbeat-mutex*)
	(db:sync-touched *inmemdb* run-id force-sync: #t)
        (if (and *server-run*
	       (> (+ last-access server-timeout)
		  (current-seconds)))
            (begin
              (debug:print-info 0 "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
              (debug:print-info 0 #f "Server continuing, seconds since last db access: " (- (current-seconds) last-access))
              (loop 0))
            (begin
              (debug:print-info 0 "Starting to shutdown the server.")
              (debug:print-info 0 #f "Starting to shutdown the server.")
              (set! *time-to-exit* #t)
	      (db:sync-touched *inmemdb* run-id force-sync: #t)
              (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id " http-transport:keep-running")
              (debug:print-info 0 "Server shutdown complete. Exiting")
              (debug:print-info 0 #f "Server shutdown complete. Exiting")
              (exit)
	      ))))))

;;======================================================================
;; C L I E N T S
;;======================================================================

337
338
339
340
341
342
343
344

345
346
347
348
349
350

351
352

353
354
355
356
357
358
337
338
339
340
341
342
343

344
345
346
347
348
349

350
351

352
353
354
355
356
357
358







-
+





-
+

-
+






;;======================================================================

;; DO NOT USE
;;
(define (nmsg-transport:client-signal-handler signum)
  (handle-exceptions
   exn
   (debug:print " ... exiting ...")
   (debug:print 0 #f " ... exiting ...")
   (let ((th1 (make-thread (lambda ()
			     (if (not *received-response*)
				 (receive-message* *runremote*))) ;; flush out last call if applicable
			   "eat response"))
	 (th2 (make-thread (lambda ()
			     (debug:print 0 "ERROR: Received ^C, attempting clean exit. Please be patient and wait a few seconds before hitting ^C again.")
			     (debug:print 0 #f "ERROR: Received ^C, attempting clean exit. Please be patient and wait a few seconds before hitting ^C again.")
			     (thread-sleep! 3) ;; give the flush three seconds to do it's stuff
			     (debug:print 0 "       Done.")
			     (debug:print 0 #f "       Done.")
			     (exit 4))
			   "exit on ^C timer")))
     (thread-start! th2)
     (thread-start! th1)
     (thread-join! th2))))