︙ | | |
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
|
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
|
-
-
+
+
|
(begin
(tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "dbprep")
(thread-sleep! 0.5) ;; give some margin for queries to complete before switching from file based access to server based access
(set! *dbstruct-db* (db:setup)) ;; run-id))
(set! server-going #t)
(tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "running")
(server:write-dotserver *toppath* (conc iface ":" port))
(delete-file* (conc *toppath* "/.starting-server")))
(begin ;; gotta exit nicely
(server:dotserver-starting-remove))
(begin ;; gotta exit nicely
(tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "collision")
(http-transport:server-shutdown server-id port))))))
;; when things go wrong we don't want to be doing the various queries too often
;; so we strive to run this stuff only every four seconds or so.
(let* ((sync-time (- (current-milliseconds) start-time))
(rem-time (quotient (- 4000 sync-time) 1000)))
|
︙ | | |
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
|
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
|
-
-
+
-
-
|
(exit)))
;; all routes though here end in exit ...
;;
;; start_server?
;;
(define (http-transport:launch run-id)
(with-output-to-file
(conc *toppath* "/.starting-server")
(server:dotserver-starting)
(lambda ()
(print (current-process-id) " on " (get-host-name))))
(let* ((tdbdat (tasks:open-db)))
(set! *run-id* run-id)
(if (args:get-arg "-daemonize")
(begin
(daemon:ize)
(if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
(begin
|
︙ | | |
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
|
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
|
-
+
+
|
(thread-sleep! 2)
(loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id 'http)
(- remtries 1)))
(begin
;; since we didn't get the server lock we are going to clean up and bail out
(debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
(tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch")
(delete-file* (conc *toppath* "/.starting-server"))
(server:dotserver-starting-remove)
))
(let* ((th2 (make-thread (lambda ()
(debug:print-info 0 *default-log-port* "Server run thread started")
(http-transport:run
(if (args:get-arg "-server")
(args:get-arg "-server")
"-")
|
︙ | | |
︙ | | |
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
|
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
|
+
+
-
+
+
|
;; if not on homehost ensure we have a connection to a live server
;; NOTE: we *have* a homehost record by now
((and (not (cdr (remote-hh-dat *runremote*))) ;; are we on a homehost?
(not (remote-conndat *runremote*))) ;; and no connection
(debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*))
(mutex-unlock! *rmt-mutex*)
(tasks:start-and-wait-for-server (tasks:open-db) 0 15)
(let* ((cinfo (rmt:get-connection-info 0))
(transport (vector-ref cinfo 6))) ;; TODO: replace with tasks:server-dat-accessor-?? for transport
(remote-conndat-set! *runremote* (rmt:get-connection-info 0)) ;; calls client:setup which calls client:setup-http
(remote-conndat-set! *runremote* cinfo) ;; calls client:setup which calls client:setup-http
(remote-transport-set! *runremote* transport))
(rmt:send-receive cmd rid params attemptnum: attemptnum))
;; all set up if get this far, dispatch the query
((cdr (remote-hh-dat *runremote*)) ;; we are on homehost
(mutex-unlock! *rmt-mutex*)
(debug:print-info 12 *default-log-port* "rmt:send-receive, case 7")
(rmt:open-qry-close-locally cmd (if rid rid 0) params))
;; not on homehost, do server query
|
︙ | | |
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
|
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
|
-
+
|
((commfail)(vector #f "communications fail"))
((exn)(vector #f "other fail" (print-call-chain)))))
((rpc) (condition-case ;; handling here has caused a lot of problems. However it is needed to deal with attemtped communication to servers that have gone away
(rpc-transport:client-api-send-receive 0 conninfo cmd params)
((commfail)(vector #f "communications fail"))
((exn)(vector #f "other fail" (print-call-chain)))))
(else
(debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported")
(debug:print 0 *default-log-port* "ERROR: transport " (remote-transport *runremote*) " not supported (1)")
(exit))))
(success (if (vector? dat) (vector-ref dat 0) #f))
(res (if (vector? dat) (vector-ref dat 1) #f)))
(if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time
(debug:print-info 12 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat)
(if success
(case (remote-transport *runremote*)
|
︙ | | |
︙ | | |
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
|
-
+
|
(begin
(apply (eval procsym) params))))
res))
;; rpc receiver
(define (rpc-transport:api-exec cmd params)
(let* ( (resdat (api:execute-requests *inmemdb* (vector cmd params))) ;; #( flag result )
(let* ( (resdat (api:execute-requests *dbstruct-db* (vector cmd params))) ;; #( flag result )
(flag (vector-ref resdat 0))
(res (vector-ref resdat 1)))
(mutex-lock! *heartbeat-mutex*)
(set! *last-db-access* (current-seconds)) ;; bump *last-db-access*; this will renew keep-running thread's lease on life for another (server:get-timeout) seconds
;;(BB> "in api-exec; last-db-access updated to "*last-db-access*)
|
︙ | | |
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
|
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
|
-
-
-
+
+
+
|
;;(tasks:server-set-state! (db:delay-if-busy (tasks:open-db)) server-id "stopped")
;; TODO: (low) the following is extraordinaritly slow. Maybe we don't even need portlogger for rpc anyway?? the exception-based failover when ports are taken is fast!
;;(portlogger:open-run-close portlogger:set-port (rpc:default-server-port) "released")
(set! *time-to-exit* #t)
;;(if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t))
;;(if *dbstruct-db* (db:sync-touched *dbstruct-db* *run-id* force-sync: #t))
(server:remove-dotserver-file *toppath* "anyhost:anyport" force: #t)
(tasks:server-delete-record (db:delay-if-busy (tasks:open-db)) server-id " rpc-transport:keep-running complete")
;;(BB> "Before (exit) (from-on-exit="from-on-exit")")
;;(unless from-on-exit (exit)) ;; sometimes we hang (around) here with 100% cpu.
;;(BB> "After")
;; strace reveals endless:
|
︙ | | |
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
|
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
|
+
-
+
-
+
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
+
+
+
-
-
+
+
+
+
|
;; (when (args:get-arg "-daemonize")
;; (daemon:ize)
;; (when *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it
;; (current-error-port *alt-log-file*)
;; (current-output-port *alt-log-file*)))
;; double check we dont alrady have a running server for this run-id
(when (and (server:read-dotserver *toppath*)
(when (server:check-if-running run-id)
(server:check-if-running run-id))
(debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running")
(exit 0))
;; did not find server running, let's clean up the table of dead servers
;; clean up dead servers (duped in megatest.scm in -list-servers processing; may want to consolidate into proc)
(for-each
(tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy (tasks:open-db)) run-id "notresponding")
(lambda (server)
(let* ((id (vector-ref server 0))
(pid (vector-ref server 1))
(hostname (vector-ref server 2))
(interface (vector-ref server 3))
(pullport (vector-ref server 4))
(pubport (vector-ref server 5))
(start-time (vector-ref server 6))
(priority (vector-ref server 7))
(state (vector-ref server 8))
(mt-ver (vector-ref server 9))
(last-update (vector-ref server 10))
(transport (vector-ref server 11))
(killed #f)
(status (< last-update 20)))
(if (equal? state "dead")
(if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day.
(tasks:server-deregister (db:delay-if-busy (tasks:open-db)) hostname pullport: pullport pid: pid action: 'delete))
(if (> last-update 20) ;; Mark as dead if not updated in last 20 seconds
(tasks:server-deregister (db:delay-if-busy (tasks:open-db)) hostname pullport: pullport pid: pid)))
;;(format #t fmtstr id mt-ver pid hostname (conc interface ":" pullport) pubport last-update
;; (if status "alive" "dead") transport)
;; (if (or (equal? id sid)
;; (equal? sid 0)) ;; kill all/any
;; (begin
;; (debug:print-info 0 *default-log-port* "Attempting to kill "kill-switch" server with pid " pid)
;; (tasks:kill-server hostname pid kill-switch: kill-switch)))
(server:dotserver-starting)
)
)
(tasks:get-all-servers (db:delay-if-busy (tasks:open-db))))
;; let's get a server-id for this server
;; if at first we do not suceed, try 3 more times.
(let ((server-id (retry-thunk
(lambda () (tasks:server-lock-slot (db:delay-if-busy (tasks:open-db)) run-id 'rpc))
chatty: #f
final-failure-returns-actual: #t
retries: 4)))
(when (not server-id) ;; dang we couldn't get a server-id.
;; since we didn't get the server lock we are going to clean up and bail out
(debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue")
(tasks:server-delete-records-for-this-pid (db:delay-if-busy (tasks:open-db)) " rpc-transport:launch")
(server:dotserver-starting-remove)
(exit 1))
;; we got a server-id (and a corresponding entry in servers table in globally shared mdb)
;; all systems go. Proceed to setup rpc server.
(rpc-transport:run
(if (args:get-arg "-server")
(args:get-arg "-server")
|
︙ | | |
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
|
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
|
+
-
+
+
+
+
+
+
+
+
+
+
-
+
-
+
-
-
-
+
+
-
-
-
+
+
+
-
+
+
-
-
+
+
+
+
+
+
+
-
-
+
+
+
-
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
|
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; rpc:listener is the tcp-listen result from inside the find-free-port-and-open complex.
;; It is our handle on the listening tcp port
;; We will attach this to our rpc server with rpc:make-server in thread th1 .
(rpc:listener (rpc-transport:find-free-port-and-open start-port))
(th1 (make-thread
(lambda ()
;;(BB> "BEFORE rpc:make-server")
((rpc:make-server rpc:listener) #t) )
((rpc:make-server rpc:listener) #t)
;;(BB> "BEFORE rpc:make-server")
)
"rpc:server"))
(hostname (if (string=? "-" hostn)
(get-host-name)
hostn))
(ipaddrstr (if (string=? "-" hostn)
(server:get-best-guess-address hostname) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".")
(string-intersperse
(map number->string
(u8vector->list
(hostname->ip hostn))) ".")
))
(portnum (let ((res (rpc:default-server-port))) res))
(host:port (conc (if ipaddrstr ipaddrstr hostname) ":" portnum)))
(when (not (equal? ipaddrstr (server:get-best-guess-address (get-host-name))))
(debug:print 0 *default-log-port* "Error: This host "(ip->string (hostname->ip (get-host-name)))" ("(get-host-name)") is not the homehost "ipaddrstr" ("(ip->hostname (string->ip ipaddrstr))"; Cannot proceed.")
(server:dotserver-starting-remove)
(tcp-close rpc:listener) ;; gotta exit nicely and free up that tcp port
(exit))
(tasks:server-set-interface-port (db:delay-if-busy (tasks:open-db)) server-id ipaddrstr portnum)
;;============================================================
;; activate thread th1 to attach opened tcp port to rpc server
;;=============================================================
(thread-start! th1)
(set! db *inmemdb*)
(set! db *dbstruct-db*)
(debug:print 0 *default-log-port* "Server started on " host:port)
;;(BB> "before SELF-TEST")
;; (thread-sleep! 5)
(if (retry-thunk (lambda ()
(rpc-transport:self-test run-id ipaddrstr portnum))
final-failure-returns-actual: #t
final-failure-returns-actual: #t ;; TODO: remove this line
)
(debug:print 0 *default-log-port* "INFO: rpc self test passed!")
(begin
(debug:print 0 *default-log-port* "Error: rpc listener did not pass self test. Shutting down. On: " host:port)
(tasks:server-set-state! (db:delay-if-busy (tasks:open-db)) server-id "dead")
(tcp-close rpc:listener) ;; gotta exit nicely and free up that tcp port
(rpc-transport:server-shutdown server-id rpc:listener)
(server:dotserver-starting-remove)
(exit)))
(mutex-lock! *heartbeat-mutex*)
(set! *last-db-access* (current-seconds))
(mutex-unlock! *heartbeat-mutex*)
;;(on-exit (lambda ()
;; (rpc-transport:server-shutdown server-id rpc:listener from-on-exit: #t)))
;; check again for running servers for this run-id in case one has snuck in since we checked last in rpc-transport:launch
(if (not (equal? server-id (tasks:server-am-i-the-server? (db:delay-if-busy (tasks:open-db)) run-id)));; try to ensure no double registering of servers
(begin ;; i am not the server, another server snuck in and beat this one to the punch
(tcp-close rpc:listener) ;; gotta exit nicely and free up that tcp port
(tasks:server-set-state! (db:delay-if-busy (tasks:open-db)) server-id "collision"))
(tasks:server-set-state! (db:delay-if-busy (tasks:open-db)) server-id "collision")
(server:dotserver-starting-remove))
(begin ;; i am the server
;; setup the in-memory db
(set! *inmemdb* (db:setup run-id))
(db:get-db *inmemdb* run-id)
(set! *dbstruct-db* (db:setup run-id))
(db:get-db *dbstruct-db* run-id)
;; at this point, satisfied server has started
;; let's make it official
(server:write-dotserver *toppath* (conc ipaddrstr ":" portnum))
(mutex-lock! *heartbeat-mutex*)
(set! *last-db-access* (current-seconds))
(mutex-unlock! *heartbeat-mutex*)
(set! *rpc:listener* rpc:listener)
(tasks:server-set-state! (db:delay-if-busy (tasks:open-db)) server-id "running") ;; update our mdb servers entry
;; this let loop will hold open this thread until we want the server to shut down.
;; if no requests received within the last 20 seconds :
;; database hasnt changed in ??
;;
;; begin new loop
;; keep-running loop: polls last-db-access to see if we have timed out.
;; keep-running loop: polls last-db-access to see if we have timed out. keep running if not.
(let loop ((count 0)
(bad-sync-count 0))
(BB> "keep running: count = "count)
;; Use this opportunity to sync the inmemdb to db
(let ((start-time (current-milliseconds))
(sync-time #f)
(rem-time #f))
;; following is now done in common:watchdog, commenting out. sync-time will now be 0; can live with that.
;; inmemdb is a dbstruct
(condition-case
(db:sync-touched *inmemdb* *run-id* force-sync: #t)
((sync-failed)(cond
((> bad-sync-count 10) ;; time to give up
(rpc-transport:server-shutdown server-id rpc:listener))
(else ;; (> bad-sync-count 0) ;; we've had a fail or two, delay and loop
(thread-sleep! 5)
(loop count (+ bad-sync-count 1)))))
((exn)
(debug:print-error 0 *default-log-port* "error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server")
(rpc-transport:server-shutdown server-id rpc:listener)))
;; ;; inmemddb is a dbstruct
;; (condition-case
;; (db:sync-touched *dbstruct-db* *run-id* force-sync: #t)
;; ((sync-failed)(cond
;; ((> bad-sync-count 10) ;; time to give up
;; (rpc-transport:server-shutdown server-id rpc:listener))
;; (else ;; (> bad-sync-count 0) ;; we've had a fail or two, delay and loop
;; (thread-sleep! 5)
;; (loop count (+ bad-sync-count 1)))))
;; ((exn)
;; (debug:print-error 0 *default-log-port* "error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server ")
;; (rpc-transport:server-shutdown server-id rpc:listener)))
(set! sync-time (- (current-milliseconds) start-time))
(set! rem-time (quotient (- 4000 sync-time) 1000))
(debug:print 4 *default-log-port* "SYNC: time= " sync-time ", rem-time=" rem-time)
(if (and (<= rem-time 4)
(> rem-time 0))
(thread-sleep! rem-time)
|
︙ | | |
554
555
556
557
558
559
560
561
562
563
564
565
566
567
|
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
|
+
|
;;
(if (tasks:server-am-i-the-server? (db:delay-if-busy (tasks:open-db)) run-id)
(tasks:server-set-state! (db:delay-if-busy (tasks:open-db)) server-id "running"))
;;
(loop 0 bad-sync-count))
(begin
;;(BB> "SERVER SHUTDOWN CALLED! last-access="last-access" current-seconds="(current-seconds)" server-timeout="server-timeout)
(rpc-transport:server-shutdown server-id rpc:listener)))))
;; end new loop
))))
(define (rpc-transport:find-free-port-and-open port #!key )
(handle-exceptions
|
︙ | | |
︙ | | |
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
|
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
|
-
+
|
;;
(define (server:launch run-id transport-type-raw)
(let ((transport-type
(cond
((string? transport-type-raw) (string->symbol transport-type-raw))
(else transport-type-raw))))
(BB> "server:launch fired for run-id="run-id" transport-type="transport-type)
;;(BB> "server:launch fired for run-id="run-id" transport-type="transport-type)
(case transport-type
((http)(http-transport:launch run-id))
;;((nmsg)(nmsg-transport:launch run-id))
((rpc) (rpc-transport:launch run-id))
(else (debug:print-error 0 *default-log-port* "unknown server type " transport-type)))))
|
︙ | | |
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
|
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
|
+
+
+
+
+
+
+
+
+
+
+
+
-
+
-
+
|
(file-read-access? dotfile))
(with-input-from-file
dotfile
(lambda ()
(read-line)))
#f))))
(define (server:dotserver-starting)
(with-output-to-file
(conc *toppath* "/.starting-server")
(lambda ()
(print (current-process-id) " on " (get-host-name)))))
(define (server:dotserver-starting-remove)
(delete-file* (conc *toppath* "/.starting-server")))
;; write a .server file in *toppath* with hostport
;; return #t on success, #f otherwise
;;
(define (server:write-dotserver areapath hostport)
(let ((lock-file (conc areapath "/.server.lock"))
(server-file (conc areapath "/.server")))
(if (common:simple-file-lock lock-file)
(let ((res (handle-exceptions
exn
#f ;; failed for some reason, for the moment simply return #f
(with-output-to-file server-file
(lambda ()
(print hostport)))
#t)))
(debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " created")
(common:simple-file-release-lock lock-file)
res)
#f)))
(define (server:remove-dotserver-file areapath hostport)
(define (server:remove-dotserver-file areapath hostport #!key (force #f))
(let ((dotserver (server:read-dotserver areapath))
(server-file (conc areapath "/.server"))
(lock-file (conc areapath "/.server.lock")))
(if (and dotserver (string-match (conc ".*:" hostport "$") dotserver)) ;; port matches, good enough info to decide to remove the file
(if (or force (and dotserver (string-match (conc ".*:" hostport "$") dotserver))) ;; port matches, good enough info to decide to remove the file
(if (common:simple-file-lock lock-file)
(begin
(handle-exceptions
exn
#f
(delete-file* server-file))
(debug:print-info 0 *default-log-port* "server file " server-file " for " hostport " removed")
|
︙ | | |