Overview
Context
Changes
Modified client.scm
from [a2787b3361]
to [01b4fde796].
︙ | | |
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
|
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
|
-
+
+
|
(let ((num-available (tasks:bb-num-in-available-state run-id)))
(debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available)
(if (< num-available 2)
(server:try-running run-id))
(thread-sleep! (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms.
(client:setup run-id remaining-tries: (- remaining-tries 1))))))
((http)(client:setup-http run-id server-dat remaining-tries))
;; ((rpc) (rpc-transport:client-setup run-id)) ;;(client:setup-rpc run-id)) rpc not implemented; want to see a failure here for now.
((rpc) (rpc-transport:client-setup run-id server-dat remtries: remaining-tries))
(else
(debug:print-error 0 *default-log-port* "(6) Transport ["
transport "] specified for run-id [" run-id "] is not implemented in client:setup. Cannot proceed.")
(exit 1)))))
;; client:setup-http
;;
;; For http transport, robustly ensure that a server advertised as running is actually working and responding, and
;; establish a tcp connection to that server. Servers marked running but not responding are killed and cleared from mdb.
;;
(define (client:setup-http run-id server-dat remaining-tries)
|
︙ | | |
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
|
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
|
-
-
+
+
|
(begin
(hash-table-set! *runremote* run-id start-res) ;; side-effect - *runremote* cache init fpr rmt:*
(debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res))
start-res)
(begin ;; login failed but have a server record, clean out the record and try again
(debug:print-info 0 *default-log-port* "client:setup-http, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat)
(http-transport:close-connections run-id)
(hash-table-delete! *runremote* run-id)
(tasks:kill-server-run-id run-id)
(hash-table-delete! *runremote* run-id) ;; BB: suspect there is nothing to delete ...
(tasks:kill-server-run-id run-id) ;; -9 so the hung processes dont eat 100% when not responding to sigterm.
(tasks:bb-server-force-clean-run-record run-id iface port
" client:setup-http (server-dat = #t)")
(if (> remaining-tries 8)
(thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little
(thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time
(server:try-running run-id)
(thread-sleep! 5) ;; give server a little time to start up
|
︙ | | |
Modified rpc-transport.scm
from [4a03110bb8]
to [0b43c472fb].
︙ | | |
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
|
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
|
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
|
#t)
(begin
(BB> "LOGIN_FAILED")
#f))
(BB> "self test res="res)
res));)
(define (rpc-transport:client-setup run-id #!key (remtries 10))
(define (rpc-transport:client-setup run-id server-dat #!key (remtries 10))
(if *runremote*
(begin
(debug:print-error 0 *default-log-port* "Attempt to connect to server but already connected")
#f)
(let* ((host-info (hash-table-ref/default *runremote* run-id #f))) ;; (open-run-close db:get-var #f "SERVER"))
(if host-info
(let ((iface (car host-info))
(port (cadr host-info))
(ping-res ((rpc:procedure 'server:login host port) *toppath*)))
(if ping-res
(let ((server-dat (list iface port #f #f #f)))
(hash-table-set! *runremote* run-id server-dat)
server-dat)
(tcp-buffer-size 0)
(begin
(server:try-running run-id)
(thread-sleep! 2)
(rpc-transport:client-setup run-id (- remtries 1)))))
(let* ((server-db-info (open-run-close tasks:get-server-info tasks:open-db run-id)))
(debug:print-info 0 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries)
(debug:print-info 0 *default-log-port* "rpc-transport:client-setup run-id="run-id" server-dat=" server-dat ", remaining-tries=" remtries)
(if server-db-info
(let* ((iface (tasks:hostinfo-get-interface server-db-info))
(port (tasks:hostinfo-get-port server-db-info))
(server-dat (list iface port #f #f #f))
(ping-res ((rpc:procedure 'server:login host port) *toppath*)))
(if start-res
(begin
(hash-table-set! *runremote* run-id server-dat)
server-dat)
(begin
(server:try-running run-id)
(thread-sleep! 2)
(rpc-transport:client-setup run-id (- remtries 1)))))
(let* ((iface (tasks:hostinfo-get-interface server-dat))
(hostname (tasks:hostinfo-get-hostname server-dat))
(port (tasks:hostinfo-get-port server-dat))
(runremote-server-dat (vector iface port #f #f #f (current-seconds) 'rpc)) ;; http version := (vector iface port api-uri api-url api-req (current-seconds) 'http )
(ping-res (retry-thunk (lambda () ;; make 3 attempts to ping.
((rpc:procedure 'server:login iface port) *toppath*))
retries: 3)))
;; we got here from rmt:get-connection-info on the condition that *runremote* has no entry for run-id...
(if ping-res
(begin
(debug:print-info 0 *default-log-port* "rpc-transport:client-setup CONNECTION ESTABLISHED run-id="run-id" server-dat=" server-dat)
(hash-table-set! *runremote* run-id runremote-server-dat) ;; side-effect - *runremote* cache init fpr rmt:*
runremote-server-dat)
(begin ;; login failed but have a server record, clean out the record and try again
(tasks:kill-server-run-id run-id)
(tasks:bb-server-force-clean-run-record run-id iface port
" rpc-transport:client-setup (server-dat = #t)")
(if (> remtries 2)
(thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little
(thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time
(begin
(server:try-running run-id)
(thread-sleep! 2)
(rpc-transport:client-setup run-id (- remtries 1)))))))))
(server:try-running run-id)
(thread-sleep! 5) ;; give server a little time to start up
(client:setup run-id remaining-tries: (sub1 remtries))
;;
;; (port (if (and hostinfo (> (length hostdat) 1))(cadr hostdat) #f)))
;; (if (and port
;; (string->number port))
;; (let ((portn (string->number port)))
;; (debug:print-info 2 *default-log-port* "Setting up to connect to host " host ":" port)
;; (handle-exceptions
;; exn
;; (begin
;; (debug:print-error 0 *default-log-port* "Failed to open a connection to the server at host: " host " port: " port)
;; (debug:print 0 *default-log-port* " EXCEPTION: " ((condition-property-accessor 'exn 'message) exn))
;; ;; (open-run-close
;; ;; (lambda (db . param)
;; ;; (sqlite3:execute db "DELETE FROM metadat WHERE var='SERVER'"))
;; ;; #f)
;; (set! *runremote* #f))
;; (if (and (not (args:get-arg "-server")) ;; no point in the server using the server using the server
;; ((rpc:procedure 'server:login host portn) *toppath*))
;; (begin
;; (debug:print-info 2 *default-log-port* "Logged in and connected to " host ":" port)
;; (set! *runremote* (vector host portn)))
;; (begin
;; (debug:print-info 2 *default-log-port* "Failed to login or connect to " host ":" port)
;; (set! *runremote* #f)))))
;; (debug:print-info 2 *default-log-port* "no server available")))))
" rpc-transport:client-setup (server-dat = #t)"))))
|
Modified tasks.scm
from [133d4feb50]
to [736f8afe48].
︙ | | |
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
|
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
|
-
+
|
(sdat (tasks:get-server-info (db:delay-if-busy tdbdat) run-id)))
(if sdat
(let ((hostname (vector-ref sdat 6))
(pid (vector-ref sdat 5))
(server-id (vector-ref sdat 0)))
(tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed")
(debug:print-info 0 *default-log-port* "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid)
(tasks:kill-server hostname pid kill-switch: "-9") ;; BB: added -9, let's not be kind here. we need it to die
(tasks:kill-server hostname pid kill-switch: "-9") ;; BB: added -9, let's not be kind here. we need it to die so it isn't a 100% cpu zombie
(tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) )
(debug:print-info 0 *default-log-port* "No server found for run-id " run-id ", nothing to kill"))
;; (sqlite3:finalize! tdb)
))
;;======================================================================
;; M O N I T O R S
|
︙ | | |