Overview
Comment: | Fixed multiple little issues with server log handshaking |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | server-log-handshaking |
Files: | files | file ages | folders |
SHA1: |
cc6a49cf2168887ba47d43ae9ee4e3a0 |
User & Date: | matt on 2017-01-28 18:46:33 |
Other Links: | branch diff | manifest | tags |
Context
2017-01-28
| ||
22:34 | Added timestamp touch on server logs, rock solid now with only one server being sustained. check-in: 2a497f95de user: matt tags: server-log-handshaking | |
21:09 | Automated merge of server-log-handshaking/cc6a49cf21/integ into integ-home check-in: eeae55dd41 user: matt tags: integ-home | |
18:46 | Fixed multiple little issues with server log handshaking check-in: cc6a49cf21 user: matt tags: server-log-handshaking | |
12:25 | Fixed couple missing changes needed for server log handshaking check-in: 5a64939577 user: matt tags: server-log-handshaking | |
Changes
Modified client.scm from [cbdbeb9ff4] to [f280cc01bf].
︙ | ︙ | |||
10 11 12 13 14 15 16 | ;;====================================================================== ;; C L I E N T S ;;====================================================================== (require-extension (srfi 18) extras tcp s11n) | | | 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 | ;;====================================================================== ;; C L I E N T S ;;====================================================================== (require-extension (srfi 18) extras tcp s11n) (use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest matchable) ;; (use zmq) (use (prefix sqlite3 sqlite3:)) (use spiffy uri-common intarweb http-client spiffy-request-vars uri-common intarweb directory-utils) (declare (unit client)) |
︙ | ︙ | |||
46 47 48 49 50 51 52 | (define (client:connect iface port) (case (server:get-transport) ((rpc) (rpc:client-connect iface port)) ((http) (http:client-connect iface port)) ((zmq) (zmq:client-connect iface port)) (else (rpc:client-connect iface port)))) | | | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | > | | | | | > > > > | > > > > | > | < < < | | < < | | < < < < < < | | | | | | | | | | | < < < < < < < < < < | | | | < > | < < | > | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 | (define (client:connect iface port) (case (server:get-transport) ((rpc) (rpc:client-connect iface port)) ((http) (http:client-connect iface port)) ((zmq) (zmq:client-connect iface port)) (else (rpc:client-connect iface port)))) (define (client:setup areapath #!key (remaining-tries 100) (failed-connects 0)) (case (server:get-transport) ((rpc) (rpc-transport:client-setup run-id remaining-tries: remaining-tries failed-connects: failed-connects)) ;;(client:setup-rpc run-id)) ((http)(client:setup-http areapath remaining-tries: remaining-tries failed-connects: failed-connects)) (else (rpc-transport:client-setup run-id remaining-tries: remaining-tries failed-connects: failed-connects)))) ;; (client:setup-rpc run-id)))) ;; Do all the connection work, look up the transport type and set up the ;; connection if required. ;; ;; There are two scenarios. ;; 1. We are a test manager and we received *transport-type* and *runremote* via cmdline ;; 2. We are a run tests, list runs or other interactive process and we must figure out ;; *transport-type* and *runremote* from the monitor.db ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; (define (client:setup-http areapath #!key (remaining-tries 100) (failed-connects 0)) (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) (server:start-and-wait areapath) (if (<= remaining-tries 0) (begin (debug:print-error 0 *default-log-port* "failed to start or connect to server") (exit 1)) ;; ;; Alternatively here, we can get the list of candidate servers and work our way ;; through them searching for a good one. ;; (let* ((server-dat (server:get-first-best areapath))) (if (not server-dat) ;; no server found (client:setup-http areapath remaining-tries: (- remaining-tries 1)) (let ((host (cadr server-dat)) (port (caddr server-dat))) (debug:print-info 4 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) (if (not *runremote*)(set! *runremote* (make-remote))) (if (and host port) (let* ((start-res (case *transport-type* ((http)(http-transport:client-connect host port)))) (ping-res (case *transport-type* ((http)(rmt:login-no-auto-client-setup start-res))))) (if (and start-res ping-res) (begin (remote-conndat-set! *runremote* start-res) ;; (hash-table-set! *runremote* run-id start-res) (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) start-res) (begin ;; login failed but have a server record, clean out the record and try again (debug:print-info 0 *default-log-port* "client:setup, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) (case *transport-type* ((http)(http-transport:close-connections run-id))) (remote-conndat-set! *runremote* #f) ;; (hash-table-delete! *runremote* run-id) (thread-sleep! 1) (client:setup-http areapath remaining-tries: (- remaining-tries 1)) ))) (begin ;; no server registered (server:kind-run areapath) (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries) (thread-sleep! 1) ;; (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. (server:start-and-wait areapath) (client:setup-http areapath remaining-tries: (- remaining-tries 1))))))))) |
Modified dcommon.scm from [17ed94a823] to [8fb41f6f61].
︙ | ︙ | |||
660 661 662 663 664 665 666 | (begin (iup:attribute-set! servers-matrix row-col val) (iup:attribute-set! servers-matrix "FITTOTEXT" (conc "C" colnum)))) (set! colnum (+ 1 colnum)))) vals) (set! rownum (+ rownum 1))) (iup:attribute-set! servers-matrix "REDRAW" "ALL"))) | | | 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 | (begin (iup:attribute-set! servers-matrix row-col val) (iup:attribute-set! servers-matrix "FITTOTEXT" (conc "C" colnum)))) (set! colnum (+ 1 colnum)))) vals) (set! rownum (+ rownum 1))) (iup:attribute-set! servers-matrix "REDRAW" "ALL"))) (sort servers (lambda (a b)(> (car a)(car b)))))))))) (set! colnum 0) (for-each (lambda (colname) (iup:attribute-set! servers-matrix (conc "0:" colnum) colname) (iup:attribute-set! servers-matrix "FITTOTEXT" (conc "C" colnum)) (set! colnum (+ colnum 1))) colnames) ;; (set! dashboard:update-servers-table updater) |
︙ | ︙ |
Modified rmt.scm from [0b9221a215] to [6da9a206d9].
︙ | ︙ | |||
34 35 36 37 38 39 40 | ;; else return #f to let the calling proc know that there is no server available ;; (define (rmt:get-connection-info areapath) ;; TODO: push areapath down. (let ((cinfo (remote-conndat *runremote*)) (run-id 0)) (if cinfo cinfo | | | | 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 | ;; else return #f to let the calling proc know that there is no server available ;; (define (rmt:get-connection-info areapath) ;; TODO: push areapath down. (let ((cinfo (remote-conndat *runremote*)) (run-id 0)) (if cinfo cinfo (if (server:check-if-running areapath) (client:setup areapath) #f)))) (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected |
︙ | ︙ | |||
91 92 93 94 95 96 97 | (rmt:open-qry-close-locally cmd 0 params)) ;; on homehost and this is a write, we already have a server, but server has died ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url *runremote*) ;; have a server (not (server:check-if-running *toppath*))) ;; server has died. | | | 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 | (rmt:open-qry-close-locally cmd 0 params)) ;; on homehost and this is a write, we already have a server, but server has died ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url *runremote*) ;; have a server (not (server:check-if-running *toppath*))) ;; server has died. (set! *runremote* (make-remote)) (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; on homehost and this is a write, we already have a server ((and (cdr (remote-hh-dat *runremote*)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write |
︙ | ︙ | |||
121 122 123 124 125 126 127 | (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") (rmt:open-qry-close-locally cmd 0 params)) ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost (not (remote-conndat *runremote*))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*)) (mutex-unlock! *rmt-mutex*) | | | 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 | (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") (rmt:open-qry-close-locally cmd 0 params)) ((and (not (cdr (remote-hh-dat *runremote*))) ;; not on a homehost (not (remote-conndat *runremote*))) ;; and no connection (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat *runremote*) " conndat: " (remote-conndat *runremote*)) (mutex-unlock! *rmt-mutex*) (server:start-and-wait *toppath*) (remote-conndat-set! *runremote* (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as ;; all set up if get this far, dispatch the query ((cdr (remote-hh-dat *runremote*)) ;; we are on homehost (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 7") (rmt:open-qry-close-locally cmd (if rid rid 0) params)) |
︙ | ︙ | |||
164 165 166 167 168 169 170 | (exit 1))) (begin (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) (remote-conndat-set! *runremote* #f) (remote-server-url-set! *runremote* #f) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") (mutex-unlock! *rmt-mutex*) | | | 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 | (exit 1))) (begin (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) (remote-conndat-set! *runremote* #f) (remote-server-url-set! *runremote* #f) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") (mutex-unlock! *rmt-mutex*) (server:start-and-wait *toppath*) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))) ;; (define (rmt:update-db-stats run-id rawcmd params duration) ;; (mutex-lock! *db-stats-mutex*) ;; (handle-exceptions ;; exn ;; (begin |
︙ | ︙ |
Modified server.scm from [0aa5a0a335] to [f482096d94].
︙ | ︙ | |||
84 85 86 87 88 89 90 | (define (server:reply return-addr query-sig success/fail result) (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result) ;; (send-message pubsock target send-more: #t) ;; (send-message pubsock (case (server:get-transport) ((rpc) (db:obj->string (vector success/fail query-sig result))) ((http) (db:obj->string (vector success/fail query-sig result))) | < < < < | 84 85 86 87 88 89 90 91 92 93 94 95 96 97 | (define (server:reply return-addr query-sig success/fail result) (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result) ;; (send-message pubsock target send-more: #t) ;; (send-message pubsock (case (server:get-transport) ((rpc) (db:obj->string (vector success/fail query-sig result))) ((http) (db:obj->string (vector success/fail query-sig result))) ((fs) result) (else (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) result))) ;; Given a run id start a server process ### NOTE ### > file 2>&1 ;; if the run-id is zero and the target-host is set |
︙ | ︙ | |||
130 131 132 133 134 135 136 | ;; match current ip or hostname (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) (not (equal? curr-ip target-host))) (begin (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) (setenv "TARGETHOST" target-host))) | | | 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 | ;; match current ip or hostname (not (string-match (conc "("curr-host "|" curr-host"\\..*)") target-host)) (not (equal? curr-ip target-host))) (begin (debug:print-info 0 *default-log-port* "Starting server on " target-host ", logfile is " logfile) (setenv "TARGETHOST" target-host))) (setenv "TARGETHOST_LOGF" logfile) (common:wait-for-normalized-load 4 " delaying server start due to load" remote-host: (get-environment-variable "TARGETHOST")) ;; do not try starting servers on an already overloaded machine, just wait forever (system (conc "nbfake " cmdln)) (unsetenv "TARGETHOST_LOGF") (if (get-environment-variable "TARGETHOST")(unsetenv "TARGETHOST")) (thread-join! log-rotate) (pop-directory))) |
︙ | ︙ | |||
218 219 220 221 222 223 224 225 226 227 228 229 230 231 | (< (- now mod-time) 10) ;; still alive - file touched in last 10 seconds (< (- now start-time) 3600) ;; under one hour running time ))) srvlst) (lambda (a b) (< (list-ref a 3) (list-ref b 3)))))) (define (server:record->url servr) (match-let (((mod-time host port start-time pid) servr)) (if (and host port) (conc host ":" port) #f))) | > > > > > > > | 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 | (< (- now mod-time) 10) ;; still alive - file touched in last 10 seconds (< (- now start-time) 3600) ;; under one hour running time ))) srvlst) (lambda (a b) (< (list-ref a 3) (list-ref b 3)))))) (define (server:get-first-best areapath) (let ((srvrs (server:get-best (server:get-list areapath)))) (if (and srvrs (not (null? srvrs))) (car srvrs) #f))) (define (server:record->url servr) (match-let (((mod-time host port start-time pid) servr)) (if (and host port) (conc host ":" port) #f))) |
︙ | ︙ | |||
241 242 243 244 245 246 247 248 249 250 251 252 253 254 | (define (server:kind-run areapath) (let ((last-run-time (hash-table-ref/default *server-kind-run* areapath #f))) (if (or (not last-run-time) (> (- (current-seconds) last-run-time) 30)) (begin (server:run areapath) (hash-table-set! *server-kind-run* areapath (current-seconds)))))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. (define (server:dotserver-age-seconds areapath) (let ((server-file (conc areapath "/.server"))) (begin (handle-exceptions | > > > > > > > > > > > | 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 | (define (server:kind-run areapath) (let ((last-run-time (hash-table-ref/default *server-kind-run* areapath #f))) (if (or (not last-run-time) (> (- (current-seconds) last-run-time) 30)) (begin (server:run areapath) (hash-table-set! *server-kind-run* areapath (current-seconds)))))) (define (server:start-and-wait areapath #!key (timeout 60)) (let ((give-up-time (+ (current-seconds) timeout))) (let loop ((server-url (server:check-if-running areapath))) (if (or server-url (> (current-seconds) give-up-time)) server-url (begin (server:kind-run areapath) (thread-sleep! 5) (loop (server:check-if-running areapath))))))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. (define (server:dotserver-age-seconds areapath) (let ((server-file (conc areapath "/.server"))) (begin (handle-exceptions |
︙ | ︙ | |||
263 264 265 266 267 268 269 | (let* ((servers (server:get-best (server:get-list areapath))) (best-server (if (null? servers) #f (car servers))) (dotserver-url (if best-server (server:record->url best-server) #f))) ;; (server:read-dotserver->url areapath))) ;; tdbdat (tasks:open-db))) (if dotserver-url (let* ((res (case *transport-type* | | | | | | | | | | | | | | | > | | > | 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 | (let* ((servers (server:get-best (server:get-list areapath))) (best-server (if (null? servers) #f (car servers))) (dotserver-url (if best-server (server:record->url best-server) #f))) ;; (server:read-dotserver->url areapath))) ;; tdbdat (tasks:open-db))) (if dotserver-url (let* ((res (case *transport-type* ((http)(server:ping dotserver-url)) ;; ((nmsg)(nmsg-transport:ping (tasks:hostinfo-get-interface server) ))) (if res dotserver-url (begin ;; (server:kill best-server) #f))) #f))) (define (server:kill servr) (match-let (((mod-time hostname port start-time pid) servr)) (tasks:kill-server hostname pid))) ;; called in megatest.scm, host-port is string hostname:port ;; ;; NOTE: This is NOT called directly from clients as not all transports support a client running ;; in the same process as the server. ;; (define (server:ping host-port-in #!key (do-exit #f)) (let ((host:port (if (not host-port-in) ;; use read-dotserver to find #f ;; (server:check-if-running *toppath*) ;; (if (number? host-port-in) ;; we were handed a server-id ;; (let ((srec (tasks:get-server-by-id (db:delay-if-busy (tasks:open-db)) host-port-in))) ;; ;; (print "srec: " srec " host-port-in: " host-port-in) ;; (if srec ;; (conc (vector-ref srec 3) ":" (vector-ref srec 4)) ;; (conc "no such server-id " host-port-in))) host-port-in))) ;; ) (let* ((host-port (if host:port (let ((slst (string-split host:port ":"))) (if (eq? (length slst) 2) (list (car slst)(string->number (cadr slst))) #f)) #f))) ;; (toppath (launch:setup))) ;; (print "host-port=" host-port) (if (not host-port) (begin (if host-port-in (debug:print 0 *default-log-port* "ERROR: bad host:port")) (if do-exit (exit 1)) #f) (let* ((iface (car host-port)) (port (cadr host-port)) (server-dat (http-transport:client-connect iface port)) (login-res (rmt:login-no-auto-client-setup server-dat))) (if (and (list? login-res) (car login-res)) (begin ;; (print "LOGIN_OK") (if do-exit (exit 0)) #t) (begin ;; (print "LOGIN_FAILED") (if do-exit (exit 1)) #f))))))) ;; run ping in separate process, safest way in some cases ;; (define (server:ping-server ifaceport) (with-input-from-pipe (conc (common:get-megatest-exe) " -ping " ifaceport) (lambda () |
︙ | ︙ |