Changes In Branch rpc-transport Excluding Merge-Ins
This is equivalent to a diff from 3e767a9aad to 4e3d7aed7d
2016-11-30
| ||
17:01 | Filter working check-in: d9859999af user: ritikaag tags: db-new | |
2016-11-18
| ||
20:46 | Try tmp db without rpc check-in: d06a3ab427 user: matt tags: v1.62-no-rpc | |
2016-11-17
| ||
16:27 | Beginnings of fix for testconfig disks issue Closed-Leaf check-in: 7e67a7638f user: mrwellan tags: testconfig-disks-fix | |
2016-11-16
| ||
16:57 | moved rpc-transport updates into mainline v1.62 branch check-in: f736d3db6e user: bjbarcla tags: v1.62 | |
16:19 | caught up to v1.62 Closed-Leaf check-in: 4e3d7aed7d user: bjbarcla tags: rpc-transport | |
16:08 | Merged v1.62 into rpc-transport Closed-Leaf check-in: 534875ccf1 user: mrwellan tags: rpc-transport-merge-v1.62 | |
16:06 | did some cleanup check-in: acd56658eb user: bjbarcla tags: rpc-transport | |
13:48 | Try using md5sum instead of sha1. Much faster but what is the collison risk? check-in: 3e767a9aad user: mrwellan tags: v1.62, v1.6208 | |
10:12 | Fixed remotediff example. Broken by unknown goof up. check-in: 9833288949 user: mrwellan tags: v1.62 | |
Modified client.scm from [b597605018] to [4d295b692f].
︙ | ︙ | |||
39 40 41 42 43 44 45 | ;; Not currently used! But, I think it *should* be used!!! (define (client:logout serverdat) (let ((ok (and (socket? serverdat) (cdb:logout serverdat *toppath* (client:get-signature))))) ok)) | > > | | | | | | | > > > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | | | | 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 | ;; Not currently used! But, I think it *should* be used!!! (define (client:logout serverdat) (let ((ok (and (socket? serverdat) (cdb:logout serverdat *toppath* (client:get-signature))))) ok)) ;; BB: commenting out orphan code ;;;;; ;; (define (client:connect iface port) ;; (case (server:get-transport) ;; ((rpc) (rpc:client-connect iface port)) ;; ((http) (http:client-connect iface port)) ;; ((zmq) (zmq:client-connect iface port)) ;; (else (rpc:client-connect iface port)))) (define (client:setup run-id #!key (remaining-tries 10)) ;;(BB> "Entered client:setup with run-id="run-id" and remaining-tries="remaining-tries) (debug:print-info 2 *default-log-port* "client:setup remaining-tries=" remaining-tries) (let* ((server-dat (tasks:bb-get-server-info run-id)) (transport (if (and server-dat (vector? server-dat)) (string->symbol (tasks:hostinfo-get-transport server-dat)) 'noserver))) (case transport ((noserver) ;; no server registered (BB> "noserver") (if (<= remaining-tries 0) (begin (debug:print-error 0 *default-log-port* "failed to start or connect to server for run-id " run-id) (exit 1)) (begin (let ((num-available (tasks:bb-num-in-available-state run-id))) (debug:print-info 0 *default-log-port* "client:setup, no server registered, remaining-tries=" remaining-tries " num-available=" num-available) (if (< num-available 2) (server:try-running run-id)) (thread-sleep! (+ 5 (random (- 20 remaining-tries)))) ;; give server a little time to start up, randomize a little to avoid start storms. (client:setup run-id remaining-tries: (- remaining-tries 1)))))) ((http) (client:setup-http run-id server-dat remaining-tries)) ((rpc) (rpc-transport:client-setup run-id server-dat remtries: remaining-tries)) (else (debug:print-error 0 *default-log-port* "(6) Transport [" transport "] specified for run-id [" run-id "] is not implemented in client:setup. Cannot proceed.") (exit 1))))) ;; client:setup-http ;; ;; For http transport, robustly ensure an advertised-running server is actually working and responding, and ;; establish tcp connection to server. For servers marked running but not responding, kill them and clear from mdb ;; (define (client:setup-http run-id server-dat remaining-tries) (let* ((iface (tasks:hostinfo-get-interface server-dat)) (hostname (tasks:hostinfo-get-hostname server-dat)) (port (tasks:hostinfo-get-port server-dat)) (start-res (http-transport:client-connect iface port)) (ping-res (rmt:login-no-auto-client-setup start-res run-id))) (if (and start-res ping-res) (begin (rmt:set-cinfo run-id start-res) ;; (hash-table-set! *runremote* run-id start-res) ;; side-effect - *runremote* cache init fpr rmt:* (debug:print-info 2 *default-log-port* "connected to " (http-transport:server-dat-make-url start-res)) start-res) (begin ;; login failed but have a server record, clean out the record and try again (debug:print-info 0 *default-log-port* "client:setup-http, login failed, will attempt to start server ... start-res=" start-res ", run-id=" run-id ", server-dat=" server-dat) (http-transport:close-connections run-id) (rmt:del-cinfo run-id) ;; (hash-table-delete! *runremote* run-id) ;; BB: suspect there is nothing to delete ... (tasks:kill-server-run-id run-id) ;; -9 so the hung processes dont eat 100% when not responding to sigterm. (tasks:bb-server-force-clean-run-record run-id iface port " client:setup-http (server-dat = #t)") (if (> remaining-tries 8) (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time (server:try-running run-id) (thread-sleep! 5) ;; give server a little time to start up (client:setup run-id remaining-tries: (- remaining-tries 1)) )))) ;; ((rpc) (rpc-transport:client-setup run-id remaining-tries: remaining-tries failed-connects: failed-connects)) ;;(client:setup-rpc run-id)) ;; ((http)(client:setup-http run-id remaining-tries: remaining-tries failed-connects: failed-connects)) ;; (else (rpc-transport:client-setup run-id remaining-tries: remaining-tries failed-connects: failed-connects)))) ;; (client:setup-rpc run-id)))) ;; ;; (define (client:login-no-auto-setup server-info run-id) ;; (case (server:get-transport) ;; ((rpc) (rpc:login-no-auto-client-setup server-info run-id)) ;; ((http) (rmt:login-no-auto-client-setup server-info run-id)) ;; (else (rpc:login-no-auto-client-setup server-info run-id)))) ;; ;; (define (client:setup-rpc run-id) |
︙ | ︙ | |||
93 94 95 96 97 98 99 | ;; (thread-sleep! 5) ;; (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1))) ;; (begin ;; (debug:print 25 *default-log-port* "INFO: client:setup failed to connect, start-res=" start-res ", run-id=" run-id ", host-info=" host-info) ;; (thread-sleep! 5) ;; (client:setup run-id remaining-tries: (- remaining-tries 1)))))) ;; ;; YUK: rename server-dat here | | | 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 | ;; (thread-sleep! 5) ;; (client:setup run-id remaining-tries: 10)) ;; (- remaining-tries 1))) ;; (begin ;; (debug:print 25 *default-log-port* "INFO: client:setup failed to connect, start-res=" start-res ", run-id=" run-id ", host-info=" host-info) ;; (thread-sleep! 5) ;; (client:setup run-id remaining-tries: (- remaining-tries 1)))))) ;; ;; YUK: rename server-dat here ;; (let* ((server-dat (open-run-close tasks:get-server-info tasks:open-db run-id))) ;; (debug:print-info 0 *default-log-port* "client:setup server-dat=" server-dat ", remaining-tries=" remaining-tries) ;; (if server-dat ;; (let* ((iface (tasks:hostinfo-get-interface server-dat)) ;; (port (tasks:hostinfo-get-port server-dat)) ;; (start-res (http-transport:client-connect iface port)) ;; ;; (ping-res (server:ping-server run-id iface port)) ;; (ping-res (rmt:login-no-auto-client-setup start-res run-id))) |
︙ | ︙ | |||
150 151 152 153 154 155 156 | ;; 2. We are a run tests, list runs or other interactive process and we must figure out ;; *transport-type* and *runremote* from the monitor.db ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; | < < < < < < < < < < < < < < < < < < < < < < < < < | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | | | | | | 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 | ;; 2. We are a run tests, list runs or other interactive process and we must figure out ;; *transport-type* and *runremote* from the monitor.db ;; ;; client:setup ;; ;; lookup_server, need to remove *runremote* stuff ;; ;; BB: commenting out orphan code. ;; ;; ;; keep this as a function to ease future ;; (define (client:start run-id server-info) ;; (http-transport:client-connect (tasks:hostinfo-get-interface server-info) ;; (tasks:hostinfo-get-port server-info))) ;; ;; client:signal-handler ;; (define (client:signal-handler signum) ;; (signal-mask! signum) ;; (set! *time-to-exit* #t) ;; (handle-exceptions ;; exn |
︙ | ︙ |
Modified common.scm from [41eb86f112] to [2c23c31658].
︙ | ︙ | |||
97 98 99 100 101 102 103 | (define *inmemdb* #f) (define *task-db* #f) ;; (vector db path-to-db) (define *db-access-allowed* #t) ;; flag to allow access (define *db-access-mutex* (make-mutex)) ;; SERVER (define *my-client-signature* #f) | > > > | | | 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 | (define *inmemdb* #f) (define *task-db* #f) ;; (vector db path-to-db) (define *db-access-allowed* #t) ;; flag to allow access (define *db-access-mutex* (make-mutex)) ;; SERVER (define *my-client-signature* #f) ;; default preference for transport-type is set here ;; (define *transport-type* 'rpc) ;; override with [server] transport http|rpc (define *runremote* (make-hash-table)) ;; if set up for server communication this will hold <host port> (define *max-cache-size* 0) (define *logged-in-clients* (make-hash-table)) (define *client-non-blocking-mode* #f) (define *server-id* #f) (define *server-info* #f) (define *time-to-exit* #f) |
︙ | ︙ |
Modified ezsteps.scm from [e22bde8ecd] to [01f2e721c5].
︙ | ︙ | |||
162 163 164 165 166 167 168 | (debug:print-info 2 *default-log-port* "Test NOT logged as COMPLETED, (state=" (db:test-get-state testinfo) "), updating result, rollup-status is " rollup-status) (tests:test-set-status! test-id new-state new-status (args:get-arg "-m") #f) ;; need to update the top test record if PASS or FAIL and this is a subtest (if (not (equal? item-path "")) | | | 162 163 164 165 166 167 168 169 170 171 172 173 174 175 | (debug:print-info 2 *default-log-port* "Test NOT logged as COMPLETED, (state=" (db:test-get-state testinfo) "), updating result, rollup-status is " rollup-status) (tests:test-set-status! test-id new-state new-status (args:get-arg "-m") #f) ;; need to update the top test record if PASS or FAIL and this is a subtest (if (not (equal? item-path "")) (cdb:roll-up-pass-fail-counts *runremote* run-id test-name item-path new-status)))) ;; BB> I think this is dead code. cdb:roll-up-pass-fail-counts does not exist anywhere. ;; for automated creation of the rollup html file this is a good place... (if (not (equal? item-path "")) (tests:summarize-items #f run-id test-id test-name #f)) ;; don't force - just update if no ))) (pop-directory) rollup-status)) |
Modified fs-transport.scm from [59920959a9] to [de64f5b4d6].
︙ | ︙ | |||
11 12 13 14 15 16 17 | (require-extension (srfi 18) extras tcp s11n) (use sqlite3 srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) (import (prefix sqlite3 sqlite3:)) (use spiffy uri-common intarweb http-client spiffy-request-vars) | | | 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 | (require-extension (srfi 18) extras tcp s11n) (use sqlite3 srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) (import (prefix sqlite3 sqlite3:)) (use spiffy uri-common intarweb http-client spiffy-request-vars) ;;(tcp-buffer-size 2048) (declare (unit fs-transport)) (declare (uses common)) (declare (uses db)) (declare (uses tests)) (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. |
︙ | ︙ |
Modified http-transport.scm from [13883e3b0d] to [eb8d6f211b].
︙ | ︙ | |||
12 13 14 15 16 17 18 | (use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) ;; sqlite3 ;; (import (prefix sqlite3 sqlite3:)) (use spiffy uri-common intarweb http-client spiffy-request-vars intarweb spiffy-directory-listing) ;; Configurations for server | > | > > > | 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 | (use srfi-1 posix regex regex-case srfi-69 hostinfo md5 message-digest) ;; sqlite3 ;; (import (prefix sqlite3 sqlite3:)) (use spiffy uri-common intarweb http-client spiffy-request-vars intarweb spiffy-directory-listing) ;; Configurations for server (tcp-buffer-size 2048) ;; this interferes with rpc ; compensating in rpc-transport... so far so good (max-connections 2048) (declare (unit http-transport)) (declare (uses common)) (declare (uses db)) (declare (uses tests)) |
︙ | ︙ | |||
259 260 261 262 263 264 265 | (db:string->obj (handle-exceptions exn (begin (set! success #f) (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) | | | 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 | (db:string->obj (handle-exceptions exn (begin (set! success #f) (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (rmt:del-cinfo run-id) ;;(hash-table-delete! *runremote* run-id) ;; Killing associated server to allow clean retry.") ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? (mutex-unlock! *http-mutex*) ;;; (signal (make-composite-condition ;;; (make-property-condition 'commfail 'message "failed to connect to server"))) ;;; "communications failed" (db:obj->string #f)) |
︙ | ︙ | |||
306 307 308 309 310 311 312 | (make-property-condition 'timeout 'message "nmsg-transport:client-api-send-receive-raw timed out talking to server"))))))) ;; careful closing of connections stored in *runremote* ;; (define (http-transport:close-connections run-id) | | | > | | 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 | (make-property-condition 'timeout 'message "nmsg-transport:client-api-send-receive-raw timed out talking to server"))))))) ;; careful closing of connections stored in *runremote* ;; (define (http-transport:close-connections run-id) (let* ((server-dat (rmt:get-connection-info run-id))) ;;(hash-table-ref/default *runremote* run-id #f))) (if (vector? server-dat) (let ((api-dat (http-transport:server-dat-get-api-uri server-dat))) (close-connection! api-dat) #t) #f))) (define (make-http-transport:server-dat)(make-vector 6)) (define (http-transport:server-dat-get-iface vec) (vector-ref vec 0)) (define (http-transport:server-dat-get-port vec) (vector-ref vec 1)) (define (http-transport:server-dat-get-api-uri vec) (vector-ref vec 2)) (define (http-transport:server-dat-get-api-url vec) (vector-ref vec 3)) (define (http-transport:server-dat-get-api-req vec) (vector-ref vec 4)) (define (http-transport:server-dat-get-last-access vec) (vector-ref vec 5)) (define (http-transport:server-dat-get-transport vec) (vector-ref vec 6)) (define (http-transport:server-dat-make-url vec) (if (and (http-transport:server-dat-get-iface vec) (http-transport:server-dat-get-port vec)) (conc "http://" (http-transport:server-dat-get-iface vec) ":" (http-transport:server-dat-get-port vec)) #f)) (define (http-transport:server-dat-update-last-access vec) ;;(BB> "entered http-transport:server-dat-update-last-access vec="vec) (if (vector? vec) (vector-set! vec 5 (current-seconds)) (begin (print-call-chain (current-error-port)) (debug:print-error 0 *default-log-port* "call to http-transport:server-dat-update-last-access with non-vector!!")))) ;; ;; connect ;; (define (http-transport:client-connect iface port) (let* ((api-url (conc "http://" iface ":" port "/api")) (api-uri (uri-reference (conc "http://" iface ":" port "/api"))) (api-req (make-request method: 'POST uri: api-uri)) (server-dat (vector iface port api-uri api-url api-req (current-seconds) 'http))) server-dat)) ;; run http-transport:keep-running in a parallel thread to monitor that the db is being ;; used and to shutdown after sometime if it is not. ;; (define (http-transport:keep-running server-id run-id) ;; if none running or if > 20 seconds since |
︙ | ︙ | |||
383 384 385 386 387 388 389 | (begin (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server " server-id " for run " run-id) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature") (exit)) (loop start-time (equal? sdat last-sdat) sdat))))))) | | | 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 | (begin (debug:print-error 0 *default-log-port* "transport appears to have died, exiting server " server-id " for run " run-id) (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id "failed to start, never received server alive signature") (exit)) (loop start-time (equal? sdat last-sdat) sdat))))))) (iface (car server-info)) ;; BB> this represents ip address, not interface (like eth0 as I would expect from the term) (port (cadr server-info)) (last-access 0) (server-timeout (server:get-timeout))) (let loop ((count 0) (server-state 'available) (bad-sync-count 0)) |
︙ | ︙ | |||
533 534 535 536 537 538 539 | (begin (current-error-port *alt-log-file*) (current-output-port *alt-log-file*))))) (if (server:check-if-running run-id) (begin (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") (exit 0))) | | | | 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 | (begin (current-error-port *alt-log-file*) (current-output-port *alt-log-file*))))) (if (server:check-if-running run-id) (begin (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") (exit 0))) (let loop ((server-id (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id 'http)) (remtries 4)) (if (not server-id) (if (> remtries 0) (begin (thread-sleep! 2) (loop (tasks:server-lock-slot (db:delay-if-busy tdbdat) run-id 'http) (- remtries 1))) (begin ;; since we didn't get the server lock we are going to clean up and bail out (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") (tasks:server-delete-records-for-this-pid (db:delay-if-busy tdbdat) " http-transport:launch") )) (let* ((th2 (make-thread (lambda () |
︙ | ︙ |
Modified launch.scm from [53f264e03f] to [2803cc22f9].
︙ | ︙ | |||
66 67 68 69 70 71 72 | (if (file-exists? cname) (let* ((dat (read-config cname #f #f)) (csvr (db:logpro-dat->csv dat stepname)) (csvt (let-values (( (fmt-cell fmt-record fmt-csv) (make-format ","))) (fmt-csv (map list->csv-record csvr)))) (status (configf:lookup dat "final" "exit-status")) (msg (configf:lookup dat "final" "message"))) | < | < < | 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 | (if (file-exists? cname) (let* ((dat (read-config cname #f #f)) (csvr (db:logpro-dat->csv dat stepname)) (csvt (let-values (( (fmt-cell fmt-record fmt-csv) (make-format ","))) (fmt-csv (map list->csv-record csvr)))) (status (configf:lookup dat "final" "exit-status")) (msg (configf:lookup dat "final" "message"))) (rmt:csv->test-data run-id test-id csvt) (cond ((equal? status "PASS") "PASS") ;; skip the message part if status is pass (status (conc (configf:lookup dat "final" "exit-status") ": " (if msg msg "no message"))) (else #f))) #f))) (define (launch:runstep ezstep run-id test-id exit-info m tal testconfig) |
︙ | ︙ | |||
698 699 700 701 702 703 704 | ;; megatest.config (cache if all vars avail) ;; returns: ;; *toppath* ;; side effects: ;; sets; *configdat* (megatest.config info) ;; *runconfigdat* (runconfigs.config info) ;; *configstatus* (status of the read data) | | | 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 | ;; megatest.config (cache if all vars avail) ;; returns: ;; *toppath* ;; side effects: ;; sets; *configdat* (megatest.config info) ;; *runconfigdat* (runconfigs.config info) ;; *configstatus* (status of the read data) ;; *transport-type* (define (launch:setup #!key (force #f)) (let* ((toppath (or *toppath* (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath (runname (common:args-get-runname)) (target (common:args-get-target)) (linktree (common:get-linktree)) (sections (if target (list "default" target) #f)) ;; for runconfigs (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config |
︙ | ︙ | |||
832 833 834 835 836 837 838 839 840 841 842 843 844 845 | (if (and *toppath* (directory-exists? *toppath*)) (begin (setenv "MT_RUN_AREA_HOME" *toppath*) (setenv "MT_TESTSUITE_NAME" (common:get-testsuite-name))) (begin (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area."))) *toppath*)) (define (get-best-disk confdat testconfig) (let* ((disks (or (and testconfig (hash-table-ref/default testconfig "disks" #f)) (hash-table-ref/default confdat "disks" #f))) (minspace (let ((m (configf:lookup confdat "setup" "minspace"))) (string->number (or m "10000"))))) | > | 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 | (if (and *toppath* (directory-exists? *toppath*)) (begin (setenv "MT_RUN_AREA_HOME" *toppath*) (setenv "MT_TESTSUITE_NAME" (common:get-testsuite-name))) (begin (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area."))) (server:set-transport) *toppath*)) (define (get-best-disk confdat testconfig) (let* ((disks (or (and testconfig (hash-table-ref/default testconfig "disks" #f)) (hash-table-ref/default confdat "disks" #f))) (minspace (let ((m (configf:lookup confdat "setup" "minspace"))) (string->number (or m "10000"))))) |
︙ | ︙ | |||
1133 1134 1135 1136 1137 1138 1139 | (create-directory work-area #t) (debug:print 0 *default-log-port* "WARNING: No disk work area specified - running in the test directory under tmp_run"))) (set! cmdparms (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) | | | 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 | (create-directory work-area #t) (debug:print 0 *default-log-port* "WARNING: No disk work area specified - running in the test directory under tmp_run"))) (set! cmdparms (base64:base64-encode (z3:encode-buffer (with-output-to-string (lambda () ;; (list 'hosts hosts) (write (list (list 'testpath test-path) (list 'transport (conc (rmt:run-id->transport-type run-id))) ;; (list 'serverinf *server-info*) (list 'toppath *toppath*) (list 'work-area work-area) (list 'test-name test-name) (list 'runscript runscript) (list 'run-id run-id ) (list 'test-id test-id ) |
︙ | ︙ |
Modified megatest.scm from [53f98c25e7] to [c7d404dd53].
︙ | ︙ | |||
733 734 735 736 737 738 739 | ;; we start the server if not running else start the client thread ;;====================================================================== (if (args:get-arg "-server") ;; Server? Start up here. ;; | | | | | | | | | 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 | ;; we start the server if not running else start the client thread ;;====================================================================== (if (args:get-arg "-server") ;; Server? Start up here. ;; (let* ((tl (launch:setup)) (run-id (and (args:get-arg "-run-id") (string->number (args:get-arg "-run-id"))))) (BB> "megatest -server called; starting server") (if run-id (begin (server:launch run-id (->string *transport-type*)) (set! *didsomething* #t)) (debug:print-error 0 *default-log-port* "server requires run-id be specified with -run-id"))) ;; Not a server? This section will decide how to communicate ;; ;; Setup client for all expect listed here (if (null? (lset-intersection equal? (hash-table-keys args:arg-hash) '("-list-servers" |
︙ | ︙ | |||
780 781 782 783 784 785 786 | (if (or (args:get-arg "-list-servers") (args:get-arg "-stop-server") (args:get-arg "-kill-server")) (let ((tl (launch:setup))) (if tl (let* ((tdbdat (tasks:open-db)) (servers (tasks:get-all-servers (db:delay-if-busy tdbdat))) | | | | > | | | 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 | (if (or (args:get-arg "-list-servers") (args:get-arg "-stop-server") (args:get-arg "-kill-server")) (let ((tl (launch:setup))) (if tl (let* ((tdbdat (tasks:open-db)) (servers (tasks:get-all-servers (db:delay-if-busy tdbdat))) (fmtstr "~5a~6a~12a~8a~20a~24a~10a~10a~10a~10a\n") (servers-to-kill '()) (kill-switch (if (args:get-arg "-kill-server") "-9" "")) (killinfo (or (args:get-arg "-stop-server") (args:get-arg "-kill-server") )) (khost-port (if killinfo (if (substring-index ":" killinfo)(string-split ":") #f) #f)) (sid (if killinfo (if (substring-index ":" killinfo) #f (string->number killinfo)) #f))) (format #t fmtstr "Id" "RunId" "MTver" "Pid" "Host" "Interface:OutPort" "InPort" "LastBeat" "State" "Transport") (format #t fmtstr "==" "=====" "=====" "===" "====" "=================" "======" "========" "=====" "=========") (for-each (lambda (server) (let* ((id (vector-ref server 0)) (pid (vector-ref server 1)) (hostname (vector-ref server 2)) (interface (vector-ref server 3)) (pullport (vector-ref server 4)) (pubport (vector-ref server 5)) (start-time (vector-ref server 6)) (priority (vector-ref server 7)) (state (vector-ref server 8)) (mt-ver (vector-ref server 9)) (last-update (vector-ref server 10)) (transport (vector-ref server 11)) (run-id (vector-ref server 12)) (killed #f) (status (< last-update 20))) ;; (zmq-sockets (if status (server:client-connect hostname port) #f))) ;; no need to login as status of #t indicates we are connecting to correct ;; server (if (equal? state "dead") (if (> last-update (* 25 60 60)) ;; keep records around for slighly over a day. (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid action: 'delete)) (if (> last-update 20) ;; Mark as dead if not updated in last 20 seconds (tasks:server-deregister (db:delay-if-busy tdbdat) hostname pullport: pullport pid: pid))) (format #t fmtstr id run-id mt-ver pid hostname (conc interface ":" pullport) pubport last-update (if status state "dead") transport) (if (or (equal? id sid) (equal? sid 0)) ;; kill all/any (begin (debug:print-info 0 *default-log-port* "Attempting to kill "kill-switch" server with pid " pid) (tasks:kill-server hostname pid kill-switch: kill-switch))))) servers) (debug:print-info 1 *default-log-port* "Done with listservers") |
︙ | ︙ |
Modified rmt.scm from [51e718f694] to [0f06af5dde].
|
| | | | > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 | ;; Copyright 2006-2016, Matthew Welland. ;; ;; This program is made available under the GNU GPL version 2.0 or ;; greater. See the accompanying file COPYING for details. ;; ;; This program is distributed WITHOUT ANY WARRANTY; without even the ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. ;;====================================================================== (use json format) ;; RADT => purpose of json format?? (declare (unit rmt)) (declare (uses api)) (declare (uses tdb)) (declare (uses http-transport)) (declare (uses rpc-transport)) ;; ;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!! ;; ;; ;; For debugging add the following to ~/.megatestrc ;; ;; (require-library trace) |
︙ | ︙ | |||
57 58 59 60 61 62 63 | (debug:print-info 1 *default-log-port* "db write rate too high, starting a server, count=" count " start=" start " run-id=" run-id " queries-per-second=" queries-per-second) #t) #f)))) ;; if a server is either running or in the process of starting call client:setup ;; else return #f to let the calling proc know that there is no server available ;; | > > | > > | > > > > > > > > > > > > > > > > > > > > > > | > > > > > > > > > > > > > > > > | | > > > > > > > | | | < < < < < | < | | > > > > > > | > > | | | | < | > > > > > | | > > | > | | | > > > > > > > > > > > | | > | | | | | | > > | | | | | | | | > > > > > | > | > < < < < < < | > | | | | < < < < < < < < < < < < < < < < < | 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 | (debug:print-info 1 *default-log-port* "db write rate too high, starting a server, count=" count " start=" start " run-id=" run-id " queries-per-second=" queries-per-second) #t) #f)))) ;; if a server is either running or in the process of starting call client:setup ;; else return #f to let the calling proc know that there is no server available ;; (define *rrr-mutex* (make-mutex)) (define (rmt:get-cinfo rid) (mutex-lock! *rrr-mutex*) (let* ((run-id (if rid rid 0)) (cinfo (hash-table-ref/default *runremote* run-id #f))) (mutex-unlock! *rrr-mutex*) cinfo)) (define (rmt:set-cinfo rid server-dat) (mutex-lock! *rrr-mutex*) (let* ((run-id (if rid rid 0)) (res (hash-table-set! *runremote* run-id server-dat))) (mutex-unlock! *rrr-mutex*) res)) (define (rmt:del-cinfo rid) (mutex-lock! *rrr-mutex*) (let* ((run-id (if rid rid 0)) (res (hash-table-delete! *runremote* run-id))) (mutex-unlock! *rrr-mutex*) res)) (define (rmt:get-connection-info-start-server-if-none run-id) (let ((cinfo (rmt:get-cinfo run-id))) (if cinfo cinfo ;; NB// can cache the answer for server running for 10 seconds ... ;; ;; (and (not (rmt:write-frequency-over-limit? cmd run-id)) (if (tasks:server-running-or-starting? (db:delay-if-busy (tasks:open-db)) run-id) (client:setup run-id) #f)))) (define (rmt:run-id->transport-type run-id) (let* ((connection-info (rmt:get-cinfo run-id))) ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also) (if connection-info ;; if we already have a connection for this run-id, use that precendent ;; use the server if have connection info (let* ((transport-type (vector-ref connection-info 6))) ;; BB: assumes all transport-type'-servertdat vector's item 6 ids transport type transport-type) ;; otherwise pick the global default as preference. (set in common.scm) *transport-type*))) (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define *rmt:srmutex* (make-mutex)) (define (rmt:send-receive cmd rid params #!key (attemptnum 1)) ;; start attemptnum at 1 so the modulo below works as expected ;; side-effect: clean out old connections (when (eq? (modulo attemptnum 5) 0) (debug:print-error 0 *default-log-port* "rmt:send-receive did not succeed after "(sub1 attemptnum)" tries. Aborting. (cmd="cmd" rid="rid" param="params) (exit 1)) (mutex-lock! *rmt:srmutex*) ;; deadlock is here! ;; expire connections (let ((expire-time (- (current-seconds) (server:get-timeout) 60))) ;; don't forget the 60 second margin (for-each (lambda (run-id) (let ((connection (rmt:get-cinfo run-id))) (if (and (vector? connection) (< (http-transport:server-dat-get-last-access connection) expire-time)) ;; BB> BBTODO: make this generic, not http transport specific. (begin (debug:print-info 0 *default-log-port* "Discarding connection to server for run-id " run-id ", too long between accesses") (hash-table-delete! *runremote* run-id))))) (hash-table-keys *runremote*))) (let* ((run-id (if rid rid 0)) (connection-info (rmt:get-connection-info-start-server-if-none run-id))) ;; the nmsg method does the encoding under the hood (the http method should be changed to do this also) ;;(BB> "in rmt:send-receive; run-id="run-id";;connection-info="connection-info) (if connection-info ;; use the server if have connection info (let* ((transport-type (rmt:run-id->transport-type run-id)) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Here, we make request to remote server ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; (dat (begin (case transport-type ;; BB: replaced *transport-type* global with run-id specific transport-type ((http)(condition-case (http-transport:client-api-send-receive run-id connection-info cmd params) ((commfail)(vector #f "communications fail")) ((exn)(vector #f "other fail")))) ((rpc) (rpc-transport:client-api-send-receive run-id connection-info cmd params)) ;; BB: let us error out for now (else (debug:print-error 0 *default-log-port* "(1) Transport [" transport-type "] specified for run-id [" run-id "] is not implemented in rmt:send-receive. Cannot proceed." (symbol? transport-type)) (vector #f (conc "transport ["transport-type"] unimplemented")))))) (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) ;;(BB> "in rmt:send-receive; transport-type="transport-type" success="success" connection-info="connection-info" res="res " dat="dat) (if (and success (vector? connection-info)) (http-transport:server-dat-update-last-access connection-info)) ;; BB> BBTODO: make this generic, not http transport specific. (if success (begin (mutex-unlock! *rmt:srmutex*) ;; (mutex-unlock! *send-receive-mutex*) (case transport-type ((http rpc) res) ;; (db:string->obj res)) (else (debug:print-error 0 *default-log-port* "(2) Transport [" transport-type "] specified for run-id [" run-id "] is not implemented in rmt:send-receive. Cannot proceed. Also unexpected since this branch follows success which would follow a suported transport...") #f) )) ;; (vector-ref res 1))) ;; no success... (begin ;; let ((new-connection-info (client:setup run-id))) (debug:print 0 *default-log-port* "WARNING: Communication failed, trying call to rmt:send-receive again.") (mutex-unlock! *rmt:srmutex*) (rmt:del-cinfo run-id) ;; don't keep using the same connection (rmt:send-receive cmd rid params attemptnum: attemptnum) ;; (case transport-type ;; ((http rpc) ;; ;; NOTE: killing server causes this process to block forever. No idea why. Dec 2. ;; ;; (if (eq? (modulo attemptnum 5) 0) ;; ;; (tasks:kill-server-run-id run-id tag: "api-send-receive-failed")) ;; ;; (mutex-unlock! *send-receive-mutex*) ;; close the mutex here to allow other threads access to communications ;; (tasks:start-and-wait-for-server (tasks:open-db) run-id 15) ;; (thread-sleep! 5) ;; ;; (nmsg-transport:client-api-send-receive run-id connection-info cmd param remtries: (- remtries 1)))))) ;; ;; no longer killing the server in http-transport:client-api-send-receive ;; ;; may kill it here but what are the criteria? ;; ;; start with three calls then kill server ;; ;; (if (eq? attemptnum 3)(tasks:kill-server-run-id run-id)) ;; ;; (thread-sleep! 2) ;; (rmt:send-receive cmd run-id params attemptnum: (+ attemptnum 1))) ;; (else ;; (debug:print-error 0 *default-log-port* "(3) Transport [" transport-type ;; "] specified for run-id [" run-id ;; "] is not implemented in rmt:send-receive. Cannot proceed.") ;; (exit 1))) ))) ;; no connection info; try to start a server ;; ;; Note: The tasks db was checked for a server in starting mode in the rmt:get-connection-info call ;; (begin (mutex-unlock! *rmt:srmutex*) (tasks:start-and-wait-for-server (db:delay-if-busy (tasks:open-db)) run-id 10) (thread-sleep! (random 5)) ;; give some time to settle and minimize collison? (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1)))))) (define (rmt:update-db-stats run-id rawcmd params duration) (mutex-lock! *db-stats-mutex*) (handle-exceptions exn (begin (debug:print 0 *default-log-port* "WARNING: stats collection failed in update-db-stats") |
︙ | ︙ | |||
224 225 226 227 228 229 230 | (if (> tot 10) (cons newmax-cmd currmax) (cons 'none 0)) (loop (car tal)(cdr tal) newmax-cmd currmax))))))) (mutex-unlock! *db-stats-mutex*) res)) | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < > > | | | | > > > > > > > > > > > > | | 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 | (if (> tot 10) (cons newmax-cmd currmax) (cons 'none 0)) (loop (car tal)(cdr tal) newmax-cmd currmax))))))) (mutex-unlock! *db-stats-mutex*) res)) (define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params) (let* ((run-id (if run-id run-id 0)) ;; (jparams (db:obj->string params)) ;; (rmt:dat->json-str params)) (res (case (rmt:run-id->transport-type run-id) ((http) (handle-exceptions exn #f (http-transport:client-api-send-receive run-id connection-info cmd params))) ((rpc) (handle-exceptions exn #f (rpc-transport:client-api-send-receive run-id connection-info cmd params))) (else (debug:print-error 0 *default-log-port* "(4) Transport [" *transport-type* "] specified for run-id [" run-id "] is not implemented in rmt:send-receive-no-auto-client-setup. Cannot proceed.") (exit 1))))) ;; ((commfail) (vector #f "communications fail"))))) (if (and res (vector-ref res 0)) (vector-ref res 1) ;;; YES!! THIS IS CORRECT!! CHANGE IT HERE, THEN CHANGE rmt:send-receive ALSO!!! #f))) ;; (db:string->obj (vector-ref dat 1)) ;; (begin ;; (debug:print-error 0 *default-log-port* "rmt:send-receive-no-auto-client-setup failed, attempting to continue. Got " dat) ;; dat)))) |
︙ | ︙ | |||
308 309 310 311 312 313 314 | (define (rmt:login run-id) (rmt:send-receive 'login run-id (list *toppath* megatest-version run-id *my-client-signature*))) ;; This login does no retries under the hood - it acts a bit like a ping. ;; Deprecated for nmsg-transport. ;; (define (rmt:login-no-auto-client-setup connection-info run-id) | < | < < | 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 | (define (rmt:login run-id) (rmt:send-receive 'login run-id (list *toppath* megatest-version run-id *my-client-signature*))) ;; This login does no retries under the hood - it acts a bit like a ping. ;; Deprecated for nmsg-transport. ;; (define (rmt:login-no-auto-client-setup connection-info run-id) (rmt:send-receive-no-auto-client-setup connection-info 'login run-id (list *toppath* megatest-version run-id *my-client-signature*))) ;; hand off a call to one of the db:queries statements ;; added run-id to make looking up the correct db possible ;; (define (rmt:general-call stmtname run-id . params) (rmt:send-receive 'general-call run-id (append (list stmtname run-id) params))) |
︙ | ︙ |
Modified rpc-transport.scm from [62a65daa58] to [46793bd282].
1 |
| | | 1 2 3 4 5 6 7 8 9 | ;; Copyright 2006-2016, Matthew Welland. ;; ;; This program is made available under the GNU GPL version 2.0 or ;; greater. See the accompanying file COPYING for details. ;; ;; This program is distributed WITHOUT ANY WARRANTY; without even the ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. |
︙ | ︙ | |||
19 20 21 22 23 24 25 26 27 | (declare (uses common)) (declare (uses db)) (declare (uses tests)) (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. (include "common_records.scm") (include "db_records.scm") ;; procstr is the name of the procedure to be called as a string | > > > > | > > > > > > > > > > > > > > > > > > > > > > > > | | | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > > > > > > > > > > > > | > > > > > > > | > > > > > > > > | > > > > > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > < < | > > | | > > > > > | < | | | > > > > > | > > > > > > > > > > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > > | > > > > > > > > > > > > > | | < > | < > > > > > > | > | | > > > | | > > > > > > > > > > > > > > | > > | | | > | > > > > > > | | | | | | | | > > > > > > > > > > > > > | < < < | | | | > > > > > | < < < < > | > > > > > > > > > > > | | | | | > > > > > > > | > > > > > > > > > > > > > | > > > > > > > | | | > > | < > | > > > > > > > > > | | > > | > > > > | > > > > > > > > > > > | > > > > > | > > > > > > > > > > > > | > | | | > > > | > | | < | | | < < < < | < < | | | < < | < < < < < < < < < < < | < > | | > | > > | | < < < | < < < < < > | < | > | < < < > | < < < < | > > > > | | > | | | > > > | > > | | > | 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 | (declare (uses common)) (declare (uses db)) (declare (uses tests)) (declare (uses tasks)) ;; tasks are where stuff is maintained about what is running. (include "common_records.scm") (include "db_records.scm") (define *heartbeat-mutex* (make-mutex)) (define *server-loop-heart-beat* (current-seconds)) ;; procstr is the name of the procedure to be called as a string (define (rpc-transport:autoremote procstr params) ;; may be unused, I think api-exec deprecates this one. (let* ((procsym (if (symbol? procstr) procstr (string->symbol (->string procstr)))) (res (begin (apply (eval procsym) params)))) res)) ;; rpc receiver (define (rpc-transport:api-exec cmd params) (let* ( (resdat (api:execute-requests *inmemdb* (vector cmd params))) ;; #( flag result ) (flag (vector-ref resdat 0)) (res (vector-ref resdat 1))) (mutex-lock! *heartbeat-mutex*) (set! *last-db-access* (current-seconds)) ;; bump *last-db-access*; this will renew keep-running thread's lease on life for another (server:get-timeout) seconds ;;(BB> "in api-exec; last-db-access updated to "*last-db-access*) (mutex-unlock! *heartbeat-mutex*) res)) ;; (handle-exceptions ;; exn ;; (begin ;; (debug:print 0 *default-log-port* "Remote failed for " proc " " params " exn="exn) ;; (apply (eval (string->symbol procstr)) params)) ;; ;; (if *runremote* ;; ;; (apply (eval (string->symbol (conc "remote:" procstr))) params) ;; (apply (eval (string->symbol procstr)) params))) ;; retry an operation (depends on srfi-18) ;; ================== ;; idea here is to avoid spending time on coding retrying something. Trying to be generic here. ;; ;; Exception handling: ;; ------------------- ;; if evaluating the thunk results in exception, it will be retried. ;; on last try, if final-failure-returns-actual is true, the exception will be re-thrown to caller. ;; ;; look at options below #!key to see how to configure behavior ;; ;; (define (retry-thunk the-thunk #!key ;;;; options below (accept-result? (lambda (x) x)) ;; retry if predicate applied to thunk's result is false (retries 4) ;; how many tries (failure-value #f) ;; return this on final failure, unless following option is enabled: (final-failure-returns-actual #f) ;; on failure, on the last try, just return the result, not failure-value (retry-delay 0.1) ;; delay between tries (back-off-factor 1) ;; multiply retry-delay by this factor on retry (random-delay 0.1) ;; add a random portion of this value to wait (chatty #f) ;; print status as we go, for debugging. ) (when chatty (print) (print "Entered retry-thunk") (print "-=-=-=-=-=-")) (let* ((guarded-thunk ;; we are guarding the thunk against exceptions. We will record whether result of evaluation is an exception or a regular result. (lambda () (let* ((EXCEPTION (gensym)) ;; using gensym to avoid potential collision (res (condition-case (the-thunk) ;; this is what we are guarding the execution of [x () (cons EXCEPTION x)] ))) (cond ((and (pair? res) (eq? (car res) EXCEPTION)) (if chatty (print " - the-thunk threw exception >"(cdr res)"<")) (cons 'exception (cdr res))) (else (if chatty (print " - the-thunk returned result >"res"<")) (cons 'regular-result res))))))) (let loop ((guarded-res (guarded-thunk)) (retries-left retries) (fail-wait retry-delay)) (if chatty (print " ==========")) (let* ((wait-time (+ fail-wait (+ (* fail-wait back-off-factor) (* random-delay (/ (random 1024) 1024) )))) (res-type (car guarded-res)) (res-value (cdr guarded-res))) (cond ((and (eq? res-type 'regular-result) (accept-result? res-value)) (if chatty (print " + return result that satisfied accept-result? >"res-value"<")) res-value) ((> retries-left 0) (if chatty (print " - sleep "wait-time)) (thread-sleep! wait-time) (if chatty (print " + retry ["retries-left" tries left]")) (loop (guarded-thunk) (sub1 retries-left) wait-time)) ((eq? res-type 'regular-result) (if final-failure-returns-actual (begin (if chatty (print " + last try failed- return the result >"res-value"<")) res-value) (begin (if chatty (print " + last try failed- return canned failure value >"failure-value"<")) failure-value))) (else ;; no retries left; result was not accepted and res-type can only be 'exception (if final-failure-returns-actual (begin (if chatty (print " + last try failed with exception- re-throw it >"res-value"<")) (abort res-value)); re-raise the exception. TODO: find a way for call-history to show as though from entry to this function (begin (if chatty (print " + last try failed with exception- return canned failure value >"failure-value"<")) failure-value)))))))) (define (rpc-transport:server-shutdown server-id rpc:listener #!key (from-on-exit #f)) (on-exit (lambda () #t)) ;; turn off on-exit stuff ;;(tcp-close rpc:listener) ;; gotta exit nicely ;;(tasks:bb-server-set-state! server-id "stopped") ;; TODO: (low) the following is extraordinaritly slow. Maybe we don't even need portlogger for rpc anyway?? the exception-based failover when ports are taken is fast! ;;(portlogger:open-run-close portlogger:set-port (rpc:default-server-port) "released") (set! *time-to-exit* #t) (if *inmemdb* (db:sync-touched *inmemdb* *run-id* force-sync: #t)) (tasks:bb-server-delete-record server-id " rpc-transport:keep-running complete") ;;(BB> "Before (exit) (from-on-exit="from-on-exit")") (unless from-on-exit (exit)) ;; sometimes we hang (around) here with 100% cpu. ;;(BB> "After") ;; strace reveals endless: ;; getrusage(RUSAGE_SELF, {ru_utime={413, 917868}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 9874}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 13874}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 105880}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 109880}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 201886}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 205886}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 297892}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 301892}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 393898}, ru_stime={0, 60003}, ...}) = 0 ;; getrusage(RUSAGE_SELF, {ru_utime={414, 397898}, ru_stime={0, 60003}, ...}) = 0 ;; make a post to chicken-users w/ http://paste.call-cc.org/paste?id=60a4b66a29ccf7d11359ea866db642c970735978 (if from-on-exit ;; avoid above condition! End current process externally since 1 in 20 (exit)'s result in hung, 100% cpu zombies. (see above) (system (conc "kill -9 "(current-process-id)))) ) ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (rpc-transport:launch run-id) (set! *run-id* run-id) ;; ;; send to background if requested ;; (when (args:get-arg "-daemonize") ;; (daemon:ize) ;; (when *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it ;; (current-error-port *alt-log-file*) ;; (current-output-port *alt-log-file*))) ;; double check we dont alrady have a running server for this run-id (when (server:check-if-running run-id) (debug:print 0 *default-log-port* "INFO: Server for run-id " run-id " already running") (exit 0)) ;; let's get a server-id for this server ;; if at first we do not suceed, try 3 more times. (let ((server-id (retry-thunk (lambda () (tasks:bb-server-lock-slot run-id 'rpc)) chatty: #f retries: 4))) (when (not server-id) ;; dang we couldn't get a server-id. ;; since we didn't get the server lock we are going to clean up and bail out (debug:print-info 2 *default-log-port* "INFO: server pid=" (current-process-id) ", hostname=" (get-host-name) " not starting due to other candidates ahead in start queue") (tasks:bb-server-delete-records-for-this-pid " rpc-transport:launch") (exit 1)) ;; we got a server-id (and a corresponding entry in servers table in globally shared mdb) ;; all systems go. Proceed to setup rpc server. (rpc-transport:run (if (args:get-arg "-server") (args:get-arg "-server") "-") run-id server-id) (exit))) (define *rpc-listener-port* #f) (define *rpc-listener-port-bind-timestamp* #f) (define *on-exit-flag #f) (define (rpc-transport:server-dat-get-iface vec) (vector-ref vec 0)) (define (rpc-transport:server-dat-get-port vec) (vector-ref vec 1)) (define (rpc-transport:server-dat-get-last-access vec) (vector-ref vec 5)) (define (rpc-transport:server-dat-get-transport vec) (vector-ref vec 6)) (define (rpc-transport:server-dat-update-last-access vec) (if (vector? vec) (vector-set! vec 5 (current-seconds)) (begin (print-call-chain (current-error-port)) (debug:print-error 0 *default-log-port* "call to rpc-transport:server-dat-update-last-access with non-vector!!")))) (define *api-exec-ht* (make-hash-table)) ;; let's see if caching the rpc stub curbs thread-profusion on server side (define (rpc-transport:get-api-exec iface port) (let* ((lu (hash-table-ref/default *api-exec-ht* (cons iface port) #f))) (if lu lu (let ((res (rpc:procedure 'api-exec iface port))) (hash-table-set! *api-exec-ht* (cons iface port) res) res)))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; this client-side procedure makes rpc call to server and returns result ;; (define (rpc-transport:client-api-send-receive run-id serverdat cmd params #!key (numretries 3)) ;;(BB> "entered rpc-transport:client-api-send-receive with run-id="run-id " serverdat="serverdat" cmd="cmd" params="params" numretries="numretries) (if (not (vector? serverdat)) (begin (BB> "WHAT?? for run-id="run-id", serverdat="serverdat) (print-call-chain) (exit 1))) (let* ((iface (rpc-transport:server-dat-get-iface serverdat)) (port (rpc-transport:server-dat-get-port serverdat)) (res #f) (api-exec (rpc-transport:get-api-exec iface port)) ;; chached by host/port. may need to clear... (send-receive (lambda () (tcp-buffer-size 0) (set! res (retry-thunk (lambda () (condition-case ;;(vector #t (run-remote cmd params)) (vector 'success (api-exec cmd params)) [x (exn i/o net) (vector 'comms-fail (conc "communications fail ["(->string x)"]") x)] [x () (vector 'other-fail "other fail ["(->string x)"]" x)])) chatty: #f accept-result?: (lambda(x) (and (vector? x) (vector-ref x 0))) retries: 4 back-off-factor: 1.5 random-wait: 0.2 retry-delay: 0.1 final-failure-returns-actual: #t)) ;;(BB> "HEY res="res) res )) (th1 (make-thread send-receive "send-receive")) (time-out-reached #f) (time-out (lambda () (thread-sleep! 45) (set! time-out-reached #t) (thread-terminate! th1) #f)) (th2 (make-thread time-out "time out"))) (thread-start! th1) (thread-start! th2) (thread-join! th1) (thread-terminate! th2) ;;(BB> "alt got res="res) (debug:print-info 11 *default-log-port* "got res=" res) (if (vector? res) (case (vector-ref res 0) ((success) (vector #t (vector-ref res 1))) ((comms-fail) (debug:print 0 *default-log-port* "WARNING: comms failure for rpc request >>"res"<<") ;;(debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector-ref res 1))) (else (debug:print-error 0 *default-log-port* "error occured at server, info=" (vector-ref res 1)) (debug:print 0 *default-log-port* " client call chain:") (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " server call chain:") (pp (vector-ref res 1) (current-error-port)) (signal (vector-ref res 2)))) (signal (make-composite-condition (make-property-condition 'timeout 'message "nmsg-transport:client-api-send-receive-raw timed out talking to server")))))) (define (rpc-transport:run hostn run-id server-id) (debug:print 2 *default-log-port* "Attempting to start the rpc server ...") ;; (trace rpc:publish-procedure!) ;;====================================================================== ;; start of publish-procedure section ;;====================================================================== (rpc:publish-procedure! 'server:login server:login) ;; this allows client to validate it is the same megatest instance as the server. No security here, just making sure we're in the right room. (rpc:publish-procedure! 'testing (lambda () "Just testing")) ;; procedure to receive arbitrary API request from client's rpc:send-receive/rpc-transport:client-api-send-receive (rpc:publish-procedure! 'rpc-transport:autoremote rpc-transport:autoremote) ;; can use this to run most anything at the remote (rpc:publish-procedure! 'api-exec rpc-transport:api-exec) ;;====================================================================== ;; end of publish-procedure section ;;====================================================================== (let* ((db #f) (hostname (let ((res (get-host-name))) res)) (server-start-time (current-seconds)) (server-timeout (server:get-timeout)) (ipaddrstr (let* ((ipstr (if (string=? "-" hostn) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") (server:get-best-guess-address hostname) #f)) (res (if ipstr ipstr hostn))) res)) ;; hostname))) (start-port (let ((res (portlogger:open-run-close portlogger:find-port))) ;; BB> TODO: remove portlogger! res)) (link-tree-path (configf:lookup *configdat* "setup" "linktree")) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; rpc:listener is the tcp-listen result from inside the find-free-port-and-open complex. ;; It is our handle on the listening tcp port ;; We will attach this to our rpc server with rpc:make-server in thread th1 . (rpc:listener (rpc-transport:find-free-port-and-open start-port)) (th1 (make-thread (lambda () ((rpc:make-server rpc:listener) #t) ) "rpc:server")) (hostname (if (string=? "-" hostn) (get-host-name) hostn)) (ipaddrstr (if (string=? "-" hostn) (server:get-best-guess-address hostname) ;; (string-intersperse (map number->string (u8vector->list (hostname->ip hostname))) ".") #f)) (portnum (let ((res (rpc:default-server-port))) res)) (host:port (conc (if ipaddrstr ipaddrstr hostname) ":" portnum))) ;; BB> TODO: remove portlogger! ;; if rpc found it needed a different port than portlogger provided, keep portlogger in the loop. ;; (when (not (equal? start-port portnum)) ;; (BB> "portlogger proffered "start-port" but rpc grabbed "portnum) ;; (portlogger:open-run-close portlogger:set-port start-port "released") ;; (portlogger:open-run-close portlogger:take-port portnum)) (tasks:bb-server-set-interface-port server-id ipaddrstr portnum) ;;============================================================ ;; activate thread th1 to attach opened tcp port to rpc server ;;============================================================= (thread-start! th1) (set! db *inmemdb*) (debug:print 0 *default-log-port* "Server started on " host:port) ;; (thread-sleep! 5) (if (retry-thunk (lambda () (rpc-transport:self-test run-id ipaddrstr portnum))) (debug:print 0 *default-log-port* "INFO: rpc self test passed!") (begin (debug:print 0 *default-log-port* "Error: rpc listener did not pass self test. Shutting down. On: " host:port) (exit))) (on-exit (lambda () (rpc-transport:server-shutdown server-id rpc:listener from-on-exit: #t))) ;; check again for running servers for this run-id in case one has snuck in since we checked last in rpc-transport:launch (if (not (equal? server-id (tasks:bb-server-am-i-the-server? run-id)));; try to ensure no double registering of servers (begin ;; i am not the server, another server snuck in and beat this one to the punch (tcp-close rpc:listener) ;; gotta exit nicely and free up that tcp port (tasks:bb-server-set-state! server-id "collision")) (begin ;; i am the server ;; setup the in-memory db (set! *inmemdb* (db:setup run-id)) (db:get-db *inmemdb* run-id) ;; let's make it official (set! *rpc:listener* rpc:listener) (tasks:bb-server-set-state! server-id "running") ;; update our mdb servers entry ;; this let loop will hold open this thread until we want the server to shut down. ;; if no requests received within the last 20 seconds : ;; database hasnt changed in ?? ;; ;; begin new loop ;; keep-running loop: polls last-db-access to see if we have timed out. (let loop ((count 0) (bad-sync-count 0)) ;; Use this opportunity to sync the inmemdb to db (let ((start-time (current-milliseconds)) (sync-time #f) (rem-time #f)) ;; inmemdb is a dbstruct (condition-case (db:sync-touched *inmemdb* *run-id* force-sync: #t) ((sync-failed)(cond ((> bad-sync-count 10) ;; time to give up (rpc-transport:server-shutdown server-id rpc:listener)) (else ;; (> bad-sync-count 0) ;; we've had a fail or two, delay and loop (thread-sleep! 5) (loop count (+ bad-sync-count 1))))) ((exn) (debug:print-error 0 *default-log-port* "error from sync code other than 'sync-failed. Attempting to gracefully shutdown the server") (rpc-transport:server-shutdown server-id rpc:listener))) (set! sync-time (- (current-milliseconds) start-time)) (set! rem-time (quotient (- 4000 sync-time) 1000)) (debug:print 4 *default-log-port* "SYNC: time= " sync-time ", rem-time=" rem-time) (if (and (<= rem-time 4) (> rem-time 0)) (thread-sleep! rem-time) (thread-sleep! 4))) ;; fallback for if the math is changed ... (if (< count 1) ;; 3x3 = 9 secs aprox (loop (+ count 1) bad-sync-count)) ;; BB: don't see how this is possible with RPC ;; ;; Check that iface and port have not changed (can happen if server port collides) ;; (mutex-lock! *heartbeat-mutex*) ;; (set! sdat *server-info*) ;; (mutex-unlock! *heartbeat-mutex*) ;; (if (or (not (equal? sdat (list iface port))) ;; (not server-id)) ;; (begin ;; (debug:print-info 0 *default-log-port* "interface changed, refreshing iface and port info") ;; (set! iface (car sdat)) ;; (set! port (cadr sdat)))) ;; Transfer *last-db-access* to last-access to use in checking that we are still alive (mutex-lock! *heartbeat-mutex*) (set! last-access *last-db-access*) (mutex-unlock! *heartbeat-mutex*) ;; (debug:print 11 *default-log-port* "last-access=" last-access ", server-timeout=" server-timeout) ;; ;; no_traffic, no running tests, if server 0, no running servers ;; ;; (let ((wait-on-running (configf:lookup *configdat* "server" b"wait-on-running"))) ;; wait on running tasks (if not true then exit on time out) ;; (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600)) (adjusted-timeout (if (> hrs-since-start 1) (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (if (and *server-run* (> (+ last-access server-timeout) (current-seconds))) (begin (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access))) ;; ;; Consider implementing some smarts here to re-insert the record or kill self is ;; the db indicates so ;; (if (tasks:bb-server-am-i-the-server? run-id) (tasks:bb-server-set-state! server-id "running")) ;; (loop 0 bad-sync-count)) (begin ;;(BB> "SERVER SHUTDOWN CALLED! last-access="last-access" current-seconds="(current-seconds)" server-timeout="server-timeout) (rpc-transport:server-shutdown server-id rpc:listener))))) ;; end new loop )))) (define (rpc-transport:find-free-port-and-open port #!key ) (handle-exceptions exn (begin (print "Failed to bind to port " (rpc:default-server-port) ", trying next port") (rpc-transport:find-free-port-and-open (add1 port))) (rpc:default-server-port port) (set! *rpc-listener-port* port) ;; a bit paranoid about rpc:default-server-port parameter not changing across threads (as params are wont to do). keeping this global in my back pocket in case this causes problems (set! *rpc-listener-port-bind-timestamp* (current-milliseconds)) ;; may want to test how long it has been since the last bind attempt happened... (tcp-read-timeout 240000) (tcp-buffer-size 0) ;; gotta do this because http-transport undoes it. (tcp-listen (rpc:default-server-port) 10000) )) (define (rpc-transport:ping run-id host port) (handle-exceptions exn (begin (print "SERVER_NOT_FOUND exn="exn) (exit 1)) (let ((login-res ((rpc:procedure 'server:login host port) *toppath*))) (if login-res (begin (print "LOGIN_OK") (exit 0)) (begin (print "LOGIN_FAILED") (exit 1)))))) (define (rpc-transport:self-test run-id host port) (tcp-buffer-size 0) ;; gotta do this because http-transport undoes it. (let* ((testing-res ((rpc:procedure 'testing host port))) (login-res ((rpc:procedure 'server:login host port) *toppath*)) (res (and login-res (equal? testing-res "Just testing")))) (if login-res (begin ;;(BB> "Self test PASS. login-res="login-res" testing-res="testing-res" *toppath*="*toppath*) #t) (begin ;;(BB> "Self test fail. login-res="login-res" testing-res="testing-res" *toppath*="*toppath*) #f)) res)) (define (rpc-transport:client-setup run-id server-dat #!key (remtries 10)) ;;(BB> "entered rpc-transport:client-setup with run-id="run-id" and server-dat="server-dat" and retries="remtries) (tcp-buffer-size 0) (debug:print-info 0 *default-log-port* "rpc-transport:client-setup run-id="run-id" server-dat=" server-dat ", remaining-tries=" remtries) (let* ((iface (tasks:hostinfo-get-interface server-dat)) (hostname (tasks:hostinfo-get-hostname server-dat)) (port (tasks:hostinfo-get-port server-dat)) (runremote-server-dat (vector iface port #f #f #f (current-seconds) 'rpc)) ;; http version := (vector iface port api-uri api-url api-req (current-seconds) 'http ) (ping-res (retry-thunk (lambda () ;; make 3 attempts to ping. ((rpc:procedure 'server:login iface port) *toppath*)) chatty: #f retries: 3))) ;; we got here from rmt:get-connection-info on the condition that *runremote* has no entry for run-id... (if ping-res (begin (debug:print-info 0 *default-log-port* "rpc-transport:client-setup CONNECTION ESTABLISHED run-id="run-id" server-dat=" server-dat) (rmt:set-cinfo run-id runremote-server-dat) ;; (hash-table-set! *runremote* run-id runremote-server-dat) ;; side-effect - *runremote* cache init fpr rmt:* runremote-server-dat) (begin ;; login failed but have a server record, clean out the record and try again (debug:print-info 0 *default-log-port* "rpc-transport:client-setup UNABLE TO CONNECT run-id="run-id" server-dat=" server-dat) (tasks:kill-server-run-id run-id) (tasks:bb-server-force-clean-run-record run-id iface port " rpc-transport:client-setup (server-dat = #t)") (if (> remtries 2) (thread-sleep! (+ 1 (random 5))) ;; spread out the starts a little (thread-sleep! (+ 15 (random 20)))) ;; it isn't going well. give it plenty of time (server:try-running run-id) (thread-sleep! 5) ;; give server a little time to start up (client:setup run-id remaining-tries: (sub1 remtries)))))) |
Modified server.scm from [19061b35b0] to [301314345c].
1 |
| | | 1 2 3 4 5 6 7 8 9 | ;; Copyright 2006-2016, Matthew Welland. ;; ;; This program is made available under the GNU GPL version 2.0 or ;; greater. See the accompanying file COPYING for details. ;; ;; This program is distributed WITHOUT ANY WARRANTY; without even the ;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR ;; PURPOSE. |
︙ | ︙ | |||
46 47 48 49 50 51 52 | ;; ;; all routes though here end in exit ... ;; ;; start_server ;; (define (server:launch run-id transport-type) | | | | | | | | | > | | < < | | | | | | | > > > > > > > < < < < | | < | | 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 | ;; ;; all routes though here end in exit ... ;; ;; start_server ;; (define (server:launch run-id transport-type) (let ((ttype (if (symbol? transport-type) transport-type (string->symbol (->string transport-type))))) (case ttype ((http)(http-transport:launch run-id)) ;;((nmsg)(nmsg-transport:launch run-id)) ((rpc) (rpc-transport:launch run-id)) (else (debug:print-error 0 *default-log-port* "unknown server type " ttype))))) ;; (else (debug:print-error 0 *default-log-port* "No known transport set, transport=" transport ", using rpc") ;; (rpc-transport:launch run-id))))) ;;====================================================================== ;; S E R V E R U T I L I T I E S ;;====================================================================== ;; set global *transport-type* based on -transport switch and serer/transport configuration. default http otherwise. ;; called by launch:setup (define (server:set-transport) (let ((ttype (string->symbol (or (args:get-arg "-transport") (configf:lookup *configdat* "server" "transport") "rpc")))) (set! *transport-type* ttype) ttype)) ;; Get the transport -- DO NOT call this from client code. In client code, this is run-id sensitive and not a global ;; For code communicating with existing run-id with a server, use: (rmt:run-id->transport-type run-id) (define (server:get-transport) (if *transport-type* *transport-type* (server:set-transport))) ;; Generate a unique signature for this server (define (server:mk-signature) (message-digest-string (md5-primitive) (with-output-to-string (lambda () (write (list (current-directory) (argv))))))) ;; When using zmq this would send the message back (two step process) ;; with spiffy or rpc this simply returns the return data to be returned ;; (define (server:reply return-addr query-sig success/fail result) (debug:print-info 11 *default-log-port* "server:reply return-addr=" return-addr ", result=" result) ;; (send-message pubsock target send-more: #t) ;; (send-message pubsock (case (server:get-transport) ((rpc) (db:obj->string (vector success/fail query-sig result))) ((http) (db:obj->string (vector success/fail query-sig result))) ((fs) result) (else (debug:print-error 0 *default-log-port* "unrecognised transport type: " *transport-type*) result))) ;; Given a run id start a server process ### NOTE ### > file 2>&1 ;; if the run-id is zero and the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; (define (server:run run-id) (let* ((curr-host (get-host-name)) (curr-ip (server:get-best-guess-address curr-host)) (target-host (configf:lookup *configdat* "server" "homehost" )) (testsuite (common:get-testsuite-name)) (logfile (conc *toppath* "/logs/" run-id ".log")) (cmdln (conc (common:get-megatest-exe) " -server " (or target-host "-") " -run-id " run-id " -log " logfile " -m testsuite:" testsuite))) (debug:print 0 *default-log-port* "INFO: Starting server (" cmdln ") as none running ...") (push-directory *toppath*) (if (not (directory-exists? "logs"))(create-directory "logs")) ;; Rotate logs, logic: ;; if > 500k and older than 1 week: ;; remove previous compressed log and compress this log |
︙ | ︙ | |||
178 179 180 181 182 183 184 | (define (server:try-running run-id) (if (eq? run-id 0) (server:run run-id) (rmt:start-server run-id))) (define (server:check-if-running run-id) (let ((tdbdat (tasks:open-db))) | | > | > | | | > > > > | | | 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 | (define (server:try-running run-id) (if (eq? run-id 0) (server:run run-id) (rmt:start-server run-id))) (define (server:check-if-running run-id) (let ((tdbdat (tasks:open-db))) (let loop ((server (tasks:get-server-info (db:delay-if-busy tdbdat) run-id)) (trycount 0)) (if server ;; note: client:start will set *runremote*. this needs to be changed ;; also, client:start will login to the server, also need to change that. ;; ;; client:start returns #t if login was successful. ;; (let* ((transport-type (rmt:run-id->transport-type run-id)) (res (case transport-type ((http)(server:ping-server run-id (tasks:hostinfo-get-interface server) (tasks:hostinfo-get-port server))) ((rpc) (rpc-transport:ping run-id (tasks:hostinfo-get-interface server) (tasks:hostinfo-get-port server))) (else (debug:print-error 0 *default-log-port* "(5) Transport [" transport-type "] specified for run-id [" run-id "] is not implemented in rmt:send-receive. Cannot proceed.") (exit 1))))) ;; if the server didn't respond we must remove the record (if res #t (begin (debug:print-info 0 *default-log-port* "server at " server " not responding, removing record") (tasks:server-force-clean-running-records-for-run-id (db:delay-if-busy tdbdat) run-id " server:check-if-running") res))) #f)))) ;; called in megatest.scm, host-port is string hostname:port ;; (define (server:ping run-id host:port) (let ((tdbdat (tasks:open-db))) (let* ((host-port (let ((slst (string-split host:port ":"))) (if (eq? (length slst) 2) (list (car slst)(string->number (cadr slst))) #f))) (toppath (launch:setup)) (server-db-dat (if (not host-port)(tasks:get-server-info (db:delay-if-busy tdbdat) run-id) #f))) (if (not run-id) (begin (debug:print-error 0 *default-log-port* "must specify run-id when doing ping, -run-id n") (print "ERROR: No run-id") (exit 1)) (if (and (not host-port) (not server-db-dat)) |
︙ | ︙ | |||
252 253 254 255 256 257 258 259 | (if (eof-object? inl) (case (string->symbol res) ((NOREPLY) #f) ((LOGIN_OK) #t) (else #f)) (loop (read-line) inl)))))) (define (server:login toppath) | > > > < | | | | | | | | | | 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 | (if (eof-object? inl) (case (string->symbol res) ((NOREPLY) #f) ((LOGIN_OK) #t) (else #f)) (loop (read-line) inl)))))) ;; Client will call this procedure on the server via the low-level transport (http/rpc/etc) to verify its toppath matches the server's toppath. ;; A true result means client and server are associated with same megatest instance, share the same megatest.config, etc...) A false result means the client should not talk to this server. (define (server:login toppath) (set! *last-db-access* (current-seconds)) (if (equal? *toppath* toppath) (begin ;; (debug:print-info 2 *default-log-port* "login successful") #t) (begin ;; (debug:print-info 2 *default-log-port* "login failed") #f))) (define (server:get-timeout) (let ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (and (string? tmo) (string->number tmo)) (* 60 60 (string->number tmo)) ;; (* 3 24 60 60) ;; default to three days (* 60 3) ;; default to three minutes ;; (* 60 60 25) ;; default to 25 hours ))) |
Modified tasks.scm from [a06114a2ac] to [736f8afe48].
︙ | ︙ | |||
168 169 170 171 172 173 174 | (define (tasks:hostinfo-get-interface vec) (vector-ref vec 1)) (define (tasks:hostinfo-get-port vec) (vector-ref vec 2)) (define (tasks:hostinfo-get-pubport vec) (vector-ref vec 3)) (define (tasks:hostinfo-get-transport vec) (vector-ref vec 4)) (define (tasks:hostinfo-get-pid vec) (vector-ref vec 5)) (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) | | | | | | 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 | (define (tasks:hostinfo-get-interface vec) (vector-ref vec 1)) (define (tasks:hostinfo-get-port vec) (vector-ref vec 2)) (define (tasks:hostinfo-get-pubport vec) (vector-ref vec 3)) (define (tasks:hostinfo-get-transport vec) (vector-ref vec 4)) (define (tasks:hostinfo-get-pid vec) (vector-ref vec 5)) (define (tasks:hostinfo-get-hostname vec) (vector-ref vec 6)) (define (tasks:server-lock-slot mdb run-id transport-type) (tasks:server-clean-out-old-records-for-run-id mdb run-id " tasks:server-lock-slot") (if (< (tasks:num-in-available-state mdb run-id) 4) (begin (tasks:server-set-available mdb run-id transport-type) (thread-sleep! (/ (random 1500) 1000)) ;; (thread-sleep! 2) ;; Try removing this. It may not be needed. (tasks:server-am-i-the-server? mdb run-id)) #f)) ;; register that this server may come online (first to register goes though with the process) (define (tasks:server-set-available mdb run-id transport-type) (sqlite3:execute mdb "INSERT INTO servers (pid,hostname,port,pubport,start_time, priority,state,mt_version,heartbeat, interface,transport,run_id) VALUES(?, ?, ?, ?, strftime('%s','now'), ?, ?, ?,-1,?, ?, ?);" (current-process-id) ;; pid (get-host-name) ;; hostname -1 ;; port -1 ;; pubport (random 1000) ;; priority (used a tiebreaker on get-available) "available" ;; state (common:version-signature) ;; mt_version -1 ;; interface ;; (conc (server:get-transport)) ;; transport (conc transport-type) ;; transport run-id )) (define (tasks:num-in-available-state mdb run-id) (let ((res 0)) (sqlite3:for-each-row (lambda (num-in-queue) |
︙ | ︙ | |||
289 290 291 292 293 294 295 296 297 298 299 300 301 302 | (remtries 100)) (if (member port used-ports) (if (> remtries 0) (loop (get-rand-port)(- remtries 1)) (get-rand-port)) port)))))) (define (tasks:server-am-i-the-server? mdb run-id) (let* ((all (tasks:server-get-servers-vying-for-run-id mdb run-id)) (first (if (null? all) #f;; (begin (debug:print-error 0 *default-log-port* "no servers listed, should be at least one by now.") ;; (sqlite3:finalize! mdb) ;; (exit 1)) (car (db:get-rows all))))) | > > > > | 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 | (remtries 100)) (if (member port used-ports) (if (> remtries 0) (loop (get-rand-port)(- remtries 1)) (get-rand-port)) port)))))) ;; there can be multiple servers spawned for the same runid. we want exactly zero or one servers per runid. The caller is a nascent server. It wants to know if it should proceed or if it is redundant. this function chooses a winner and tells me if I am the winner. Alternative is lots of runaway servers. Nobody wants that, trust me. ;; ;; algo: get all server info entries for this runid. Each nascent server will insert an entry for its runid before getting here. Entries are visible globally. If current hostname and current processid match first entry, then yes I am the server; return server-id as my prize for winning. Otherwise, I am not the server; return #f. ;; (define (tasks:server-am-i-the-server? mdb run-id) (let* ((all (tasks:server-get-servers-vying-for-run-id mdb run-id)) (first (if (null? all) #f;; (begin (debug:print-error 0 *default-log-port* "no servers listed, should be at least one by now.") ;; (sqlite3:finalize! mdb) ;; (exit 1)) (car (db:get-rows all))))) |
︙ | ︙ | |||
325 326 327 328 329 330 331 | (lambda (a . b) (set! res (cons (apply vector a b) res))) mdb (conc "SELECT " selstr " FROM servers WHERE run_id=? AND state in ('available','running','dbprep') ORDER BY start_time DESC;") run-id) (vector header res))) | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | | | | | 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 | (lambda (a . b) (set! res (cons (apply vector a b) res))) mdb (conc "SELECT " selstr " FROM servers WHERE run_id=? AND state in ('available','running','dbprep') ORDER BY start_time DESC;") run-id) (vector header res))) ;; BB> bb opinion - want to push responsibility into api (encapsulation), like waiting if db is busy and finding the db handle in the first place. why should the caller need to be concerned?? If my opinion carries, we'll remove the bb- and make other needful adjustments. (define (bb-mdb-inserter mdb-expecting-proc mdbless-args) (let ((mdb (db:delay-if-busy (tasks:open-db)))) (apply mdb-expecting-proc (cons mdb mdbless-args)))) (define (tasks:bb-server-force-clean-run-record . args) (bb-mdb-inserter tasks:server-force-clean-run-record args)) (define (tasks:bb-server-lock-slot . args) (bb-mdb-inserter tasks:server-lock-slot args)) (define (tasks:bb-server-set-interface-port . args) (bb-mdb-inserter tasks:server-set-interface-port args)) (define (tasks:bb-server-am-i-the-server? . args) (bb-mdb-inserter tasks:server-am-i-the-server? args)) (define (tasks:bb-server-set-state! . args) (bb-mdb-inserter tasks:server-set-state! args)) (define (tasks:bb-get-server-info . args) (bb-mdb-inserter tasks:get-server-info args)) (define (tasks:bb-num-in-available-state . args) (bb-mdb-inserter tasks:num-in-available-state args)) (define (tasks:bb-server-delete-records-for-this-pid . args) (bb-mdb-inserter tasks:server-delete-records-for-this-pid args)) (define (tasks:bb-server-delete-record . args) (bb-mdb-inserter tasks:server-delete-record args)) ;; BB: renaming tasks:get-server to get-server-info to make clear we aren't creating servers here (define (tasks:get-server-info mdb run-id #!key (retries 10)) (let ((res #f) (best #f)) (handle-exceptions exn (begin (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* "WARNING: tasks:get-server-info db access error.") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 0 *default-log-port* " for run " run-id) (print-call-chain (current-error-port)) (if (> retries 0) (begin (debug:print 0 *default-log-port* " trying call to tasks:get-server-info again in 10 seconds") (thread-sleep! 10) (tasks:get-server-info mdb run-id retries: (- retries 0))) (debug:print 0 *default-log-port* "10 tries of tasks:get-server-info all crashed and burned. Giving up and returning \"no server found\""))) (sqlite3:for-each-row (lambda (id interface port pubport transport pid hostname) (set! res (vector id interface port pubport transport pid hostname))) mdb ;; removed: ;; strftime('%s','now')-heartbeat < 10 AND mt_version = ? "SELECT id,interface,port,pubport,transport,pid,hostname FROM servers |
︙ | ︙ | |||
392 393 394 395 396 397 398 | ;; (else ;; #f)))) ;; try to start a server and wait for it to be available ;; (define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries) ;; ensure a server is running for this run | | | | 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 | ;; (else ;; #f)))) ;; try to start a server and wait for it to be available ;; (define (tasks:start-and-wait-for-server tdbdat run-id delay-max-tries) ;; ensure a server is running for this run (let loop ((server-dat (tasks:get-server-info (db:delay-if-busy tdbdat) run-id)) (delay-time 0)) (if (and (not server-dat) (< delay-time delay-max-tries)) (begin (if (common:low-noise-print 60 "tasks:start-and-wait-for-server" run-id) (debug:print 0 *default-log-port* "Try starting server for run-id " run-id)) (thread-sleep! (/ (random 2000) 1000)) (server:kind-run run-id) (thread-sleep! (min delay-time 1)) (loop (tasks:get-server-info (db:delay-if-busy tdbdat) run-id)(+ delay-time 1)))))) (define (tasks:get-all-servers mdb) (let ((res '())) (sqlite3:for-each-row (lambda (id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) ;; 0 1 2 3 4 5 6 7 8 9 10 11 12 (set! res (cons (vector id pid hostname interface port pubport start-time priority state mt-version last-update transport run-id) res))) |
︙ | ︙ | |||
441 442 443 444 445 446 447 | (unsetenv "TARGETHOST_LOGF") (unsetenv "TARGETHOST")) ;; look up a server by run-id and send it a kill, also delete the record for that server ;; (define (tasks:kill-server-run-id run-id #!key (tag "default")) (let* ((tdbdat (tasks:open-db)) | | | | 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 | (unsetenv "TARGETHOST_LOGF") (unsetenv "TARGETHOST")) ;; look up a server by run-id and send it a kill, also delete the record for that server ;; (define (tasks:kill-server-run-id run-id #!key (tag "default")) (let* ((tdbdat (tasks:open-db)) (sdat (tasks:get-server-info (db:delay-if-busy tdbdat) run-id))) (if sdat (let ((hostname (vector-ref sdat 6)) (pid (vector-ref sdat 5)) (server-id (vector-ref sdat 0))) (tasks:server-set-state! (db:delay-if-busy tdbdat) server-id "killed") (debug:print-info 0 *default-log-port* "Killing server " server-id " for run-id " run-id " on host " hostname " with pid " pid) (tasks:kill-server hostname pid kill-switch: "-9") ;; BB: added -9, let's not be kind here. we need it to die so it isn't a 100% cpu zombie (tasks:server-delete-record (db:delay-if-busy tdbdat) server-id tag) ) (debug:print-info 0 *default-log-port* "No server found for run-id " run-id ", nothing to kill")) ;; (sqlite3:finalize! tdb) )) ;;====================================================================== ;; M O N I T O R S |
︙ | ︙ |
Modified tests/fullrun/megatest.config from [8446f6ae84] to [8124b58630].
︙ | ︙ | |||
151 152 153 154 155 156 157 | [server] # force use of server always # required yes # Use http instead of direct filesystem access | | | 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 | [server] # force use of server always # required yes # Use http instead of direct filesystem access # transport http # transport fs # transport nmsg synchronous 0 # If the server can't be started on this port it will try the next port until # it succeeds |
︙ | ︙ |