Overview
Comment: | Force starting a server and wait for it when launching runs. This prevents server run-away but doesn't fix the underlying issue. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.64 |
Files: | files | file ages | folders |
SHA1: |
ba2401c3f651fbfd95ef94256dfc81f6 |
User & Date: | matt on 2017-03-28 10:48:26 |
Other Links: | branch diff | manifest | tags |
Context
2017-03-28
| ||
10:54 | Force starting a server and wait for it when launching runs. This prevents server run-away but doesn't fix the underlying issue. check-in: 27b1636e7b user: matt tags: v1.64 | |
10:48 | Force starting a server and wait for it when launching runs. This prevents server run-away but doesn't fix the underlying issue. check-in: ba2401c3f6 user: matt tags: v1.64 | |
08:39 | Fixed unit test running. To try do: cd tests;make all-rmt.log. Improved information from a server-side crash. check-in: 263cdeb6eb user: matt tags: v1.64 | |
Changes
Modified api.scm from [ca9ed8f403] to [d7ff6e57f4].
︙ | ︙ | |||
124 125 126 127 128 129 130 | (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (cond ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f (vector #f "remote must be called with a vector"))) ((> *api-process-request-count* 20) ;; 20) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") | | | 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 | (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (cond ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f (vector #f "remote must be called with a vector"))) ((> *api-process-request-count* 20) ;; 20) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! (else (let* ((cmd-in (vector-ref dat 0)) (cmd (if (symbol? cmd-in) cmd-in (string->symbol cmd-in))) (params (vector-ref dat 1)) (start-t (current-milliseconds)) |
︙ | ︙ |
Modified http-transport.scm from [6eea7c3a25] to [826398cdb7].
︙ | ︙ | |||
223 224 225 226 227 228 229 | ;; send the data and get the response ;; extract the needed info from the http data and ;; process and return it. (let* ((send-recieve (lambda () (mutex-lock! *http-mutex*) ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines) ;; ((exn http client-error) e (print e))) | | | > | > | 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 | ;; send the data and get the response ;; extract the needed info from the http data and ;; process and return it. (let* ((send-recieve (lambda () (mutex-lock! *http-mutex*) ;; (condition-case (with-input-from-request "http://localhost"; #f read-lines) ;; ((exn http client-error) e (print e))) (set! res (vector ;;; DON'T FORGET - THIS IS THE CLIENT SIDE! NOTE: consider moving this to client.scm since we are only supporting http transport at this time. success (db:string->obj (handle-exceptions exn (let ((call-chain (get-call-chain)) (msg ((condition-property-accessor 'exn 'message) exn))) (set! success #f) (debug:print 0 *default-log-port* "WARNING: failure in with-input-from-request to " fullurl ".") (debug:print 0 *default-log-port* " message: " msg) (debug:print 0 *default-log-port* " cmd: " cmd " params: " params) (if runremote (remote-conndat-set! runremote #f)) ;; Killing associated server to allow clean retry.") ;; (tasks:kill-server-run-id run-id) ;; better to kill the server in the logic that called this routine? (mutex-unlock! *http-mutex*) ;;; (signal (make-composite-condition ;;; (make-property-condition 'commfail 'message "failed to connect to server"))) |
︙ | ︙ |
Modified megatest.scm from [84dec1a162] to [1cdd8cf912].
︙ | ︙ | |||
794 795 796 797 798 799 800 | (begin (debug:print-info 0 *default-log-port* "Attempting to kill server with pid " pid) (server:kill server))))) (sort servers (lambda (a b) (let ((ma (or (any->number (car a)) 9e9)) (mb (or (any->number (car b)) 9e9))) (> ma mb))))) | | | 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 | (begin (debug:print-info 0 *default-log-port* "Attempting to kill server with pid " pid) (server:kill server))))) (sort servers (lambda (a b) (let ((ma (or (any->number (car a)) 9e9)) (mb (or (any->number (car b)) 9e9))) (> ma mb))))) ;; (debug:print-info 1 *default-log-port* "Done with listservers") (set! *didsomething* #t) (exit)) (exit)))) ;; must do, would have to add checks to many/all calls below ;;====================================================================== ;; Weird special calls that need to run *after* the server has started? |
︙ | ︙ |
Modified rmt.scm from [2aa9d28934] to [816af6a52f].
︙ | ︙ | |||
194 195 196 197 198 199 200 | (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time ;; (mutex-unlock! *rmt-mutex*) (debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote) (mutex-unlock! *rmt-mutex*) (if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end | | | > > > > | 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 | (success (if (vector? dat) (vector-ref dat 0) #f)) (res (if (vector? dat) (vector-ref dat 1) #f))) (if (vector? conninfo)(http-transport:server-dat-update-last-access conninfo)) ;; refresh access time ;; (mutex-unlock! *rmt-mutex*) (debug:print-info 13 *default-log-port* "rmt:send-receive, case 9. conninfo=" conninfo " dat=" dat " runremote = " runremote) (mutex-unlock! *rmt-mutex*) (if success ;; success only tells us that the transport was successful, have to examine the data to see if there was a detected issue at the other end (if (and (vector? res) (eq? (vector-length res) 2) (eq? (vector-ref res 2) 'overloaded)) ;; since we are looking at the data to carry the error we'll use a fairly obtuse combo to minimise the chances of some sort of collision. (let ((wait-delay (+ attemptnum (* attemptnum 10)))) (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") (mutex-lock! *rmt-mutex*) (set! *runremote* #f) ;; force starting over (mutex-unlock! *rmt-mutex*) (thread-sleep! wait-delay) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) res) ;; All good, return res (begin (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) (remote-conndat-set! runremote #f) (remote-server-url-set! runremote #f) |
︙ | ︙ |
Modified runs.scm from [ad868adae8] to [b7a198d7dd].
︙ | ︙ | |||
251 252 253 254 255 256 257 258 259 260 261 262 263 264 | (exit 4))))) (thread-start! th2) (thread-start! th1) (thread-join! th2))))) (set-signal-handler! signal/int sighand) (set-signal-handler! signal/term sighand)) (runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process (set! runconf (if (file-exists? runconfigf) (setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target) (begin (debug:print 0 *default-log-port* "WARNING: You do not have a run config file: " runconfigf) #f))) | > > > > | 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 | (exit 4))))) (thread-start! th2) (thread-start! th1) (thread-join! th2))))) (set-signal-handler! signal/int sighand) (set-signal-handler! signal/term sighand)) ;; force the starting of a server (debug:print 0 *default-log-port* "waiting on server...") (server:start-and-wait *toppath*) (runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process (set! runconf (if (file-exists? runconfigf) (setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target) (begin (debug:print 0 *default-log-port* "WARNING: You do not have a run config file: " runconfigf) #f))) |
︙ | ︙ | |||
1174 1175 1176 1177 1178 1179 1180 | waitons: waitons testmode: testmode newtal: newtal itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) | > | | > | < | 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 | waitons: waitons testmode: testmode newtal: newtal itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) ;; every 15 minutes verify the server is there for this run (if (and (common:low-noise-print 240 "try start server" run-id) (not (server:check-if-running *toppath*))) (server:kind-run *toppath*)) (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000))) (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1))) ;; (debug:print 0 *default-log-port* "max-tries-hash: " (hash-table->alist *max-tries-hash*)) |
︙ | ︙ |
Modified server.scm from [0a5d68ff36] to [d6964e3100].
︙ | ︙ | |||
216 217 218 219 220 221 222 | ;; ;; mod-time host port start-time pid ;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off ;; and servers should stick around for about two hours or so. ;; (define (server:get-best srvlst) | > | | | | | | | | | | | | | | | | | | | | | > > > | 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 | ;; ;; mod-time host port start-time pid ;; ;; sort by start-time descending. I.e. get the oldest first. Young servers will thus drop off ;; and servers should stick around for about two hours or so. ;; (define (server:get-best srvlst) (let* ((nums (server:get-num-servers)) (now (current-seconds)) (slst (sort (filter (lambda (rec) (if (and (list? rec) (> (length rec) 2)) (let ((start-time (list-ref rec 3)) (mod-time (list-ref rec 0))) ;; (print "start-time: " start-time " mod-time: " mod-time) (and start-time mod-time (> (- now start-time) 0) ;; been running at least 0 seconds (< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds (< (- now start-time) (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600")) 180) (random 360))) ;; under one hour running time +/- 180 )) #f)) srvlst) (lambda (a b) (< (list-ref a 3) (list-ref b 3)))))) (if (> (length slst) nums) (take slst nums) slst))) (define (server:get-first-best areapath) (let ((srvrs (server:get-best (server:get-list areapath)))) (if (and srvrs (not (null? srvrs))) (car srvrs) #f))) |
︙ | ︙ | |||
303 304 305 306 307 308 309 310 311 312 | (let ((num-ok (length (server:get-best (server:get-list areapath))))) (if (< num-ok 1) ;; if there are no decent candidates for servers then try starting a new one (server:kind-run areapath)) (thread-sleep! 5) (loop (server:check-if-running areapath))))))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; | > > > > > | | < | 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 | (let ((num-ok (length (server:get-best (server:get-list areapath))))) (if (< num-ok 1) ;; if there are no decent candidates for servers then try starting a new one (server:kind-run areapath)) (thread-sleep! 5) (loop (server:check-if-running areapath))))))) (define server:try-running server:run) ;; there is no more per-run servers ;; REMOVE ME. BUG. (define (server:get-num-servers #!key (numservers 2)) (let ((ns (string->number (or (configf:lookup *configdat* "server" "numservers") "notanumber")))) (or ns numservers))) ;; no longer care if multiple servers are started by accident. older servers will drop off in time. ;; (define (server:check-if-running areapath) ;; #!key (numservers "2")) (let* ((ns (server:get-num-servers)) (servers (server:get-best (server:get-list areapath)))) ;; (print "servers: " servers " ns: " ns) (if (or (and servers (null? servers)) (not servers) (and (list? servers) (< (length servers) (random ns)))) ;; somewhere between 0 and numservers |
︙ | ︙ |