Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -295,12 +295,17 @@ (server-dat (if runremote (remote-conndat runremote) #f))) ;; (hash-table-ref/default *runremote* run-id #f))) (if (vector? server-dat) (let ((api-dat (http-transport:server-dat-get-api-uri server-dat))) - (close-connection! api-dat) - #t) + (handle-exceptions + exn + (begin + (print-call-chain *default-log-port*) + (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn))) + (close-connection! api-dat) + #t)) #f))) (define (make-http-transport:server-dat)(make-vector 6)) (define (http-transport:server-dat-get-iface vec) (vector-ref vec 0)) @@ -434,21 +439,21 @@ server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond ((and *server-run* - (not *server-overloaded*) (> (+ last-access server-timeout) (current-seconds)) (< (- (current-seconds) server-start-time) 3600)) ;; do not update log or touch log if we've been running for more than one hour. (if (common:low-noise-print 120 "server continuing") (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions exn (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on log file " server-log-file ". Are you out of space on that disk?") - (change-file-times server-log-file curr-time curr-time)))) + (if (not *server-overloaded*) + (change-file-times server-log-file curr-time curr-time))))) (loop 0 server-state bad-sync-count (current-milliseconds))) (else (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) (http-transport:server-shutdown port))))))) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -1,7 +1,7 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. (declare (unit megatest-version)) -(define megatest-version 1.6416) +(define megatest-version 1.6417) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -133,17 +133,11 @@ ((and runremote (remote-conndat runremote) (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 10)))) ;; Subtract or add the random value? Seems like it should be substract but Neither fixes the "WARNING: failure in with-input-from-request to #.\n message: Server closed connection before sending response" (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") - (let ((old-conn (remote-conndat runremote))) - (handle-exceptions - exn - (begin - (print-call-chain *default-log-port*) - (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn))) - (http-transport:close-connections area-dat: runremote))) + (http-transport:close-connections area-dat: runremote) (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. (mutex-unlock! *rmt-mutex*) (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;;DOT CASE5 [label="local\nread"]; @@ -256,22 +250,24 @@ (eq? (vector-length res) 2) (eq? (vector-ref res 1) 'overloaded)) ;; since we are looking at the data to carry the error we'll use a fairly obtuse combo to minimise the chances of some sort of collision. (let ((wait-delay (+ attemptnum (* attemptnum 10)))) (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") (mutex-lock! *rmt-mutex*) + (http-transport:close-connections area-dat: runremote) (set! *runremote* #f) ;; force starting over (mutex-unlock! *rmt-mutex*) (thread-sleep! wait-delay) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) res) ;; All good, return res (begin (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) (remote-conndat-set! runremote #f) + (http-transport:close-connections area-dat: runremote) (remote-server-url-set! runremote #f) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") - (if (not (server:check-if-running *toppath*)) - (server:start-and-wait *toppath*)) + ;; (if (not (server:check-if-running *toppath*)) + ;; (server:start-and-wait *toppath*)) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))) ;;DOT } ;; (define (rmt:update-db-stats run-id rawcmd params duration) Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -1269,11 +1269,14 @@ ))) (runs:dat-regfull-set! runsdat regfull) ;; every 15 minutes verify the server is there for this run (if (and (common:low-noise-print 240 "try start server" run-id) - (not (server:check-if-running *toppath*))) + (not (or (and *runremote* + (remote-server-url *runremote*) + (server:ping (remote-server-url *runremote*))) + (server:check-if-running *toppath*)))) (server:kind-run *toppath*)) (if (> num-running 0) (set! last-time-some-running (current-seconds))) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -195,11 +195,11 @@ (system (conc "nbfake kill "kill-switch" "pid)) (when logfile (thread-sleep! 0.5) (if (file-exists? gzfile) (delete-file gzfile)) - (system (conc "gzip "logfile)) + (system (conc "gzip " logfile)) (unsetenv "TARGETHOST_LOGF") (unsetenv "TARGETHOST"))))