Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -246,13 +246,13 @@ (pid (tt-conn-pid conn)) (servinf (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname))) (hash-table-set! (tt-conns ttdat) dbfname #f) (if (file-exists? servinf) (begin - (if (< attemptnum 5) + (if (< attemptnum 3) (begin - (thread-sleep! 1) + (thread-sleep! 0.25) (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) (begin (debug:print 0 *default-log-port* "INFO: no response from server "host":"port" for "dbfname) (if (and (file-exists? servinf) (> (- (current-seconds)(file-modification-time servinf)) 60)) @@ -260,11 +260,17 @@ (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.") (handle-exceptions exn #f (delete-file* servinf)) - (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)))))) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) + (begin + ;; start server - addressed in client-connect-to-server + ;; delay - addressed in client-connect-to-server + ;; try again + (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) + )))) (begin ;; no server file, delay and try again (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf) (thread-sleep! 1) (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)))) (begin ;; this case is where res is malformed. Probably should abort @@ -401,17 +407,22 @@ (tt:backoff-incr host port) #f) (exn (i/o net) (if ping-mode #f - (if (>= tries-remaining 0) - (let* ((backoff-delay (* (- 26 tries-remaining) 0.1))) - (debug:print 0 *default-log-port* "WARNING: TCP overload, trying again in "backoff-delay"s.") - (thread-sleep! backoff-delay) - (tt:backoff-incr host port) - (retry)) - (assert #f "FATAL: Too many retries in tt:send-receive-direct")))) + (cond + ((> tries-remaining 4) ;; server likely defunct + (tt:backoff-incr host port) + #f) + ((>= tries-remaining 0) + (let* ((backoff-delay (* (- 26 tries-remaining) 0.1))) + (debug:print 0 *default-log-port* "WARNING: TCP overload, trying again in "backoff-delay"s.") + (thread-sleep! backoff-delay) + (tt:backoff-incr host port) + (retry)) + (assert #f "FATAL: Too many retries in tt:send-receive-direct")) + (else #f)))) (exn () (full-err-print exn "Unhandled exception from client side.") #f)))) @@ -650,11 +661,11 @@ (bad-dat (list #f #f #f #f #f #f logf))) (let ((fdat (handle-exceptions exn (begin ;; WARNING: this is potentially dangerous to blanket ignore the errors - (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn=" exn) + (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn="(condition->list exn)) '()) ;; no idea what went wrong, call it a bad server, return empty list (with-input-from-file logf read-lines)))) (if (null? fdat) ;; bad data, return bad-dat bad-dat (let loop ((inl (car fdat)) @@ -744,12 +755,18 @@ (exn-result #f) (stdout-result (with-output-to-string (lambda () (let ((res (handle-exceptions exn - (begin - (set! exn-result (condition->list exn)) + (let* ((errdat (condition->list exn))) + (set! exn-result errdat) + (debug:print 0 *default-log-port* "ERROR: handler exception, these are bad, will exit in five seconds.") + (pp errdat *default-log-port*) + ;; these are always bad, set up an exit thread + (thread-start! (make-thread (lambda () + (thread-sleep! 5) + (exit)))) #f) (handler indat)))) (set! result res))))) (full-result (list result exn-result (if (equal? stdout-result "") #f stdout-result)))) (handle-exceptions