Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -181,12 +181,18 @@ (server-start-proc) (tt-last-serv-start-set! ttdat (current-seconds)))) (thread-sleep! 1) (tt:client-connect-to-server ttdat dbfname run-id testsuite))))))) -(define (tt:ping host port server-id) - (let* ((res (tt:send-receive-direct host port `(ping #f #f #f)))) ;; please send me your server-id +(define (tt:ping host port server-id #!optional (tries-left 5)) + (let* ((res (tt:send-receive-direct host port `(ping #f #f #f))) ;; please send me your server-id + (try-again (lambda () + (if (> tries-left 0) + (begin + (thread-sleep! 1) + (tt:ping host port server-id (- tries-left 1))) + #f)))) ;; ;; need two threads, one a 5 second timer ;; (match res ((status errmsg result meta) @@ -197,11 +203,11 @@ (begin (debug:print 0 *default-log-port* "WARNING: server-id does not match, expected: "server-id", got: "result) #f))) (else ;; (debug:print 0 *default-log-port* "res not in form (status errmsg result meta), got: "res) - #f)))) + (try-again))))) ;; client side handler ;; ;;(tt:handler # get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest") ;; @@ -230,34 +236,46 @@ (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, will try again in a 1/4 second.") (thread-sleep! 0.25) (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) (else result))) - (else - (if (not res) + (else ;; did not receive properly formated result + (if (not res) ;; tt:handler is telling us that communication failed (let* ((host (tt-conn-host conn)) (port (tt-conn-port conn)) ;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db (pid (tt-conn-pid conn)) (servinf (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname))) (hash-table-set! (tt-conns ttdat) dbfname #f) (if (file-exists? servinf) (begin - (debug:print 0 *default-log-port* "INFO: no ping response from server "host":"port" for "dbfname) - (if (and (file-exists? servinf) - (> (- (current-seconds)(file-modification-time servinf)) 60)) - (begin - (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.") - (handle-exceptions - exn - #f - (delete-file* servinf))))) - (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf)) - (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) - (assert #f "FATAL: tt:handler received bad data "res))))) - (begin - (thread-sleep! 1) ;; give it a rest and try again + (if (< attemptnum 5) + (begin + (thread-sleep! 1) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) + (begin + (debug:print 0 *default-log-port* "INFO: no response from server "host":"port" for "dbfname) + (if (and (file-exists? servinf) + (> (- (current-seconds)(file-modification-time servinf)) 60)) + (begin + (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.") + (handle-exceptions + exn + #f + (delete-file* servinf)) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)))))) + (begin ;; no server file, delay and try again + (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf) + (thread-sleep! 1) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)))) + (begin ;; this case is where res is malformed. Probably should abort + (assert #f "FATAL: tt:handler received bad data "res) + ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.") + ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe) + ))))) + (begin + (thread-sleep! 1) ;; no conn yet set up, give it a rest and try again (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe))))) (define (tt:bid-for-servership run-id) #f) ADDED utils/load-the-db.scm Index: utils/load-the-db.scm ================================================================== --- /dev/null +++ utils/load-the-db.scm @@ -0,0 +1,30 @@ +;; start the repl and then load this file + +(define start-time (current-seconds)) + +(let loop ((last-print 0) + (num-calls 0)) + (let ((all-run-ids (rmt:get-all-run-ids)) + (do-print (> (- (current-seconds) last-print) 2)) + (max-query 0)) + (for-each + (lambda (run-id) + ;; (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode) + (let* ((all-run-data (rmt:get-tests-for-run run-id "%" '() '() #f #f #f #f #f #f #f #f))) + (for-each + (lambda (testdat) + (let* ((test-id (vector-ref testdat 0)) + (start-at (current-milliseconds)) + (testinfo (rmt:get-test-info-by-id run-id test-id)) + (query-time (- (current-milliseconds) start-at))) + (if (> query-time max-query) + (set! max-query query-time)))) + all-run-data) + (if do-print + (print "Running "(- (current-seconds) start-time)"s, run "run-id" has "(length all-run-data)" tests, max query "max-query)))) + all-run-ids) + (loop (if do-print + (current-seconds) + last-print) + (+ num-calls (length all-run-ids))))) +