Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -1305,34 +1305,45 @@ (define (dbfile:simple-file-lock fname #!key (expire-time 300)) (let ((fmod-time (handle-exceptions ext (current-seconds) (file-modification-time fname)))) + + ;; if the file exists, if it has expired, delete it and call this function recursively. (if (file-exists? fname) (if (> (- (current-seconds) fmod-time) expire-time) (begin + (dbfile:print-err "simple-file-lock: removing expired file: " fname) (handle-exceptions exn #f (delete-file* fname)) (dbfile:simple-file-lock fname expire-time: expire-time)) - #f) - (let ((key-string (conc (get-host-name) "-" (current-process-id))) + #f + ) + + ;; If it doesn't exist, write the host name and process id to the file + (let ((key-string (conc (get-host-name) "-" (current-process-id) ": " (argv))) (oup (open-output-file fname))) (with-output-to-port oup (lambda () (print key-string))) (close-output-port oup) - #;(with-output-to-file fname ;; bizarre. with-output-to-file does not seem to be cleaning up after itself. - (lambda () - (print key-string))) + + + ;; sleep 0.25 seconds and make sure it still exists and contains the same host/process id. + ;; if not, return #f (thread-sleep! 0.25) (if (file-exists? fname) (handle-exceptions exn #f (with-input-from-file fname (lambda () (equal? 
key-string (read-line))))) - #f) + (begin + (dbfile:print-err "dbfile:simple-file-lock created " fname " but it was gone 0.25 seconds later") + #f + ) + ) ) ) ) ) @@ -1352,21 +1363,26 @@ exn #f ;; I don't really care why this failed (at least for now) (delete-file* fname))) (define (dbfile:with-simple-file-lock fname proc #!key (expire-time 300)) - (let ((gotlock (dbfile:simple-file-lock-and-wait fname expire-time: expire-time))) + (let* ((start-time (current-seconds)) + (gotlock (dbfile:simple-file-lock-and-wait fname expire-time: expire-time)) + (end-time (current-seconds)) + ) (if gotlock (let ((res (proc))) (dbfile:simple-file-release-lock fname) res) (begin (dbfile:print-err "dbfile:with-simple-file-lock: " fname " is locked by " ) (with-input-from-file fname (lambda () (dbfile:print-err (read-line)))) - (assert #f (conc "ERROR: simple file lock could not get a lock for " fname " in " expire-time " seconds")) + (dbfile:print-err "wait time = " (- end-time start-time)) + (dbfile:print-err "ERROR: simple file lock could not get a lock for " fname " in " expire-time " seconds") + #f ) ) ) ) Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -604,14 +604,14 @@ (let* ((dbfname (dbmod:run-id->dbfname run-id)) (load (get-normalized-cpu-load)) (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname)))) (cond ((> load 2.0) - (debug:print 0 *default-log-port* "Normalized load "load" is over the limit of 2.0. Not starting a server.") + (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server.") (thread-sleep! 1)) ((> nrun 100) - (debug:print 0 *default-log-port* nrun" servers running on this host, not starting another.") + (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.") (thread-sleep! 1)) (else (if (not (file-exists? 
(conc areapath"/logs"))) (create-directory (conc areapath"/logs") #t)) (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log"))