Index: dbmod.scm ================================================================== --- dbmod.scm +++ dbmod.scm @@ -208,29 +208,38 @@ (dbr:dbstruct-sync-proc-set! dbstruct (lambda (last-update) (if *sync-in-progress* (debug:print 3 *default-log-port* "WARNING: overlapping calls to sync to disk") (let* ((sync-cmd (conc "megatest -db2db -from "tmpdb" -to "dbfullname"&")) - (synclock-file (conc dbfullname".sync-lock"))) - ;; (mutex-lock! *db-with-db-mutex*) ;; this mutex is used when overloaded or during a query that modifies the db - ;; (if (eq? (dbfile:cache-method) 'cachedb) - ;; (dbmod:sync-gasket tables last-update cachedb db - ;; dbfullname syncdir keys) + (synclock-file (conc dbfullname".sync-lock")) + (synclock-mod-time (if (file-exists? synclock-file) + (handle-exceptions + exn + #f + (file-modification-time synclock-file)) + #f)) + (thethread (lambda () + (thread-start! + (make-thread + (lambda () + (set! *sync-in-progress* #t) + (debug:print-info "Running "sync-cmd) + (system sync-cmd) + (set! *sync-in-progress* #f))))))) (if (< (file-modification-time tmpdb) (file-modification-time dbfullname)) (debug:print 0 *default-log-port* "Skipping sync, "tmpdb" older than "dbfullname) - (if (file-exists? synclock-file) - (debug:print 0 *default-log-port* "Skipping sync, lockfile "synclock-file" found.") - (thread-start! (make-thread - (lambda () - (set! *sync-in-progress* #t) - (debug:print-info "Running "sync-cmd) - (system sync-cmd) - ;; (mutex-unlock! *db-with-db-mutex*) - ;; (thread-sleep! 0.5) ;; ensure at least 1/2 second down time between sync calls - (set! *sync-in-progress* #f)))))) - )))) + (if synclock-mod-time + (if (< (- (current-seconds) synclock-mod-time) 20) ;; something wrong with sync, remove file + (begin + (handle-exceptions + exn + #f + (delete-file synclock-file)) + (thethread)) + (debug:print 0 *default-log-port* "Skipping sync, lockfile "synclock-file" found.")) + (thethread))))))) ;; (dbmod:sync-tables tables #f db cachedb) ;; (if db (dbmod:sync-gasket tables #f cachedb db dbfullname 'fromdest keys) ;; ) ;; load into cachedb (dbr:dbstruct-last-update-set! dbstruct (current-seconds)) ;; should this be offset back in time by one second? dbstruct)) Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -162,11 +162,11 @@ pid: pid))) ;; verify we can talk to this server (let* ((result (tt:timed-ping host port server-id)) (ping-res (car result)) (ping (cdr result))) - (debug:print-info 0 *default-log-port* "ping time:" ping) + (debug:print-info 0 *default-log-port* "ping time: " ping) (case ping-res ((running) (hash-table-set! (tt-conns ttdat) dbfname conn) ;;; is this ok to save before validating that the connection is good? conn) ((starting) @@ -734,18 +734,22 @@ (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.") (assert mtexe "FATAL: tt:server-process-run called without mtexe defined.") ;; mtest -server - -m testsuite:ext-tests -db 6.db (let* ((dbfname (dbmod:run-id->dbfname run-id)) (load (get-normalized-cpu-load)) + (trying (length (tt:find-server areapath dbfname))) (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname)))) (cond ((> load 2.0) (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server.") (thread-sleep! 1)) ((> nrun 100) (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.") (thread-sleep! 1)) + ((> trying 4) + (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. not starting another.") + (thread-sleep! 1)) (else (if (not (file-exists? (conc areapath"/logs"))) (create-directory (conc areapath"/logs") #t)) (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log")) (cmdln (conc