Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -153,15 +153,19 @@ ;; exn ;; (let ((call-chain (get-call-chain))) ;; (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn) ;; (print-call-chain (current-error-port)) ;; (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) -;; (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens + ;; (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens + (if (> *api-process-request-count* 200) + (begin + (debug:print 0 *default-log-port* "WARNING: Over 200 threads, overload, taking a five second nap.") + (thread-sleep! 5))) ;; take a nap (cond ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f (vector #f "remote must be called with a vector"))) - ((> *api-process-request-count* 200) ;; 20) + #;((> *api-process-request-count* 200) ;; 20) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") (set! *server-overloaded* #t) (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! (else (let* ((cmd-in (vector-ref dat 0)) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -657,48 +657,48 @@ (sync-durations (make-hash-table)) (no-sync-db (db:open-no-sync-db))) (for-each (lambda (file) (debug:print-info 3 *default-log-port* "file: " file) - (let* ((fname (conc (pathname-file file) ".db")) - (fulln (conc *toppath*"/.megatest/"fname)) - (time1 (if (file-exists? file) - (file-modification-time file) - (begin - (debug:print-info 2 *default-log-port* "Sync - I do not see file "file) + (let* ((fname (conc (pathname-file file) ".db")) + (fulln (conc *toppath*"/.megatest/"fname)) + (time1 (if (file-exists? file) + (file-modification-time file) + (begin + (debug:print-info 2 *default-log-port* "Sync - I do not see file "file) 1))) - (time2 (if (file-exists? fulln) - (file-modification-time fulln) - (begin - (debug:print-info 2 *default-log-port* "Sync - I do not see file "fulln) - 0))) - (changed (> time1 time2)) - (do-cp (cond - ((not (file-exists? fulln)) ;; shouldn't happen, but this might recover - (debug:print-info 2 *default-log-port* "File "fulln" not found! Copying "fname" to "fulln) - #t) - (changed ;; (and changed - ;; (> (- (current-seconds) time1) 3)) ;; if file is changed and three seconds have passed. - #t) - ((and changed *time-to-exit*) ;; last sync - #t) - (else - #f)))) - (if do-cp + (time2 (if (file-exists? fulln) + (file-modification-time fulln) + (begin + (debug:print-info 2 *default-log-port* "Sync - I do not see file "fulln) + 0))) + (changed (> (- time1 time2) (+ (random 5) 1))) ;; it has been at some few seconds since last synced + (changed10 (> (- time1 time2) 10)) ;; it has been at least ten seconds since sync'd + (jfile-exists (file-exists? (conc file"-journal"))) ;; i.e. are we busy? + (do-cp (cond + ((not (file-exists? fulln)) ;; shouldn't happen, but this might recover + (cons #t (conc "File "fulln" not found! Copying "fname" to "fulln))) + ((and (not jfile-exists) changed) + (cons #t "not busy, changed")) ;; not busy and changed + ((and jfile-exists changed10) + (cons #t "busy but not synced in a while")) ;; busy but not sync'd in over 10 seconds + ((and changed *time-to-exit*) + (cons #t "Time to exit, forced final sync")) ;; last sync + (else + (cons #f "No sync needed"))))) + (if (car do-cp) (let* ((start-time (current-milliseconds)) - (fname (pathname-file file)) - (runid (if (string= fname "main") #f (string->number fname))) - ) + (fname (pathname-file file)) + (runid (if (string= fname "main") #f (string->number fname)))) (debug:print-info 3 *default-log-port* "db:all-db-sync: fname: " - fname", delta: " (- time1 time2) " seconds") + fname", delta: " (- time1 time2) " seconds, reason: "(cdr do-cp)) (db:lock-and-delta-sync no-sync-db dbstruct fname runid (db:get-keys dbstruct) db:initialize-main-db) (hash-table-set! sync-durations (conc fname".db") (- (current-milliseconds) start-time))) (debug:print-info 3 *default-log-port* "skipping sync. " file " is up to date") ))) - dbfiles - ) + dbfiles) (if dbdat (dbfile:add-dbdat dbstruct #f dbdat))) #t) ;; options: ;; @@ -1863,20 +1863,22 @@ (andstr (if (> (length keys) 0) " AND " "")) (valslots (keys->valslots keys)) ;; ?,?,? ... (allvals (append (list runname state status user contour) (map cadr keyvals))) (qryvals (append (list runname) (map cadr keyvals))) (key=?str (string-intersperse (map (lambda (k)(conc k "=?")) keys) " AND "))) - (debug:print 0 *default-log-port* "Got here 0.") + ;; (debug:print 0 *default-log-port* "Got here 0.") (debug:print 3 *default-log-port* "keys: " keys " allvals: " allvals " keyvals: " keyvals " key=?str is " key=?str) (debug:print 2 *default-log-port* "NOTE: using target " (string-intersperse (map cadr keyvals) "/") " for this run") (if (and runname (null? (filter (lambda (x)(not x)) keyvals))) ;; there must be a better way to "apply and" (db:with-db dbstruct #f #f (lambda (dbdat db) - (debug:print 0 *default-log-port* "Got here 1.") + ;; (debug:print 0 *default-log-port* "Got here 1.") (let ((res #f)) - (apply sqlite3:execute db (conc "INSERT OR IGNORE INTO runs (runname,state,status,owner,event_time,contour" comma keystr ") VALUES (?,?,?,?,strftime('%s','now'),?" comma valslots ");") + (apply sqlite3:execute db + (conc "INSERT OR IGNORE INTO runs (runname,state,status,owner,event_time,contour" + comma keystr ") VALUES (?,?,?,?,strftime('%s','now'),?" comma valslots ");") allvals) (apply sqlite3:for-each-row (lambda (id) (set! res id)) db Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -469,20 +469,34 @@ (dbfile:print-err " db, " (dbr:dbdat-dbfile tmpdb) " already exists or fresh enough, not propogating data from\n " (dbr:dbdat-dbfile mtdb) " mod time delta: " modtimedelta) ) ;; (db:multi-db-sync subdb 'old2new)) ;; migrate data from megatest.db automatically tmpdb)) -(define (dbfile:cautious-open-database fname init-proc #!optional (tries-left 50)) +(define (dbfile:cautious-open-database fname init-proc #!optional (tries-left 500)) (let* ((busy-file (conc fname"-journal")) (delay-time (* (- 51 tries-left) 1.1)) (write-access (file-write-access? fname)) (dir-access (file-write-access? (pathname-directory fname))) (retry (lambda () (thread-sleep! delay-time) (if (> tries-left 0) (dbfile:cautious-open-database fname init-proc (- tries-left 1)))))) + (assert (>= tries-left 0) (conc "FATAL: too many attempts in dbfile:cautious-open-database of "fname", giving up.")) + + (if (and (file-write-access? fname) + (file-exists? busy-file)) + (begin + (if (common:low-noise-print 120 busy-file) + (dbfile:print-err "INFO: dbfile:cautious-open-database: journal file " + busy-file" exists, trying again in few seconds.")) + (thread-sleep! 1) + ;; (if (eq? tries-left 2) + ;; (begin + ;; (dbfile:print-err "INFO: forcing journal rollup "busy-file) + ;; (dbfile:brute-force-salvage-db fname))) + (dbfile:cautious-open-database fname init-proc (- tries-left 1))) (let* ((result (condition-case (if dir-access (dbfile:with-simple-file-lock (conc fname ".lock") @@ -517,11 +531,11 @@ (retry)) (exn () (dbfile:print-err exn "ERROR: Unknown error with database " fname " message: " ((condition-property-accessor 'exn 'message) exn)) (retry))))) - result))) + result)))) (define (dbfile:brute-force-salvage-db fname) (let* ((backupfname (conc fname"-"(current-process-id)".bak")) (cmd (conc "cp "fname" "backupfname";mv "fname" "(conc fname ".delme;") "cp "backupfname" "fname))) @@ -1171,21 +1185,27 @@ (dbr:dbdat-dbh dbdat) dbstruct)) (fname (if dbdat (dbr:dbdat-dbfile dbdat) "nofilenameavailable")) + (jfile (conc fname"-journal")) #;(subdb (if have-struct (dbfile:get-subdb dbstruct run-id) #f)) (use-mutex (> *api-process-request-count* 25))) ;; was 25 + (if (file-exists? jfile) + (begin + (dbfile:print-err "INFO: "jfile" exists, delaying few seconds to reduce database load") + (thread-sleep! 2))) (if (and use-mutex (common:low-noise-print 120 "over-50-parallel-api-requests")) - (dbfile:print-err *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access")) + (dbfile:print-err *api-process-request-count* " parallel api requests being processed in process " + (current-process-id) ", throttling access")) (condition-case (begin (if use-mutex (mutex-lock! *db-with-db-mutex*)) - (let ((res (apply proc dbdat db params))) + (let ((res (apply proc dbdat db params))) ;; the actual call is here. (if use-mutex (mutex-unlock! *db-with-db-mutex*)) ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w)) (if dbdat (dbfile:add-dbdat dbstruct run-id dbdat)) res)) ADDED utils/open-files.sh Index: utils/open-files.sh ================================================================== --- /dev/null +++ utils/open-files.sh @@ -0,0 +1,3 @@ +echo "Database opens: $(lsof -c mtest|egrep '.*db$'|wc -l)" +echo "Logfile opens: $(lsof -c mtest|egrep '.*log$'|wc -l)" +echo "TCP connections: $(lsof -c mtest|grep TCP|wc -l)"