Overview
Comment: | Add back and improve waiting for journal plus few other performance helpers |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.70 |
Files: | files | file ages | folders |
SHA1: |
76fac9ef301954903e31234b7af1e9bc |
User & Date: | matt on 2022-09-18 18:12:21 |
Other Links: | branch diff | manifest | tags |
Context
2022-09-18
| ||
19:31 | Added brute-force-salvage-db back. It seems to work, at least sometimes. check-in: fb8df28768 user: matt tags: v1.70 | |
18:12 | Add back and improve waiting for journal plus few other performance helpers check-in: 76fac9ef30 user: matt tags: v1.70 | |
12:50 | Moved close-all-connections! to better spot. check-in: a847a95748 user: matt tags: v1.70 | |
Changes
Modified api.scm from [736048365d] to [3289b67f67].
︙ | ︙ | |||
151 152 153 154 155 156 157 | (db:open-no-sync-db) ;; sets *no-sync-db* ;; (handle-exceptions ;; exn ;; (let ((call-chain (get-call-chain))) ;; (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn) ;; (print-call-chain (current-error-port)) ;; (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) | | > > > > | | 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 | (db:open-no-sync-db) ;; sets *no-sync-db* ;; (handle-exceptions ;; exn ;; (let ((call-chain (get-call-chain))) ;; (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat ", exn=" exn) ;; (print-call-chain (current-error-port)) ;; (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) ;; (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens (if (> *api-process-request-count* 200) (begin (debug:print 0 *default-log-port* "WARNING: Over 200 threads, overload, taking a five second nap.") (thread-sleep! 5))) ;; take a nap (cond ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f (vector #f "remote must be called with a vector"))) #;((> *api-process-request-count* 200) ;; 20) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") (set! *server-overloaded* #t) (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! (else (let* ((cmd-in (vector-ref dat 0)) (cmd (if (symbol? cmd-in) cmd-in |
︙ | ︙ |
Modified db.scm from [974c310e18] to [131d871139].
︙ | ︙ | |||
655 656 657 658 659 660 661 | (tmp-area (common:get-db-tmp-area)) (dbfiles (glob (conc tmp-area"/.megatest/*.db"))) (sync-durations (make-hash-table)) (no-sync-db (db:open-no-sync-db))) (for-each (lambda (file) (debug:print-info 3 *default-log-port* "file: " file) | | | | | | | | | | | | > | > | | | < | > | < > | < > | | | | | < | | < | 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 | (tmp-area (common:get-db-tmp-area)) (dbfiles (glob (conc tmp-area"/.megatest/*.db"))) (sync-durations (make-hash-table)) (no-sync-db (db:open-no-sync-db))) (for-each (lambda (file) (debug:print-info 3 *default-log-port* "file: " file) (let* ((fname (conc (pathname-file file) ".db")) (fulln (conc *toppath*"/.megatest/"fname)) (time1 (if (file-exists? file) (file-modification-time file) (begin (debug:print-info 2 *default-log-port* "Sync - I do not see file "file) 1))) (time2 (if (file-exists? fulln) (file-modification-time fulln) (begin (debug:print-info 2 *default-log-port* "Sync - I do not see file "fulln) 0))) (changed (> (- time1 time2) (+ (random 5) 1))) ;; it has been at some few seconds since last synced (changed10 (> (- time1 time2) 10)) ;; it has been at least ten seconds since sync'd (jfile-exists (file-exists? (conc file"-journal"))) ;; i.e. are we busy? (do-cp (cond ((not (file-exists? fulln)) ;; shouldn't happen, but this might recover (cons #t (conc "File "fulln" not found! Copying "fname" to "fulln))) ((and (not jfile-exists) changed) (cons #t "not busy, changed")) ;; not busy and changed ((and jfile-exists changed10) (cons #t "busy but not synced in a while")) ;; busy but not sync'd in over 10 seconds ((and changed *time-to-exit*) (cons #t "Time to exit, forced final sync")) ;; last sync (else (cons #f "No sync needed"))))) (if (car do-cp) (let* ((start-time (current-milliseconds)) (fname (pathname-file file)) (runid (if (string= fname "main") #f (string->number fname)))) (debug:print-info 3 *default-log-port* "db:all-db-sync: fname: " fname", delta: " (- time1 time2) " seconds, reason: "(cdr do-cp)) (db:lock-and-delta-sync no-sync-db dbstruct fname runid (db:get-keys dbstruct) db:initialize-main-db) (hash-table-set! sync-durations (conc fname".db") (- (current-milliseconds) start-time))) (debug:print-info 3 *default-log-port* "skipping sync. " file " is up to date") ))) dbfiles) (if dbdat (dbfile:add-dbdat dbstruct #f dbdat))) #t) ;; options: ;; ;; 'killservers - kills all servers ;; 'dejunk - removes junk records |
︙ | ︙ | |||
1861 1862 1863 1864 1865 1866 1867 | (contour (or contour-in "")) ;; empty string to force no hierarcy and be backwards compatible. (comma (if (> (length keys) 0) "," "")) (andstr (if (> (length keys) 0) " AND " "")) (valslots (keys->valslots keys)) ;; ?,?,? ... (allvals (append (list runname state status user contour) (map cadr keyvals))) (qryvals (append (list runname) (map cadr keyvals))) (key=?str (string-intersperse (map (lambda (k)(conc k "=?")) keys) " AND "))) | | | > | > | 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 | (contour (or contour-in "")) ;; empty string to force no hierarcy and be backwards compatible. (comma (if (> (length keys) 0) "," "")) (andstr (if (> (length keys) 0) " AND " "")) (valslots (keys->valslots keys)) ;; ?,?,? ... (allvals (append (list runname state status user contour) (map cadr keyvals))) (qryvals (append (list runname) (map cadr keyvals))) (key=?str (string-intersperse (map (lambda (k)(conc k "=?")) keys) " AND "))) ;; (debug:print 0 *default-log-port* "Got here 0.") (debug:print 3 *default-log-port* "keys: " keys " allvals: " allvals " keyvals: " keyvals " key=?str is " key=?str) (debug:print 2 *default-log-port* "NOTE: using target " (string-intersperse (map cadr keyvals) "/") " for this run") (if (and runname (null? (filter (lambda (x)(not x)) keyvals))) ;; there must be a better way to "apply and" (db:with-db dbstruct #f #f (lambda (dbdat db) ;; (debug:print 0 *default-log-port* "Got here 1.") (let ((res #f)) (apply sqlite3:execute db (conc "INSERT OR IGNORE INTO runs (runname,state,status,owner,event_time,contour" comma keystr ") VALUES (?,?,?,?,strftime('%s','now'),?" comma valslots ");") allvals) (apply sqlite3:for-each-row (lambda (id) (set! res id)) db (let ((qry (conc "SELECT id FROM runs WHERE (runname=? " andstr key=?str ");"))) qry) |
︙ | ︙ |
Modified dbfile.scm from [769f7dede8] to [3ea5a94d80].
︙ | ︙ | |||
467 468 469 470 471 472 473 | (dbfile:print-err "INFO: db:sync-all-tables-list done.") ) (dbfile:print-err " db, " (dbr:dbdat-dbfile tmpdb) " already exists or fresh enough, not propogating data from\n " (dbr:dbdat-dbfile mtdb) " mod time delta: " modtimedelta) ) ;; (db:multi-db-sync subdb 'old2new)) ;; migrate data from megatest.db automatically tmpdb)) | | > > > > > > > > > > > > > > | 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 | (dbfile:print-err "INFO: db:sync-all-tables-list done.") ) (dbfile:print-err " db, " (dbr:dbdat-dbfile tmpdb) " already exists or fresh enough, not propogating data from\n " (dbr:dbdat-dbfile mtdb) " mod time delta: " modtimedelta) ) ;; (db:multi-db-sync subdb 'old2new)) ;; migrate data from megatest.db automatically tmpdb)) (define (dbfile:cautious-open-database fname init-proc #!optional (tries-left 500)) (let* ((busy-file (conc fname"-journal")) (delay-time (* (- 51 tries-left) 1.1)) (write-access (file-write-access? fname)) (dir-access (file-write-access? (pathname-directory fname))) (retry (lambda () (thread-sleep! delay-time) (if (> tries-left 0) (dbfile:cautious-open-database fname init-proc (- tries-left 1)))))) (assert (>= tries-left 0) (conc "FATAL: too many attempts in dbfile:cautious-open-database of "fname", giving up.")) (if (and (file-write-access? fname) (file-exists? busy-file)) (begin (if (common:low-noise-print 120 busy-file) (dbfile:print-err "INFO: dbfile:cautious-open-database: journal file " busy-file" exists, trying again in few seconds.")) (thread-sleep! 1) ;; (if (eq? tries-left 2) ;; (begin ;; (dbfile:print-err "INFO: forcing journal rollup "busy-file) ;; (dbfile:brute-force-salvage-db fname))) (dbfile:cautious-open-database fname init-proc (- tries-left 1))) (let* ((result (condition-case (if dir-access (dbfile:with-simple-file-lock (conc fname ".lock") (lambda () (let* ((db-exists (file-exists? fname)) |
︙ | ︙ | |||
515 516 517 518 519 520 521 | (retry)) (exn (permission)(dbfile:print-err exn "ERROR: database " fname " has some permissions problem.") (retry)) (exn () (dbfile:print-err exn "ERROR: Unknown error with database " fname " message: " ((condition-property-accessor 'exn 'message) exn)) (retry))))) | | | 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 | (retry)) (exn (permission)(dbfile:print-err exn "ERROR: database " fname " has some permissions problem.") (retry)) (exn () (dbfile:print-err exn "ERROR: Unknown error with database " fname " message: " ((condition-property-accessor 'exn 'message) exn)) (retry))))) result)))) (define (dbfile:brute-force-salvage-db fname) (let* ((backupfname (conc fname"-"(current-process-id)".bak")) (cmd (conc "cp "fname" "backupfname";mv "fname" "(conc fname ".delme;") "cp "backupfname" "fname))) (dbfile:print-err "WARNING: attempting recovery of file "fname" by running commands:\n" " "cmd) |
︙ | ︙ | |||
1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 | #f)) (db (if have-struct ;; this stuff just allows us to call with a db handle directly (dbr:dbdat-dbh dbdat) dbstruct)) (fname (if dbdat (dbr:dbdat-dbfile dbdat) "nofilenameavailable")) #;(subdb (if have-struct (dbfile:get-subdb dbstruct run-id) #f)) (use-mutex (> *api-process-request-count* 25))) ;; was 25 (if (and use-mutex (common:low-noise-print 120 "over-50-parallel-api-requests")) | > > > > > | > | | 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 | #f)) (db (if have-struct ;; this stuff just allows us to call with a db handle directly (dbr:dbdat-dbh dbdat) dbstruct)) (fname (if dbdat (dbr:dbdat-dbfile dbdat) "nofilenameavailable")) (jfile (conc fname"-journal")) #;(subdb (if have-struct (dbfile:get-subdb dbstruct run-id) #f)) (use-mutex (> *api-process-request-count* 25))) ;; was 25 (if (file-exists? jfile) (begin (dbfile:print-err "INFO: "jfile" exists, delaying few seconds to reduce database load") (thread-sleep! 2))) (if (and use-mutex (common:low-noise-print 120 "over-50-parallel-api-requests")) (dbfile:print-err *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access")) (condition-case (begin (if use-mutex (mutex-lock! *db-with-db-mutex*)) (let ((res (apply proc dbdat db params))) ;; the actual call is here. (if use-mutex (mutex-unlock! *db-with-db-mutex*)) ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w)) (if dbdat (dbfile:add-dbdat dbstruct run-id dbdat)) res)) (exn (io-error) (db:generic-error-printout exn "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again.")) |
︙ | ︙ |
Added utils/open-files.sh version [61001a23e2].
> > > | 1 2 3 | echo "Database opens: $(lsof -c mtest|egrep '.*db$'|wc -l)" echo "Logfile opens: $(lsof -c mtest|egrep '.*log$'|wc -l)" echo "TCP connections: $(lsof -c mtest|grep TCP|wc -l)" |