Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -371,11 +371,13 @@ ((add-var) (apply db:add-var dbstruct params)) ((insert-run) (apply db:insert-run dbstruct params)) ;; STEPS - ((teststep-set-status!) (apply db:teststep-set-status! dbstruct params)) + ((teststep-set-status!) + ;; (apply db:teststep-set-status! dbstruct params)) + (db:add-cached-write dbstruct db:teststep-set-status! run-id params)) ((delete-steps-for-test!) (apply db:delete-steps-for-test! dbstruct params)) ;; TEST DATA ((test-data-rollup) (apply db:test-data-rollup dbstruct params)) ((csv->test-data) (apply db:csv->test-data dbstruct params)) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -2739,12 +2739,10 @@ "INSERT OR REPLACE into test_steps (test_id,stepname,state,status,event_time,comment,logfile) VALUES(?,?,?,?,?,?,?);" test-id teststep-name state-in status-in (current-seconds) (if comment comment "") (if logfile logfile ""))))) - - (define (db:delete-steps-for-test! dbstruct run-id test-id) ;; TODO: figure out why status is the key field rather than state (note: CONSTRAINT test_steps_constraint UNIQUE (test_id,stepname,state) ) (db:with-db dbstruct run-id @@ -4247,10 +4245,56 @@ (conc (current-directory) "/" outputfile))) results) ;; brutal clean up (dbfile:add-dbdat dbstruct #f dbdat) (system "rm -rf tempdir"))) + +;;====================================================================== +;; cached writes stuff +;;====================================================================== + +(define (db:add-cached-write dbstruct proc run-id params) + (debug:print 0 *default-log-port* "Adding cached write for run-id "run-id" params " params) + (mutex-lock! *cached-writes-mutex*) + (let* ((hkey (cons dbstruct run-id)) + (cached-writes-queue (hash-table-ref/default *cached-writes-queues* hkey '()))) + (hash-table-set! *cached-writes-queues* hkey (cons (list proc params) cached-writes-queue))) + (if (not *cached-writes-flag*) + (begin + (set! *cached-writes-flag* #t) + (thread-start! (make-thread + (lambda () + (debug:print 0 *default-log-port* "process cached writes thread started.") + (thread-sleep! 1) + (db:process-cached-writes-queue)))))) + (mutex-unlock! *cached-writes-mutex*)) + +(define (db:process-cached-writes-queue) + (mutex-lock! *cached-writes-mutex*) + (hash-table-for-each + *cached-writes-queues* + (lambda (hkey writes-list) + (let* ((dbstruct (car hkey)) + (run-id (cdr hkey))) + (debug:print 0 *default-log-port* "Processing "(length writes-list)" cached writes for run "run-id) + (db:with-db + dbstruct + run-id + #t + (lambda (dbdat db) + (sqlite3:with-transaction + db + (lambda () + (for-each + (lambda (queued-write) + (match queued-write + ((proc params)(apply proc dbstruct params)) + (else (assert #f "BAD queued-write")))) + writes-list))) + (hash-table-delete! *cached-writes-queues* hkey)))))) + (set! *cached-writes-flag* #f) + (mutex-unlock! *cached-writes-mutex*)) ;; (db:extract-ods-file db "outputfile.ods" '(("sysname" "%")("fsname" "%")("datapath" "%")) "%") ;;====================================================================== ;; moving watch dogs here due to dependencies Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -1579,7 +1579,17 @@ ;; (db:hoh-set! stmt-cache db stmt newstmth) (hash-table-set! stmt-cache stmt newstmth) newstmth)))) (mutex-unlock! *get-cache-stmth-mutex*) result)) + +;;====================================================================== +;; cached writes - run list of procs inside transaction +;; NOTE: this only works because we have once database per process +;;====================================================================== + +(define *cached-writes-mutex* (make-mutex)) +(define *cached-writes-flag* #f) +(define *cached-writes-queues* (make-hash-table)) ;; dbstruct->list of writes + ) Index: dbmod.scm ================================================================== --- dbmod.scm +++ dbmod.scm @@ -119,12 +119,14 @@ (loop (- count 1))) (begin (debug:print-info 0 *default-log-port* "dbmod:with-db, database is busy, giving up.") (exit 1)))) (exn () - (dbfile:print-err exn "ERROR: Unknown error with database for run-id "run-id", message: " - ((condition-property-accessor 'exn 'message) exn)) + (dbfile:print-err exn "ERROR: Unknown error with db for run-id " + run-id", message: " + ((condition-property-accessor 'exn 'message) exn) + ", details: "(condition->list exn)) (exit 2)))))) (if use-mutex (mutex-unlock! *db-with-db-mutex*)) res))) (define (db:with-db dbstruct run-id w/r proc . params) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -72,34 +72,11 @@ ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.") - - (if (not (eq? (rmt:transport-mode) 'nfs)) - (begin - (if (> attemptnum 2) - (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum)) - - (cond - ((> attemptnum 2) (thread-sleep! 0.05)) - ((> attemptnum 10) (thread-sleep! 0.5)) - ((> attemptnum 20) (thread-sleep! 1))) - - ;; I'm turning this off, it may make sense to move it - ;; into http-transport-handler - (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15))) - (begin - (debug:print 0 *default-log-port* "ERROR: can't connect to server, trying to start a server.") - (case (rmt:transport-mode) - ((http) - (server:run *toppath*) - (thread-sleep! 3)) - (else - (thread-sleep! 1) ;; for tcp the server is started by routines in tcp-transportmod. For nfs there is no server - )))))) - + ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -232,10 +232,11 @@ ;;(tt:handler # get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest") ;; (define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe) (debug:print 2 *default-log-port* "tt:handler cmd: " cmd " run-id: " run-id " attemptnum: " attemptnum) ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now. + ;; connect-to-server will start a server if needed. (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f))) (if conn ;; have connection, call the server (let* ((res (tt:send-receive ttdat conn cmd run-id params))) ;; res is (status errmsg result meta) @@ -260,11 +261,11 @@ (tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn)) result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) (else result))) (else ;; did not receive properly formated result - (if (not res) ;; tt:handler is telling us that communication failed + (if (not res) ;; tt:send-receive telling us that communication failed (let* ((host (tt-conn-host conn)) (port (tt-conn-port conn)) ;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db (pid (tt-conn-pid conn)) ;;(servinf (tt-conn-servinf-file conn))) @@ -293,11 +294,11 @@ ;; try again (thread-sleep! 0.25) ;; dunno, I think this needs to be here (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) )))) (begin ;; no server file, delay and try again - (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf) + (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", no servinf file. Server exited? ") (thread-sleep! 0.5) (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)))) (begin ;; this case is where res is malformed. Probably should abort (assert #f "FATAL: tt:handler received bad data "res) ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.") @@ -476,11 +477,12 @@ ;; is there already a server for this dbfile? Then exit. (debug:print 2 *default-log-port* "tt:start-server: " dbfname-in) (let* ((ttdat (make-tt areapath: areapath)) (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))) (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead - (if (> (length servers) 4) + (debug:print 0 *default-log-port* "Found " (length servers) " already running for " dbfname) + (if (> (length servers) 0) (begin (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.") (exit)) (let* ((dbstruct (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys))) (tt-handler-set! ttdat (handler dbstruct)) @@ -537,12 +539,11 @@ (dbfile:with-no-sync-db nosyncdbpath (lambda (db) (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct))) (debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname) (db:no-sync-del! db dbfname) - #;(if dbtmpname - (delete-file dbtmpname)))))))) + )))))) (set! *server-info* ttdat) (let loop ((count 0)) (if (> count 240) (begin (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.") @@ -585,10 +586,11 @@ (if (and res (common:low-noise-print 120 "top server message")) (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for " dbfname" on "(tt-host ttdat)":"(tt-port ttdat))) res)) (else + ;; wrong servinfo file (debug:print-info 0 *default-log-port* "I'm not the lead server: "servers) (let* ((leadsrv (car servers))) (match leadsrv ((host port startseconds server-id pid dbfname servinfofile) (let* ((result (tt:timed-ping host port server-id))