Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -36,31 +36,30 @@ subrun.scm archive.scm env.scm \ diff-report.scm cgisetup/models/pgdb.scm # module source files MSRCFILES = dbfile.scm debugprint.scm mtargs.scm commonmod.scm dbmod.scm \ - tcp-transportmod.scm rmtmod.scm portlogger.scm + tcp-transportmod.scm rmtmod.scm portlogger.scm apimod.scm transport-mode.scm : transport-mode.scm.template - @if [[ -e transport-mode.scm ]];then \ - echo "WARNING: transport-mode.scm.template is newer than transport-mode.scm"; else \ - cp transport-mode.scm.template transport-mode.scm; fi + cp transport-mode.scm.template transport-mode.scm dashboard-transport-mode.scm : dashboard-transport-mode.scm.template - @if [[ -e dashboard-transport-mode.scm ]];then \ - echo "WARNING: dashboard-transport-mode.scm.template is newer than dashboard-transport-mode.scm"; else \ - cp dashboard-transport-mode.scm.template dashboard-transport-mode.scm; fi + cp dashboard-transport-mode.scm.template dashboard-transport-mode.scm -megatest.scm : transport-mode.scm -dashboard.scm : dashboard-transport-mode.scm +mtest : transport-mode.scm +dboard : dashboard-transport-mode.scm # dbmod.import.o is just a hack here mofiles/portlogger.o : mofiles/dbmod.o mofiles/dbfile.o : \ - mofiles/debugprint.o mofiles/commonmod.o - + mofiles/debugprint.o mofiles/commonmod.o +mofiles/apimod.o : mofiles/commonmod.o mofiles/tcp-transportmod.o +mofiles/dbmod.o : mofiles/dbfile.o +mofiles/api.o : mofiles/apimod.o +mofiles/commonmod.o : mofiles/debugprint.o configf.o : commonmod.import.o mofiles/dbfile.o : mofiles/debugprint.o mofiles/rmtmod.o mofiles/dbmod.o : mofiles/dbfile.o mofiles/commonmod.o mofiles/debugprint.o db.o : mofiles/dbmod.o mofiles/dbfile.o mofiles/debugprint.o : mofiles/mtargs.o Index: TODO ================================================================== --- TODO +++ TODO @@ -16,10 +16,51 @@ # along with Megatest. If not, see . 
TODO ==== +23WW48 +. Add calls-per-minute to db access stats +. Find out why start-server calls are taking 250ms and fix +. Allow two or three servers to run for any given db +. Update avg call count/sec every 30 sec in no-sync +. get server uses no-sync process info to decide which server to suggest +. Use process table to decide who will do sync back +. Fix metadat being synced over and over + +23WW47 +. Finding server +.. look at .servinfo for likely prime main +.. ask the .servinfo prime main for real prime main +.. save prime main (for how long, 10 seconds or 10 minutes?) + +. Starting prime main +.. get servinfo files - START +.. no files? create my servinfo file, goto START +.. have files? am I the prime main according to servinfo files? +.. no, I'm not the prime main, ping prime main +.. ping is good, prime main exists, register self as server if on same host as prime main DONE +.. no ping response, remove the .servinfo file - goto START +.. if I am prime main according to .servinfo files, register directly in no-sync + +. Starting non-main +.. get servinfo files +.. no files? launch server for main.db +.. have files? pick out prime main +.. register self as server with prime main + +23WW46 - v1.80 branch +. Use file semaphore to kill tests, eliminate db load of the KILLREQ query +. Merge this change to revolution branch +23WW45 - the revolution branch +. Add "fast" db start option (no handshaking over NFS) +. Add server-ro to server types (just "server" is fine for read/write). +. [DONE] Create pause-server and resume-server calls +. Create rsync or cp sync to MTRAH function +. Change rmt:send-receive to divert calls to read-only server when possible +. [DONE] Change start server to call main.db server for 1..N.db servers, block until server is ready for use. + 23WW21 . Dashboard needs its own cache db in /tmp 23WW07 . 
Remove use of *dbstruct-dbs* Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -18,317 +18,146 @@ ;; ;;====================================================================== (declare (unit api)) (declare (uses db)) +(declare (uses apimod)) + (declare (uses debugprint)) (declare (uses commonmod)) (declare (uses dbmod)) (declare (uses dbfile)) (declare (uses tasks)) (declare (uses tcp-transportmod)) (import commonmod) +(import apimod) (import dbmod) (import dbfile) (import debugprint) (import tcp-transportmod) (use srfi-69 srfi-18 posix matchable - s11n) - -;; allow these queries through without starting a server -;; -(define api:read-only-queries - '(get-key-val-pairs - get-var - get-keys - get-key-vals - test-toplevel-num-items - get-test-info-by-id - get-test-state-status-by-id - get-steps-info-by-id - get-data-info-by-id - test-get-rundir-from-test-id - get-count-tests-running-for-testname - get-count-tests-running - get-count-tests-running-in-jobgroup - get-previous-test-run-record - get-matching-previous-test-run-records - test-get-logfile-info - test-get-records-for-index-file - get-testinfo-state-status - test-get-top-process-pid - test-get-paths-matching-keynames-target-new - get-prereqs-not-met - get-count-tests-running-for-run-id - get-run-info - get-run-status - get-run-state - get-run-stats - get-run-times - get-target - get-targets - ;; register-run - get-tests-tags - get-test-times - get-tests-for-run - get-tests-for-run-state-status - get-test-id - get-tests-for-runs-mindata - get-tests-for-run-mindata - get-run-name-from-id - get-runs - simple-get-runs - get-num-runs - get-runs-cnt-by-patt - get-all-run-ids - get-prev-run-ids - get-run-ids-matching-target - get-runs-by-patt - get-steps-data - get-steps-for-test - read-test-data - read-test-data-varpatt - login - tasks-get-last - testmeta-get-record - have-incompletes? 
- get-changed-record-ids - get-all-runids - get-changed-record-test-ids - get-changed-record-run-ids - get-run-record-ids - get-not-completed-cnt)) - -(define api:write-queries - '( - get-keys-write ;; dummy "write" query to force server start - - ;; SERVERS - ;; start-server - ;; kill-server - - ;; TESTS - test-set-state-status-by-id - delete-test-records - delete-old-deleted-test-records - test-set-state-status - test-set-top-process-pid - set-state-status-and-roll-up-items - - update-pass-fail-counts - top-test-set-per-pf-counts ;; (db:top-test-set-per-pf-counts (db:get-db *db* 5) 5 "runfirst") - - ;; RUNS - register-run - set-tests-state-status - delete-run - lock/unlock-run - update-run-event_time - mark-incomplete - set-state-status-and-roll-up-run - ;; STEPS - teststep-set-status! - delete-steps-for-test - ;; TEST DATA - test-data-rollup - csv->test-data - - ;; MISC - sync-cachedb->db - drop-all-triggers - create-all-triggers - update-tesdata-on-repilcate-db - - ;; TESTMETA - testmeta-add-record - testmeta-update-field - - ;; TASKS - tasks-add - tasks-set-state-given-param-key - )) - -(define *db-write-mutexes* (make-hash-table)) -(define *server-signature* #f) -;; ;; These are called by the server on recipt of /api calls -;; ;; - keep it simple, only return the actual result of the call, i.e. no meta info here -;; ;; -;; ;; - returns #( flag result ) -;; ;; -;; (define (api:execute-requests dbstruct dat) -;; (if (> *api-process-request-count* 50) -;; (begin -;; (if (common:low-noise-print 30 "too many threads") -;; (debug:print 0 *default-log-port* "WARNING: "*api-process-request-count*" threads, potential overload, adding 0.5 sec delay.")) -;; ;; (thread-sleep! 0.5) ;; take a nap - no, the napping is moved to the clients via tt:backoff-incr -;; )) -;; (cond -;; ((not (vector? 
dat)) ;; it is an error to not receive a vector -;; (vector #f (vector #f "remote must be called with a vector"))) -;; (else -;; (let* ((cmd-in (vector-ref dat 0)) -;; (cmd (if (symbol? cmd-in) -;; cmd-in -;; (string->symbol cmd-in))) -;; (params (vector-ref dat 1)) -;; (run-id (if (null? params) -;; 0 -;; (car params))) -;; (write-mutex (if (hash-table-exists? *db-write-mutexes* run-id) -;; (hash-table-ref *db-write-mutexes* run-id) -;; (let* ((newmutex (make-mutex))) -;; (hash-table-set! *db-write-mutexes* run-id newmutex) -;; newmutex))) -;; (start-t (current-milliseconds)) -;; (readonly-mode (dbr:dbstruct-read-only dbstruct)) -;; (readonly-command (member cmd api:read-only-queries)) -;; (writecmd-in-readonly-mode (and readonly-mode (not readonly-command)))) -;; (if (not readonly-command) -;; (mutex-lock! write-mutex)) -;; (let* ((tmppath (dbr:dbstruct-tmppath dbstruct)) -;; (clean-run-id (cond -;; ((number? run-id) run-id) -;; ((equal? run-id #f) "main") -;; (else "other"))) -;; (crumbfile (dbfile:wait-for-qif tmppath clean-run-id (cons cmd params))) -;; (res -;; (if writecmd-in-readonly-mode -;; (conc "attempt to run write command "cmd" on a read-only database") -;; (api:dispatch-request dbstruct cmd run-id params)))) -;; (delete-file* crumbfile) -;; (if (not readonly-command) -;; (mutex-unlock! write-mutex)) -;; -;; ;; save all stats -;; (let ((delta-t (- (current-milliseconds) -;; start-t)) -;; (modified-cmd (if (eq? cmd 'general-call) -;; (string->symbol (conc "general-call-" (car params))) -;; cmd))) -;; (hash-table-set! *db-api-call-time* modified-cmd -;; (cons delta-t (hash-table-ref/default *db-api-call-time* modified-cmd '())))) -;; (if writecmd-in-readonly-mode -;; (begin -;; #;(common:telemetry-log (conc "api-out:"(->string cmd)) -;; payload: `((params . ,params) -;; (ok-res . #t))) -;; (vector #f res)) -;; (begin -;; #;(common:telemetry-log (conc "api-out:"(->string cmd)) -;; payload: `((params . ,params) -;; (ok-res . 
#f))) -;; (vector #t res)))))))) - -(define *api-threads* '()) -(define (api:register-thread th-in) - (set! *api-threads* (cons (cons th-in (current-seconds)) *api-threads*))) - -(define (api:unregister-thread th-in) - (set! *api-threads* (filter (lambda (thdat) - (not (eq? th-in (car thdat)))) - *api-threads*))) - -(define (api:remove-dead-or-terminated) - (set! *api-threads* (filter (lambda (thdat) - (not (member (thread-state (car thdat)) '(terminated dead)))) - *api-threads*))) - -(define (api:get-count-threads-alive) - (length *api-threads*)) - + s11n + typed-records) + + +;; QUEUE METHOD + +(define (api:tcp-dispatch-request-make-handler-new dbstruct) ;; cmd run-id params) + (api:tcp-dispatch-request-make-handler-core dbstruct api:dispatch-request)) + ;; indat is (cmd run-id params meta) ;; ;; WARNING: Do not print anything in the lambda of this function as it ;; reads/writes to current in/out port ;; -(define (api:tcp-dispatch-request-make-handler dbstruct) ;; cmd run-id params) +(define (api:tcp-dispatch-request-make-handler-old dbstruct) ;; cmd run-id params) (assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.") (if (not *server-signature*) (set! *server-signature* (tt:mk-signature *toppath*))) (lambda (indat) (api:register-thread (current-thread)) - (let* (;; (indat (deserialize)) - (newcount (+ *api-process-request-count* 1)) - (numthreads (api:get-count-threads-alive)) - (delay-wait (if (> newcount 10) - (- newcount 10) - 0)) - (normal-proc (lambda (cmd run-id params) - (case cmd - ((ping) *server-signature*) - (else - (api:dispatch-request dbstruct cmd run-id params)))))) - (set! *api-process-request-count* newcount) - (set! *db-last-access* (current-seconds)) - (if (not (eq? 
newcount numthreads)) - (begin - (api:remove-dead-or-terminated) - (let ((threads-now (api:get-count-threads-alive))) - (debug:print 0 *default-log-port* "WARNING: newcount="newcount", numthreads="numthreads", remaining="threads-now) - (set! newcount threads-now)))) - (match indat - ((cmd run-id params meta) - (let* ((db-ok (let* ((dbfname (dbmod:run-id->dbfname run-id)) - (ok (equal? dbfname (dbr:dbstruct-dbfname dbstruct)))) - (case cmd - ((ping) #t) ;; we are fine - (else - (if (not ok)(debug:print 0 *default-log-port* "ERROR: "cmd", run-id "run-id", not correct for dbfname "(dbr:dbstruct-dbfname dbstruct))) - (assert ok "FATAL: database file and run-id not aligned."))))) - (ttdat *server-info*) - (server-state (tt-state ttdat)) - (status (cond - ((> newcount 3) 'busy) - ;; ((> newcount 5) 'loaded) ;; this gets transmitted to the client which calls tt:backoff-incr to slow stuff down. - (else 'ok))) - (errmsg (case status - ((busy) (conc "Server overloaded, "newcount" threads in flight")) - ((loaded) (conc "Server loaded, "newcount" threads in flight")) - (else #f))) - (result (case status - ((busy) - (if (eq? cmd 'ping) - (normal-proc cmd run-id params) - ;; newcount must be greater than 5 for busy - (* 1 (- newcount 3)) ;; was 15 - )) ;; (- newcount 29)) ;; call back in as many seconds - ((loaded) -;; (if (eq? (rmt:transport-mode) 'tcp) -;; (thread-sleep! 0.5)) - (normal-proc cmd run-id params)) - (else - (normal-proc cmd run-id params)))) - (meta (case cmd - ((ping) `((sstate . ,server-state))) - (else `((wait . ,delay-wait))))) - (payload (list status errmsg result meta))) - (set! 
*api-process-request-count* (- *api-process-request-count* 1)) - ;; (serialize payload) - (api:unregister-thread (current-thread)) - payload)) - (else - (assert #f "FATAL: failed to deserialize indat "indat)))))) - + (let* ((result + (let* ((numthreads (api:get-count-threads-alive)) + (delay-wait (if (> numthreads 10) + (- numthreads 10) + 0)) + (normal-proc (lambda (cmd run-id params) + (case cmd + ((ping) *server-signature*) + (else + (api:dispatch-request dbstruct cmd run-id params)))))) + (set! *api-process-request-count* numthreads) + (set! *db-last-access* (current-seconds)) +;; (if (not (eq? numthreads numthreads)) +;; (begin +;; (api:remove-dead-or-terminated) +;; (let ((threads-now (api:get-count-threads-alive))) +;; (debug:print 0 *default-log-port* "WARNING: numthreads="numthreads", numthreads="numthreads", remaining="threads-now) +;; (set! numthreads threads-now)))) + (match indat + ((cmd run-id params meta) + (let* ((start-t (current-milliseconds)) + (db-ok (let* ((dbfname (dbmod:run-id->dbfname run-id)) + (ok (equal? dbfname (dbr:dbstruct-dbfname dbstruct)))) + (case cmd + ((ping) #t) ;; we are fine + (else + (assert ok "FATAL: database file and run-id not aligned."))))) + (ttdat *server-info*) + (server-state (tt-state ttdat)) + (maxthreads 20) ;; make this a parameter? + (status (cond + ((and (> numthreads maxthreads) + (> (random 100) 70)) ;; allow a 30% probability to go through so we can figure out what is going wrong in main.db server. + 'busy) + ;; ((> numthreads 5) 'loaded) ;; this gets transmitted to the client which calls tt:backoff-incr to slow stuff down. + (else 'ok))) + (errmsg (case status + ((busy) (conc "Server overloaded, "numthreads" threads in flight")) + ((loaded) (conc "Server loaded, "numthreads" threads in flight")) + (else #f))) + (result (case status + ((busy) + (if (eq? 
cmd 'ping) + (normal-proc cmd run-id params) + ;; numthreads must be greater than 5 for busy + (* 0.1 (- numthreads maxthreads)) ;; was 15 - return a number for the remote to delay + )) ;; (- numthreads 29)) ;; call back in as many seconds + ((loaded) + ;; (if (eq? (rmt:transport-mode) 'tcp) + ;; (thread-sleep! 0.5)) + (normal-proc cmd run-id params)) + (else + (normal-proc cmd run-id params)))) + (meta (case cmd + ((ping) `((sstate . ,server-state))) + (else `((wait . ,delay-wait))))) + (payload (list status errmsg result meta))) + ;; (cmd run-id params meta) + (db:add-stats cmd run-id params (- (current-milliseconds) start-t)) + payload)) + (else + (assert #f "FATAL: failed to deserialize indat "indat)))))) + ;; (set! *api-process-request-count* (- *api-process-request-count* 1)) + ;; (serialize payload) + + (api:unregister-thread (current-thread)) + result))) + +(define api:tcp-dispatch-request-make-handler api:tcp-dispatch-request-make-handler-old) ;; choose -old or -new + +(define *api-halt-writes* #f) (define (api:dispatch-request dbstruct cmd run-id params) (if (not *no-sync-db*) (db:open-no-sync-db)) + (let* ((start-time (current-milliseconds))) + (if (member cmd api:write-queries) + (let loop () + (if *api-halt-writes* + (begin + (thread-sleep! 
0.2) + (if (< (- (current-milliseconds) start-time) + 5000) ;; hope it don't take more than five seconds to sync + (loop-time) + #;(debug:print 0 *default-log-port* "ERROR: writes halted for more than 5 seconds, sync might be taking too long")))))) + (db:add-stats 'api-write-blocking-for-sync run-id params (- (current-milliseconds) start-time))) (case cmd ;;=============================================== ;; READ/WRITE QUERIES ;;=============================================== ((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl ;; SERVERS - ((start-server) (apply server:kind-run params)) + ((start-server) (apply tt:server-process-run params)) ((kill-server) (set! *server-run* #f)) ;; TESTS ;;((test-set-state-status-by-id) (apply mt:test-set-state-status-by-id dbstruct params)) @@ -513,43 +342,5 @@ ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)) (else (debug:print 0 *default-log-port* "ERROR: bad api call " cmd) (conc "ERROR: BAD api call " cmd)))) -;; http-server send-response -;; api:process-request -;; db:* -;; -;; NB// Runs on the server as part of the server loop -;; -(define (api:process-request dbstruct $) ;; the $ is the request vars proc - (debug:print 4 *default-log-port* "server-id:" *server-id*) - (let* ((cmd ($ 'cmd)) - (paramsj ($ 'params)) - (key ($ 'key)) - (params (db:string->obj paramsj transport: 'http))) ;; incoming data from the POST (or is it a GET?) - (debug:print 4 *default-log-port* "cmd:" cmd " with params " params "key " key) - (if (equal? key *server-id*) - (begin - (set! *api-process-request-count* (+ *api-process-request-count* 1)) - (let* ((resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) - (success (vector-ref resdat 0)) - (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?) 
- (debug:print 4 *default-log-port* "res:" res) - (if (not success) - (debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params)) - (if (> *api-process-request-count* *max-api-process-requests*) - (set! *max-api-process-requests* *api-process-request-count*)) - (set! *api-process-request-count* (- *api-process-request-count* 1)) - ;; This can be here but needs controls to ensure it doesn't run more than every 4 seconds - ;; (rmt:dat->json-str - ;; (if (or (string? res) - ;; (list? res) - ;; (number? res) - ;; (boolean? res)) - ;; res - ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res))))) - (db:obj->string res transport: 'http))) - (begin - (debug:print 0 *default-log-port* "Server refused to process request. Server id mismatch. recived " key " expected: " *server-id* ".\nOther arguments recived: cmd=" cmd " params = " params) - (db:obj->string (conc "Server refused to process request server-id mismatch: " key ", " *server-id*) transport: 'http))))) - Index: apimod.scm ================================================================== --- apimod.scm +++ apimod.scm @@ -18,15 +18,315 @@ ;;====================================================================== (declare (unit apimod)) (declare (uses commonmod)) +(declare (uses debugprint)) +(declare (uses dbmod)) +(declare (uses dbfile)) +(declare (uses tcp-transportmod)) (module apimod * (import scheme chicken data-structures extras) -(import (prefix sqlite3 sqlite3:) posix typed-records srfi-18) +(import (prefix sqlite3 sqlite3:) posix matchable typed-records srfi-1 srfi-18 srfi-69 ) (import commonmod) +(import debugprint) +(import dbmod) +(import dbfile) +(import tcp-transportmod) + +;; allow these queries through without starting a server +;; +(define api:read-only-queries + '(get-key-val-pairs + get-var + get-keys + get-key-vals + test-toplevel-num-items + get-test-info-by-id + get-test-state-status-by-id + get-steps-info-by-id + get-data-info-by-id + 
test-get-rundir-from-test-id + get-count-tests-running-for-testname + get-count-tests-running + get-count-tests-running-in-jobgroup + get-previous-test-run-record + get-matching-previous-test-run-records + test-get-logfile-info + test-get-records-for-index-file + get-testinfo-state-status + test-get-top-process-pid + test-get-paths-matching-keynames-target-new + get-prereqs-not-met + get-count-tests-running-for-run-id + get-run-info + get-run-status + get-run-state + get-run-stats + get-run-times + get-target + get-targets + ;; register-run + get-tests-tags + get-test-times + get-tests-for-run + get-tests-for-run-state-status + get-test-id + get-tests-for-runs-mindata + get-tests-for-run-mindata + get-run-name-from-id + get-runs + simple-get-runs + get-num-runs + get-runs-cnt-by-patt + get-all-run-ids + get-prev-run-ids + get-run-ids-matching-target + get-runs-by-patt + get-steps-data + get-steps-for-test + read-test-data + read-test-data-varpatt + login + tasks-get-last + testmeta-get-record + have-incompletes? + get-changed-record-ids + get-all-runids + get-changed-record-test-ids + get-changed-record-run-ids + get-run-record-ids + get-not-completed-cnt)) + +(define api:write-queries + '( + get-keys-write ;; dummy "write" query to force server start + + ;; SERVERS + ;; start-server + ;; kill-server + + ;; TESTS + test-set-state-status-by-id + delete-test-records + delete-old-deleted-test-records + test-set-state-status + test-set-top-process-pid + set-state-status-and-roll-up-items + + update-pass-fail-counts + top-test-set-per-pf-counts ;; (db:top-test-set-per-pf-counts (db:get-db *db* 5) 5 "runfirst") + + ;; RUNS + register-run + set-tests-state-status + delete-run + lock/unlock-run + update-run-event_time + mark-incomplete + set-state-status-and-roll-up-run + ;; STEPS + teststep-set-status! 
+ delete-steps-for-test + ;; TEST DATA + test-data-rollup + csv->test-data + + ;; MISC + sync-cachedb->db + drop-all-triggers + create-all-triggers + update-tesdata-on-repilcate-db + + ;; TESTMETA + testmeta-add-record + testmeta-update-field + + ;; TASKS + tasks-add + tasks-set-state-given-param-key + )) + +(define *db-write-mutexes* (make-hash-table)) +(define *server-signature* #f) + +(define *api-threads* '()) +(define (api:register-thread th-in) + (set! *api-threads* (cons (cons th-in (current-seconds)) *api-threads*))) + +(define (api:unregister-thread th-in) + (set! *api-threads* (filter (lambda (thdat) + (not (eq? th-in (car thdat)))) + *api-threads*))) + +(define (api:remove-dead-or-terminated) + (set! *api-threads* (filter (lambda (thdat) + (not (member (thread-state (car thdat)) '(terminated dead)))) + *api-threads*))) + +(define (api:get-count-threads-alive) + (length *api-threads*)) + +(define *api:last-stats-print* 0) +(define *api-print-db-stats-mutex* (make-mutex)) +(define (api:print-db-stats) + (debug:print-info 0 *default-log-port* "Started periodic db stats printer") + (let loop () + (mutex-lock! *api-print-db-stats-mutex*) + (if (> (- (current-seconds) *api:last-stats-print*) 15) + (begin + (dbmod:print-db-stats) + (set! *api:last-stats-print* (current-seconds)))) + (mutex-unlock! *api-print-db-stats-mutex*) + (thread-sleep! 5) + (loop))) + +;; QUEUE METHOD + +(define *api:queue-mutex* (make-mutex)) +(define *api:queue-id* 0) + +(define *api:in-queue* '()) +(define *api:results* (make-hash-table)) ;; id->queue-item + +(defstruct api:queue-item + (proc #f) + (cmd #f) + (run-id #f) + (params #f) + (start-time (current-seconds)) + (end-time #f) + (id #f) + (results #f)) + +;; Add an item to the incoming queue. +;; +(define (api:add-queue-item proc cmd run-id params) + (mutex-lock! *api:queue-mutex*) + (set! *api:queue-id* (+ *api:queue-id* 1)) + (set! 
*api:in-queue* + (cons (make-api:queue-item + proc: proc + cmd: cmd + run-id: run-id + params: params + id: *api:queue-id* + ) + *api:in-queue*)) + (let ((id *api:queue-id*)) + (mutex-unlock! *api:queue-mutex*) + id)) ;; return id so calling proc can find the result in *api:results* + +;; get a queue item from the end of the queue. +;; return #f if there are no items to be processed. +;; +(define (api:get-queue-item) + (mutex-lock! *api:queue-mutex*) + (let* ((res (if (null? *api:in-queue*) + #f + (let* ((revlist (reverse *api:in-queue*))) + (set! *api:in-queue* (reverse (cdr revlist))) + (car revlist))))) + (mutex-unlock! *api:queue-mutex*) + res)) + +(define (api:put-item-in-results id item) + (hash-table-set! *api:results* id item)) + +(define (api:retrieve-result-item id) + (let ((res (hash-table-ref/default *api:results* id #f))) + (if res + (begin + (hash-table-delete! *api:results* id) + res) + #f))) + +;; timeout is in ms, poll less frequently over time +;; +;; Yes, it would be better to do this with mailboxes. My last attempt to use +;; mailboxes resulted in erratic behavior but that was likely due to something +;; unrelated. Just to eliminate uncertainty we'll start with polling and switch +;; to mailboxes laters. +;; +(define (api:wait-for-result id #!key (timeout 30000)) + (let loop ((start (current-milliseconds))) + (thread-sleep! (let ((delta (- (current-milliseconds) start))) + (cond + ((< delta 500) 0.01) + ((< delta 5000) 0.1) + ((< delta 10000) 0.25) + (else 1.25)))) + (let ((res (api:retrieve-result-item id))) + (if res + (api:queue-item-results res) + (loop start))))) + +(define (api:queue-run-one) + (let* ((item (api:get-queue-item))) ;; this removes it from the in-queue + (if item + (let* ((id (api:queue-item-id item)) + (proc (api:queue-item-proc item)) + (result (proc))) + (api:queue-item-end-time-set! item (current-seconds)) + (api:queue-item-results-set! 
item result) + (api:put-item-in-results id item))))) +(define (api:queue-processor) + (let* ((thproc (lambda () + (let loop () + (api:queue-run-one) + (thread-sleep! 0.1) + (loop))))) + (let loop ((thnum 0)) + (thread-start! (make-thread thproc (conc "queue-thread-" thnum))) + (thread-sleep! 0.05) + (if (< thnum 20) + (loop (+ thnum 1)))))) +(define (api:tcp-dispatch-request-make-handler-core dbstruct api:dispatch-request) + (assert *toppath* "FATAL: api:tcp-dispatch-request-make-handler called but *toppath* not set.") + (if (not *server-signature*) + (set! *server-signature* (tt:mk-signature *toppath*))) + (lambda (indat) + (let* ((outer-proc (lambda (cmd run-id params) + (case cmd + ((ping) *server-signature*) ;; but ping in api:dispatch-request is (current-process-id)? + (else + (let* ((id (api:add-queue-item + (lambda () + (api:dispatch-request dbstruct cmd run-id params)) + cmd run-id params))) + (api:wait-for-result id))))))) + ;; (set! *api-process-request-count* numthreads) + (set! *db-last-access* (current-seconds)) + (match indat + ((cmd run-id params meta) + (let* ((start-t (current-milliseconds)) + ;; factor this out and move before this let, it is just + ;; an assert if not ping and dbfname is not correct + (db-ok (let* ((dbfname (dbmod:run-id->dbfname run-id)) + (ok (equal? dbfname (dbr:dbstruct-dbfname dbstruct)))) + (case cmd + ((ping) #t) ;; we are fine + (else + (assert ok "FATAL: database file and run-id not aligned."))))) + (ttdat *server-info*) + (server-state (tt-state ttdat)) + (status 'ok) ;; anything legit we can do with status? + (delay-wait 0) + (result (if (eq? cmd 'ping) + *server-signature* ;; (current-process-id) ;; process id or server-signature? + (outer-proc cmd run-id params))) + (meta (case cmd + ((ping) `((sstate . ,server-state))) + (else `((wait . 
,delay-wait))))) + (errmsg "") + (payload (list status errmsg result meta))) + ;; (cmd run-id params meta) + (db:add-stats cmd run-id params (- (current-milliseconds) start-t)) + payload)) + (else + (assert #f "FATAL: failed to deserialize indat "indat)))))) ) Index: archive.scm ================================================================== --- archive.scm +++ archive.scm @@ -359,11 +359,11 @@ (archive-dir (if archive-info (cdr archive-info) #f)) (archive-id (if archive-info (car archive-info) -1)) (home-host (server:choose-server *toppath* 'homehost)) (archive-time (seconds->std-time-str (current-seconds))) (archive-staging-db (conc *toppath* "/.db-snapshot/archive_" archive-time)) - (tmp-db-path (conc (common:get-db-tmp-area) "/megatest.db")) + (tmp-db-path (conc (dbfile:make-tmpdir-name *toppath* "") "/megatest.db")) (dbfile (conc archive-staging-db "/megatest.db"))) (create-directory archive-staging-db #t) (let-values (((pid-val exit-status exit-code) (run-n-wait rsync-exe params: (list "-v" (conc (car home-host) ":"tmp-db-path) archive-staging-db) print-cmd: print-prefix))) (if (eq? exit-code 0) (case archiver @@ -407,11 +407,11 @@ (bup-restore-params (list "-d" archive-path "restore" "-C" *toppath* archive-internal-path))) (debug:print-info 2 *default-log-port* "Restoring archived data to " *toppath* " from archive in " archive-path " ... 
" archive-internal-path) (run-n-wait bup-exe params: bup-restore-params print-cmd: "Running:")) (sleep 2) (db:multi-db-sync - (db:setup #t) ;; (db:setup-db *dbstruct-dbs* *toppath* #f) + (db:setup) ;; (db:setup-db *dbstruct-dbs* *toppath* #f) 'killservers ;'dejunk ;'adj-testids 'old2new ) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -21,10 +21,11 @@ (declare (unit common)) (declare (uses commonmod)) (declare (uses rmtmod)) (declare (uses debugprint)) (declare (uses mtargs)) + (use srfi-1 data-structures posix regex-case (prefix base64 base64:) format dot-locking csv-xml z3 udp ;; sql-de-lite hostinfo md5 message-digest typed-records directory-utils stack matchable regex posix (srfi 18) extras ;; tcp @@ -139,11 +140,11 @@ (define *pkts-info* (make-hash-table)) ;; store stuff like the last parent here (define *configinfo* #f) ;; raw results from setup, includes toppath and table from megatest.config (define *runconfigdat* #f) ;; run configs data (define *configdat* #f) ;; megatest.config data (define *configstatus* #f) ;; status of data; 'fulldata : all processing done, #f : no data yet, 'partialdata : partial read done -(define *toppath* #f) +;; (define *toppath* #f) ;; moved to commonmod (define *already-seen-runconfig-info* #f) (define *test-meta-updated* (make-hash-table)) (define *globalexitstatus* 0) ;; attempt to work around possible thread issues (define *passnum* 0) ;; when running track calls to run-tests or similar @@ -153,14 +154,10 @@ (define *time-zero* (current-seconds)) ;; for the watchdog (define *on-exit-procs* '()) ;; add procs to this list to be executed on exit (define *default-area-tag* "local") ;; DATABASE -;; (define *dbstruct-dbs* #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this. 
-;; db stats -(define *db-stats* (make-hash-table)) ;; hash of vectors < count duration-total > -(define *db-stats-mutex* (make-mutex)) ;; db access (define *db-last-access* (current-seconds)) ;; last db access, used in server ;; (define *db-write-access* #t) ;; db sync ;; (define *db-last-sync* 0) ;; last time the sync to megatest.db happened @@ -179,13 +176,12 @@ (define *transport-type* 'http) ;; override with [server] transport http|rpc|nmsg (define *runremote* #f) ;; if set up for server communication this will hold ;; (define *max-cache-size* 0) (define *logged-in-clients* (make-hash-table)) (define *server-id* #f) -(define *server-info* #f) ;; good candidate for easily convert to non-global +;; (define *server-info* #f) ;; good candidate for easily convert to non-global (define *time-to-exit* #f) -(define *server-run* #t) (define *run-id* #f) (define *server-kind-run* (make-hash-table)) (define *home-host* #f) ;; (define *total-non-write-delay* 0) (define *heartbeat-mutex* (make-mutex)) @@ -247,11 +243,11 @@ (define *common:this-exe-fullpath* (common:get-this-exe-fullpath)) (define *common:this-exe-dir* (pathname-directory *common:this-exe-fullpath*)) (define *common:this-exe-name* (pathname-strip-directory *common:this-exe-fullpath*)) (define (common:get-sync-lock-filepath) - (let* ((tmp-area (common:get-db-tmp-area)) + (let* ((tmp-area (common:make-tmpdir-name *toppath* "")) (lockfile (conc tmp-area "/megatest.db.lock"))) lockfile)) (define *common:logpro-exit-code->status-sym-alist* '( ( 0 . pass ) @@ -429,12 +425,11 @@ 'adj-target 'new2old '(dejunk) )) ((tcp nfs) - (debug:print 0 *default-log-port* "WARNING: cleanup-db NOT implemented yet for tcp and nfs.") - #;(apply db:multi-db-sync + (apply db:multi-db-sync dbstruct 'schema 'killservers 'adj-target 'new2old @@ -618,11 +613,11 @@ (common:on-homehost?)) (if (common:api-changed?) 
(let* ((mtconf (conc (get-environment-variable "MT_RUN_AREA_HOME") "/megatest.config")) (dbfile (conc (get-environment-variable "MT_RUN_AREA_HOME") ".mtdb/main.db")) (read-only (not (file-write-access? dbfile))) - (dbstruct (db:setup #t))) ;; (db:setup-db *dbstruct-dbs* *toppath* #f))) ;; #t))) + (dbstruct (db:setup))) ;; (db:setup-db *dbstruct-dbs* *toppath* #f))) ;; #t))) (debug:print 0 *default-log-port* "WARNING: Version mismatch!\n" " expected: " (common:version-signature) "\n" " got: " (common:get-last-run-version)) (cond @@ -1533,11 +1528,11 @@ ;; (define (common:lazy-modification-time fpath) (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "Failed to get modifcation time for " fpath ", treating it as zero. exn=" exn) + (debug:print 2 *default-log-port* "Failed to get modification time for " fpath ", treating it as zero. exn=" exn) 0) (if (file-exists? fpath) (file-modification-time fpath) 0))) @@ -1649,12 +1644,12 @@ ;; (let loop ((x 0)) ;; (print x "," (common:get-delay x 1)) ;; (if (< x 2) ;; (loop (+ x 0.1))))) -(define (get-cpu-load #!key (remote-host #f)) - (car (common:get-cpu-load remote-host))) +;; (define (get-cpu-load #!key (remote-host #f)) +;; (car (common:get-cpu-load remote-host))) ;;====================================================================== ;; (let* ((load-res (process:cmd-run->list "uptime")) ;; (load-rx (regexp "load average:\\s+(\\d+)")) ;; (cpu-load #f)) @@ -1666,18 +1661,18 @@ ;; (set! cpu-load newval)))))) ;; (car load-res)) ;; cpu-load)) ;;====================================================================== -;; get values from cached info from dropping file in logs dir +;; get values from cached info from dropping file in .sysdata dir ;; e.g. 
key is host and dtype is normalized-load ;; (define (common:get-cached-info key dtype #!key (age 10)) (if *toppath* (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")) (delfile (lambda (exn) - (debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn) + (debug:print-info 2 *default-log-port* " removing bad file " fullpath ", exn=" exn) (delete-file* fullpath) #f))) (if (and (file-exists? fullpath) (file-read-access? fullpath)) (handle-exceptions @@ -2280,11 +2275,11 @@ (define (common:check-db-dir-space) (let* ((required (string->number ;; default is 1GB (or actually a billion bytes) This is the number of 1 kB blocks. (or (configf:lookup *configdat* "setup" "dbdir-space-required") "1000000"))) - (dbdir (common:get-db-tmp-area)) ;; (db:get-dbdir)) + (dbdir (common:make-tmpdir-name *toppath* "")) ;; (db:get-dbdir)) (tdbspace (common:check-space-in-dir dbdir required)) (mdbspace (common:check-space-in-dir *toppath* required))) (sort (list tdbspace mdbspace) (lambda (a b) (< (cadr a)(cadr b)))))) Index: commonmod.scm ================================================================== --- commonmod.scm +++ commonmod.scm @@ -64,10 +64,11 @@ chicken.condition chicken.file chicken.file.posix chicken.io chicken.pathname + chicken.port chicken.process chicken.process-context chicken.process-context.posix chicken.sort chicken.string @@ -83,10 +84,12 @@ srfi-1 srfi-18 srfi-69 typed-records system-information + + debugprint ))) ;;====================================================================== ;; CONTENTS ;; @@ -132,10 +135,13 @@ (define (client:get-signature) (if *my-client-signature* *my-client-signature* (let ((sig (conc (get-host-name) " " (current-process-id)))) (set! 
*my-client-signature* sig) *my-client-signature*))) + +(define *server-info* #f) +(define *toppath* #f) ;;====================================================================== ;; config file utils ;;====================================================================== @@ -160,10 +166,17 @@ '()))) ;; should it return empty list or #f to indicate not set? (define (get-section cfgdat section) (hash-table-ref/default cfgdat section '())) + +(define (common:make-tmpdir-name areapath tmpadj) + (let* ((area (pathname-file areapath)) + (dname (conc "/tmp/"(current-user-name)"/megatest_localdb/" area "/" (string-translate areapath "/" ".") tmpadj "/.mtdb"))) + (unless (directory-exists? dname) + (create-directory dname #t)) + dname)) ;; dot-locking egg seems not to work, using this for now ;; if lock is older than expire-time then remove it and try again ;; to get the lock ;; @@ -287,13 +300,16 @@ (filter (lambda (x) (not (string-match "^\\s*" x))) val-list)) '()))) -(define (get-cpu-load) - (let* ((load-info (with-input-from-file "/proc/loadavg" read-lines))) - (map string->number (string-split load-info)))) +(define (commonmod:get-cpu-load) + (let* ((load-info (with-input-from-file "/proc/loadavg" read-lines)) + (res (map string->number (string-split (car load-info))))) + (if (null? res) + #f ;; something is wrong + (car res)))) (define *current-host-cores* #f) (define (get-current-host-cores) (or *current-host-cores* @@ -315,11 +331,11 @@ (string->number (read-line))))) ;; get the normalized (i.e. 
load / numcpus) for *this* host ;; (define (get-normalized-cpu-load) - (/ (get-cpu-load)(get-current-host-cores))) + (/ (commonmod:get-cpu-load)(get-current-host-cores))) ;;====================================================================== ;; testsuite and area utilites ;;====================================================================== @@ -399,14 +415,13 @@ ((d) 86400) ((w) 604800) ((M) 2628000) ;; aproximately one month ((y) 31536000) (else - 0))))))) - ;; (print "ERROR: can't parse timestring "tstr", component "part) - ;; can't (yet) use debugprint. rely on -show-config for user to find errors - ))) + 0))))) + (debug:print 0 *default-log-port* "ERROR: can't parse timestring "tstr", component "part", string: "(cadr match)))) + (debug:print 0 *default-log-port* "ERROR: can't parse timestring "tstr", component "part)))) parts) time-secs)) (define (seconds->hr-min-sec secs) (let* ((hrs (quotient secs 3600)) Index: configf.scm ================================================================== --- configf.scm +++ configf.scm @@ -31,10 +31,15 @@ (declare (uses mtargs)) (declare (uses mtargs.import)) (declare (uses common)) (declare (uses commonmod)) (declare (uses commonmod.import)) +(declare (uses dbfile)) +(declare (uses dbfile.import)) +(declare (uses dbmod)) +(declare (uses dbmod.import)) + (import commonmod (prefix mtargs args:) debugprint) (include "common_records.scm") Index: dashboard-tests.scm ================================================================== --- dashboard-tests.scm +++ dashboard-tests.scm @@ -23,10 +23,11 @@ ;;====================================================================== (declare (unit dashboard-tests)) (declare (uses common)) (declare (uses commonmod)) +(declare (uses dcommon)) (declare (uses db)) (declare (uses gutils)) (declare (uses rmt)) (declare (uses ezsteps)) (declare (uses subrun)) @@ -463,11 +464,11 @@ ;;====================================================================== ;; 
;;====================================================================== (define (dashboard-tests:examine-test run-id test-id) ;; run-id run-key origtest) - (let* ((db-path (db:dbfile-path)) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/db/" run-id ".db")) + (let* ((db-path (common:make-tmpdir-name *toppath* "")) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/db/" run-id ".db")) (dbstruct #f) ;; NOT USED (testdat (rmt:get-test-info-by-id run-id test-id)) ;; (db:get-test-info-by-id dbstruct run-id test-id)) (db-mod-time 0) ;; (file-modification-time db-path)) (last-update 0) ;; (current-seconds)) (request-update #t)) ADDED dashboard-transport-mode.scm Index: dashboard-transport-mode.scm ================================================================== --- /dev/null +++ dashboard-transport-mode.scm @@ -0,0 +1,22 @@ +;;====================================================================== +;; set up transport, db cache and sync methods +;; +;; sync-method: 'original, 'attach or 'none +;; cache-method: 'tmp or 'none +;; rmt:transport-mode: 'http, 'tcp, 'nfs +;; +;; NOTE: NOT ALL COMBINATIONS WORK +;; +;;====================================================================== + +;; uncomment this block to test without tcp or cachedb +;; (dbfile:sync-method 'none) +;; (dbfile:cache-method 'none) +;; (rmt:transport-mode 'nfs) + +;; uncomment this block to test with tcp and cachedb +(dbfile:sync-method 'none) ;; original was causing crash on start. +(dbfile:cache-method 'none) +(rmt:transport-mode 'nfs) + + Index: dashboard-transport-mode.scm.template ================================================================== --- dashboard-transport-mode.scm.template +++ dashboard-transport-mode.scm.template @@ -13,10 +13,10 @@ ;; (dbfile:sync-method 'none) ;; (dbfile:cache-method 'none) ;; (rmt:transport-mode 'nfs) ;; uncomment this block to test with tcp and cachedb -(dbfile:sync-method 'attach) ;; original was causing crash on start. 
+(dbfile:sync-method 'none) ;; original was causing crash on start. (dbfile:cache-method 'none) (rmt:transport-mode 'nfs) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -36,12 +36,14 @@ (declare (uses dashboard-context-menu)) (declare (uses vg)) (declare (uses subrun)) (declare (uses mt)) (declare (uses dbmod)) -(declare (uses rmtmod)) (declare (uses dbfile)) +(declare (uses dbfile.import)) +(declare (uses rmtmod)) +(declare (uses rmtmod.import)) (declare (uses commonmod)) (declare (uses commonmod.import)) (use format) @@ -74,10 +76,12 @@ ;; executables such as dashboard and mtutil ;; (include "dashboard-transport-mode.scm") (dbfile:db-init-proc db:initialize-main-db) (set! rmtmod:send-receive rmt:send-receive) + +(debug:print-info 0 *default-log-port* "transport-mode="(rmt:transport-mode)) (define help (conc "Megatest Dashboard, documentation at http://www.kiatoa.com/fossils/megatest version " megatest-version " license GPL, Copyright (C) Matt Welland 2012-2017 @@ -117,12 +121,12 @@ args:arg-hash 0)) (if (args:get-arg "-mode") (let* ((mode (string->symbol (args:get-arg "-mode")))) - (rmt:transport-mode mode)) - (rmt:transport-mode 'tcp)) + (rmt:transport-mode mode))) +;; (rmt:transport-mode 'tcp)) (if (args:get-arg "-test") ;; need to use tcp for test control panel (rmt:transport-mode 'tcp)) ;; RA => Might require revert for filters @@ -139,33 +143,35 @@ ;; (hash-table-set! 
args:arg-hash "-use-db-cache" #t)));;;) ;;) ;; data common to all tabs goes here ;; -(defstruct dboard:commondat - ((curr-tab-num 0) : number) - please-update - tabdats - update-mutex - updaters - updating - uidat ;; needs to move to tabdat at some time - hide-not-hide-tabs - target - ) - -(define (dboard:commondat-make) - (make-dboard:commondat - curr-tab-num: 0 - tabdats: (make-hash-table) - please-update: #t - update-mutex: (make-mutex) - updaters: (make-hash-table) - updating: #f - hide-not-hide-tabs: #f - target: "" - )) +;; Moved to dcommon.scm +;; +;; (defstruct dboard:commondat +;; ((curr-tab-num 0) : number) +;; please-update +;; tabdats +;; update-mutex +;; updaters +;; updating +;; uidat ;; needs to move to tabdat at some time +;; hide-not-hide-tabs +;; target +;; ) +;; +;; (define (dboard:commondat-make) +;; (make-dboard:commondat +;; curr-tab-num: 0 +;; tabdats: (make-hash-table) +;; please-update: #t +;; update-mutex: (make-mutex) +;; updaters: (make-hash-table) +;; updating: #f +;; hide-not-hide-tabs: #f +;; target: "" +;; )) ;;====================================================================== ;; buttons color using image ;;====================================================================== @@ -208,39 +214,17 @@ ;; (iup:attribute-set! img1 "2" "255 0 0") (hash-table-set! images name img1) name))) -;; RA => returns the tabdat stored at hashkey passed in commondat-tabdats table (e.g. 0 gives summary) -;; -(define (dboard:common-get-tabdat commondat #!key (tab-num #f)) - (let* ((tnum (or tab-num - (dboard:commondat-curr-tab-num commondat) - 0)) ;; tab-num value is curr-tab-num value in passed commondat - (ht (dboard:commondat-tabdats commondat)) - (res (hash-table-ref/default ht tnum #f))) - (or res - (let ((new-tabdat (dboard:tabdat-make-data))) - (hash-table-set! ht tnum new-tabdat) - new-tabdat)))) - -;; RA => sets the tabdat passed to the hashkey at commondat:tabdats hash table -;; -(define (dboard:common-set-tabdat! 
commondat tabnum tabdat) - (hash-table-set! - (dboard:commondat-tabdats commondat) - tabnum - tabdat)) - ;; gets and calls updater list based on curr-tab-num ;; (define (dboard:common-run-curr-updaters commondat #!key (tab-num #f)) ;; (sync-db-to-tmp (dboard:common-get-tabdat commondat tab-num: tab-num)) ;; no longer applies ;; maybe need sleep here? - (if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat (let* ((tnum (or tab-num (dboard:commondat-curr-tab-num commondat))) (updaters (hash-table-ref/default (dboard:commondat-updaters commondat) tnum '()))) @@ -402,12 +386,12 @@ (dboard:setup-tabdat dat) (dboard:setup-num-rows dat) dat)) (define (dboard:setup-tabdat tabdat) - (dboard:tabdat-dbdir-set! tabdat (db:dbfile-path)) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/.db")) - (dboard:tabdat-dbfpath-set! tabdat (db:dbfile-path)) + (dboard:tabdat-dbdir-set! tabdat (common:make-tmpdir-name *toppath* "")) ;; (conc (configf:lookup *configdat* "setup" "linktree") "/.db")) + (dboard:tabdat-dbfpath-set! tabdat (common:make-tmpdir-name *toppath* "")) (dboard:tabdat-monitor-db-path-set! tabdat (conc (dboard:tabdat-dbdir tabdat) "/monitor.db")) ;; HACK ALERT: this is a hack, please fix. (dboard:tabdat-ro-set! tabdat (not (file-read-access? 
(dboard:tabdat-dbfpath tabdat)))) @@ -671,11 +655,11 @@ ;; (define (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals) (let* ((start-time (current-seconds)) (access-mode (dboard:tabdat-access-mode tabdat)) (num-to-get (string->number (or (configf:lookup *configdat* "setup" "num-tests-to-get") - "200"))) + "1000"))) (states (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat))) (statuses (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat))) (do-not-use-db-file-timestamps #f) ;; (configf:lookup *configdat* "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab (do-not-use-query-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-query-timestamps")) ;; this no longer troubles runs-summary-tab (sort-info (get-curr-sort)) @@ -749,16 +733,16 @@ (for-each (lambda (tdat) (let ((test-id (db:test-get-id tdat)) (state (db:test-get-state tdat))) - (dboard:rundat-data-changed-set! run-dat #t) - (if (equal? state "DELETED") - (hash-table-delete! tests-ht test-id) - (hash-table-set! tests-ht test-id tdat)))) - tmptests) - + (dboard:rundat-data-changed-set! run-dat #t) + (if (equal? state "DELETED") + (hash-table-delete! tests-ht test-id) + (hash-table-set! tests-ht test-id tdat)))) + tmptests) + tests-ht)) ;; tmptests - new tests data ;; prev-tests - old tests data ;; @@ -853,10 +837,16 @@ (loop run tal new-res newmaxtests) ;; not done getting data for this run (loop (car tal)(cdr tal) new-res newmaxtests))))))) (dboard:tabdat-filters-changed-set! tabdat #f) (dboard:update-tree tabdat runs-hash header tb))) + +(define *dashboard-last-run-id-update* (make-hash-table)) ;; id => seconds + +(define (dboard:clear-run-id-update-hash) + (hash-table-clear! *dashboard-last-run-id-update*)) + ;; this calls dboard:get-tests-for-run-duplicate for each run ;; ;; create a virtual table of all the tests ;; keypatts: ( (KEY1 "abc%def")(KEY2 "%") ) ;; @@ -889,63 +879,82 @@ (dboard:tabdat-item-test-names-set! 
tabdat '()) (hash-table-clear! (dboard:tabdat-allruns-by-id tabdat))) (let loop ((run (car runs)) (tal (cdr runs)) (res '()) - (maxtests 0)) + (maxtests 0) + (cont-run #f)) (let* ((run-id (db:get-value-by-header run header "id")) + (recently-done (< (- (current-seconds) + (hash-table-ref/default *dashboard-last-run-id-update* run-id 0)) 1)) (run-struct (hash-table-ref/default (dboard:tabdat-allruns-by-id tabdat) run-id #f)) ;; (last-update (if run-struct (dboard:rundat-last-update run-struct) 0)) (key-vals (rmt:get-key-vals run-id)) - (tests-ht (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals)) + (tests-ht (let* ((tht (if (and recently-done run-struct) + (let ((rht (dboard:rundat-tests run-struct))) ;; (dboard:tabdat-allruns-by-id tabdat))) + (or rht + (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals))) + (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals)))) + (assert (hash-table? tht) "FATAL: But here tht should be a hash-table") + tht)) ;; GET RID OF dboard:get-tests-dat - it is superceded by dboard:get-tests-for-run-duplicate ;; dboard:get-tests-for-run-duplicate - returns a hash table ;; (dboard:get-tests-dat tabdat run-id last-update)) (all-test-ids (hash-table-keys tests-ht)) - (num-tests (length all-test-ids))) - ;; (print "run-struct: " run-struct) - ;; NOTE: bubble-up also sets the global (dboard:tabdat-item-test-names tabdat) - ;; (tests (bubble-up tmptests priority: bubble-type)) - ;; NOTE: 11/01/2013 This routine is *NOT* getting called excessively. - ;; (debug:print 0 *default-log-port* "Getting data for run " run-id " with key-vals=" key-vals) - ;; Not sure this is needed? - (let* ((newmaxtests (max num-tests maxtests)) - ;; (last-update (- (current-seconds) 10)) - (run-struct (or run-struct - (dboard:rundat-make-init - run: run - tests: tests-ht - key-vals: key-vals))) - (new-res (if (null? 
all-test-ids) - res - (delete-duplicates - (cons run-struct res) - (lambda (a b) - (eq? (db:get-value-by-header (dboard:rundat-run a) header "id") - (db:get-value-by-header (dboard:rundat-run b) header "id")))))) - (elapsed-time (- (current-seconds) start-time))) - (if (null? all-test-ids) + (num-tests (length all-test-ids)) + ;; (print "run-struct: " run-struct) + ;; NOTE: bubble-up also sets the global (dboard:tabdat-item-test-names tabdat) + ;; (tests (bubble-up tmptests priority: bubble-type)) + ;; NOTE: 11/01/2013 This routine is *NOT* getting called excessively. + ;; (debug:print 0 *default-log-port* "Getting data for run " run-id " with key-vals=" key-vals) + ;; Not sure this is needed? + (newmaxtests (max num-tests maxtests)) + ;; (last-update (- (current-seconds) 10)) + (run-struct (or run-struct + (dboard:rundat-make-init + run: run + tests: tests-ht + key-vals: key-vals))) + (new-res (if (null? all-test-ids) + res + (delete-duplicates + (cons run-struct res) + (lambda (a b) + (eq? (db:get-value-by-header (dboard:rundat-run a) header "id") + (db:get-value-by-header (dboard:rundat-run b) header "id")))))) + (elapsed-time (- (current-seconds) start-time))) + (if (null? all-test-ids) (hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id) (hash-table-set! (dboard:tabdat-allruns-by-id tabdat) run-id run-struct)) - (if (or (null? tal) - (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update - (begin - (when (> elapsed-time 2) - (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.") - (let* ((old-val (iup:attribute *tim* "TIME")) - (new-val (number->string (inexact->exact (floor (* 2 (string->number old-val))))))) - (if (< (string->number new-val) 5000) - (begin - (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val) - (iup:attribute-set! 
*tim* "TIME" new-val))))) - (dboard:tabdat-allruns-set! tabdat new-res) - maxtests) - (if (> (dboard:rundat-run-data-offset run-struct) 0) - (loop run tal new-res newmaxtests) ;; not done getting data for this run - (loop (car tal)(cdr tal) new-res newmaxtests))))))) - (dboard:tabdat-filters-changed-set! tabdat #f) - (dboard:update-tree tabdat runs-hash header tb))) + + (if (or (null? tal) + (> elapsed-time 2)) ;; stop loading data after 5 + ;; seconds, on the next call + ;; more data *should* be + ;; loaded since + ;; get-tests-for-run uses last + ;; update + (begin + (when (> elapsed-time 2) + (debug:print 2 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.") + (let* ((old-val (iup:attribute *tim* "TIME")) + (new-val (number->string (inexact->exact (floor (* 2 (string->number old-val))))))) + (if (< (string->number new-val) 5000) + (begin + (debug:print 2 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val) + (iup:attribute-set! *tim* "TIME" new-val))))) + (dboard:tabdat-allruns-set! tabdat new-res) + maxtests) + (if (> (dboard:rundat-run-data-offset run-struct) 0) + (begin + (thread-sleep! 0.2) ;; let the gui re-draw + (loop run tal new-res newmaxtests #t)) ;; not done getting data for this run + (begin + (hash-table-set! *dashboard-last-run-id-update* run-id (current-seconds)) + (loop (car tal)(cdr tal) new-res newmaxtests #f))))))) + (dboard:tabdat-filters-changed-set! 
tabdat #f) + (dboard:update-tree tabdat runs-hash header tb))) (define *collapsed* (make-hash-table)) (define (toggle-hide lnum uidat) ; fulltestname) (let* ((btn (vector-ref (dboard:uidat-get-lftcol uidat) lnum)) @@ -1153,15 +1162,15 @@ (drop (dboard:tabdat-all-test-names tabdat) (dboard:tabdat-start-test-offset tabdat)) '()))) (append xl (make-list (- (dboard:tabdat-num-tests tabdat) (length xl)) "")))) (update-labels uidat (dboard:tabdat-all-test-names tabdat)) - (for-each + (for-each ;;run (lambda (rundat) - ;; if rundat is junk clobber it with a decent placeholder (if (or (not rundat) ;; handle padded runs (not (dboard:rundat-run rundat))) + ;; Need to put an empty column in to erase previous contents. (set! rundat (dboard:rundat-make-init key-vals: (map (lambda (x) "")(dboard:tabdat-keys tabdat))))) (let* ((run (dboard:rundat-run rundat)) (testsdat-by-name (dboard:rundat-tests-by-name rundat)) (key-val-dat (dboard:rundat-key-vals rundat)) @@ -1168,23 +1177,22 @@ (run-id (db:get-value-by-header run (dboard:tabdat-header tabdat) "id")) (key-vals (append key-val-dat (list (let ((x (db:get-value-by-header run (dboard:tabdat-header tabdat) "runname"))) (if (string? x) x ""))))) (run-key (string-intersperse key-vals "\n"))) - + ;; fill in the run header key values ;; - (let ((rown 0) + (let ((rown 0) (headercol (vector-ref tableheader coln))) (for-each (lambda (kval) (let* ((labl (vector-ref headercol rown))) (if (not (equal? kval (iup:attribute labl "TITLE"))) (iup:attribute-set! (vector-ref headercol rown) "TITLE" kval)) (set! 
rown (+ rown 1)))) key-vals)) - - ;; For this run now fill in the buttons for each test + ;; For this run now fill in the buttons for each test ;; (let ((rown 0) (columndat (vector-ref table coln))) (for-each (lambda (testname) @@ -1197,17 +1205,12 @@ ;; testsdat))) (if (not matching) (vector -1 -1 "" "" "" 0 "" "" 0 "" "" "" 0 "" "") ;; (car matching)))) matching))) - (testname (db:test-get-testname testdat)) - (itempath (db:test-get-item-path testdat)) - (testfullname (test:test-get-fullname testdat)) (teststatus (db:test-get-status testdat)) (teststate (db:test-get-state testdat)) - ;;(teststart (db:test-get-event_time test)) - ;;(runtime (db:test-get-run_duration test)) (buttontxt (cond ((member teststate '("COMPLETED" "ARCHIVED")) teststatus) ((and (equal? teststate "NOT_STARTED") (member teststatus '("ZERO_ITEMS" "BLOCKED" "PREQ_FAIL" "PREQ_DISCARDED" "TIMED_OUT" "KEEP_TRYING" "TEN_STRIKES"))) teststatus) @@ -1402,11 +1405,15 @@ tp))) (states (dboard:tabdat-states tabdat)) (statuses (dboard:tabdat-statuses tabdat)) (target (let ((targ-list (dboard:tabdat-target tabdat))) (if targ-list (string-intersperse targ-list "/") "no-target-selected"))) - (run-name (dboard:tabdat-run-name tabdat)) + (run-name (let ((run-input (dboard:tabdat-run-name tabdat)) + ) + (if (equal? run-input "") + "no-runname-specified" + run-input))) (states-str (if (or (not states) (null? states)) "" (conc " -state " (string-intersperse states ",")))) (statuses-str (if (or (not statuses) @@ -2404,20 +2411,21 @@ ) )) "runs-summary-click-callback")))) (runs-summary-updater (lambda () - (mutex-lock! update-mutex) + ;; (mutex-lock! update-mutex) (if (or (dashboard:database-changed? 
commondat tabdat context-key: 'runs-summary-updater) (dboard:tabdat-view-changed tabdat)) (debug:catch-and-dump (lambda () ;; check that run-matrix is initialized before calling the updater (if run-matrix (dashboard:runs-summary-updater commondat tabdat tb cell-lookup run-matrix))) "dashboard:runs-summary-updater") ) - (mutex-unlock! update-mutex))) + #;(mutex-unlock! update-mutex) + )) (runs-summary-control-panel (dashboard:runs-summary-control-panel tabdat)) ) (dboard:commondat-add-updater commondat runs-summary-updater tab-num: tab-num) (dboard:tabdat-runs-tree-set! tabdat tb) (iup:vbox @@ -2460,11 +2468,11 @@ (iup:vbox (iup:textbox #:size "120x15" #:fontsize "10" #:value "%" #:expand "NO" #:action (lambda (obj unk val) (debug:catch-and-dump - (lambda () + (lambda ()57 (mark-for-update tabdat) (update-search commondat tabdat "test-name" val)) "make-controls"))) (iup:hbox (iup:button "Quit" #:action (lambda (obj) @@ -2479,10 +2487,11 @@ (dboard:tabdat-allruns-by-id-set! tabdat (make-hash-table)) (dboard:tabdat-done-runs-set! tabdat '()) (dboard:tabdat-not-done-runs-set! tabdat '()) (dboard:tabdat-view-changed-set! tabdat #t) (dboard:commondat-please-update-set! commondat #t) + (dboard:clear-run-id-update-hash) (mark-for-update tabdat)) #:expand "NO" #:size "40x15") (iup:button "Collapse" #:action (lambda (obj) (debug:catch-and-dump (lambda () @@ -3112,11 +3121,11 @@ (debug:print 2 *default-log-port* "WARNING: error in accessing databases in get-youngest-run-db-mod-time: " ((condition-property-accessor 'exn 'message) exn) " db-dir="dbdir ", exn=" exn) (current-seconds)) ;; something went wrong - just print an error and return current-seconds (common:max (map (lambda (filen) (file-modification-time filen)) - (glob (conc dbdir "/*.db*")))))) + (cons (conc dbdir "/main.db") (glob (conc dbdir "/?.db"))))))) (define (dashboard:monitor-changed? 
commondat tabdat) (let* ((run-update-time (current-seconds)) (monitor-db-path (dboard:tabdat-monitor-db-path tabdat)) (monitor-modtime (if (and monitor-db-path (common:file-exists? monitor-db-path)) @@ -3137,11 +3146,11 @@ (hash-table-set! (dboard:tabdat-last-db-update tabdat) context newtime)) ;; (define (dashboard:database-changed? commondat tabdat #!key (context-key 'default)) (let* ((run-update-time (current-seconds)) - (dbdir (conc *toppath* "/.mtdb"`)) + (dbdir (conc *toppath* "/.mtdb")) (modtime (dashboard:get-youngest-run-db-mod-time dbdir)) (recalc (dashboard:recalc modtime (dboard:commondat-please-update commondat) (dboard:get-last-db-update tabdat context-key)))) (if recalc @@ -3344,14 +3353,14 @@ (vch (dboard:tabdat-view-changed tabdat))) (if (and cnv dwg vch) (begin (vg:drawing-xoff-set! dwg (dboard:tabdat-xadj tabdat)) (vg:drawing-yoff-set! dwg (dboard:tabdat-yadj tabdat)) - (mutex-lock! mtx) + ;; (mutex-lock! mtx) (canvas-clear! cnv) (vg:draw dwg tabdat) - (mutex-unlock! mtx) + ;; (mutex-unlock! mtx) (dboard:tabdat-view-changed-set! tabdat #f))))) ;; doesn't work. ;; ;;(define (gotoescape tabdat escape) @@ -3631,17 +3640,17 @@ (graph-uly (- (calc-y 0) canvas-margin)) (sec-per-50pt (/ 50 timescale)) ) ;; (print "timeoffset: " timeoffset " timescale: " timescale " run-duration: " (seconds->hr-min-sec run-duration) " width: " width " sec-per-50pt: " sec-per-50pt) ;; (print "timescale: " timescale " timeoffset: " timeoffset " sizex: " sizex " originx: " originx) - (mutex-lock! mtx) + ;; (mutex-lock! mtx) (vg:add-comp-to-lib runslib run-full-name runcomp) ;; Have to keep moving the instantiated box as it is anchored at the lower left ;; this should have worked for x in next statement? (maptime run-start) ;; add 60 to make room for the graph (vg:instantiate drawing "runslib" run-full-name run-full-name 8 (- (calc-y curr-run-start-row) (+ 5 graph-height run-to-run-margin))) - (mutex-unlock! mtx) + ;; (mutex-unlock! mtx) ;; (set! 
run-start-row (+ max-row 2)) ;; (dboard:tabdat-start-row-set! tabdat (+ new-run-start-row 1)) ;; get tests in list sorted by event time ascending (let testsloop ((test-ids (car hierdat)) ;; loop on tests (NOTE: not items!) (tests-tal (cdr hierdat)) @@ -3742,13 +3751,13 @@ (outln (vg:make-rect-obj -5 lly ulx uly text: run-full-name line-color: (vg:rgb->number 255 0 255 a: 128)))) ; (vg:components-get-extents d1 c1))) ;; this is the box around the run - (mutex-lock! mtx) + ;; (mutex-lock! mtx) (vg:add-obj-to-comp runcomp outln) - (mutex-unlock! mtx) + ;; (mutex-unlock! mtx) ;; this is where we have enough info to place the graph (dboard:graph commondat tabdat tab-num -5 (+ uly 10) ulx (+ uly graph-height 3) run-start run-end timescale maptime run-full-name canvas-margin) (dboard:tabdat-max-row-set! tabdat (+ (dboard:tabdat-max-row tabdat)(quotient (+ graph-height 40 3) row-height))) ;; (vg:instance-move drawing run-full-name 0 (dboard:tabdat-max-row tabdat)) )) @@ -3887,22 +3896,22 @@ ;; tab-num: 2) (iup:callback-set! *tim* "ACTION_CB" (lambda (time-obj) (let ((update-is-running #f)) - (mutex-lock! (dboard:commondat-update-mutex commondat)) - (set! update-is-running (dboard:commondat-updating commondat)) - (if (not update-is-running) - (dboard:commondat-updating-set! commondat #t)) - (mutex-unlock! (dboard:commondat-update-mutex commondat)) - (if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update + ;; (mutex-lock! (dboard:commondat-update-mutex commondat)) + (set! update-is-running (dboard:commondat-updating commondat)) + (if (not update-is-running) + (dboard:commondat-updating-set! commondat #t)) + ;; (mutex-unlock! (dboard:commondat-update-mutex commondat)) + (if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update (begin (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat) - (mutex-lock! 
(dboard:commondat-update-mutex commondat)) + ;; (mutex-lock! (dboard:commondat-update-mutex commondat)) (dboard:commondat-updating-set! commondat #f) - (mutex-unlock! (dboard:commondat-update-mutex commondat))) - )) + ;; (mutex-unlock! (dboard:commondat-update-mutex commondat)) + ))) 1)))) ;; (debug:print 0 *default-log-port* "Starting updaters") (let ((th1 (make-thread (lambda () (thread-sleep! 1) (dboard:common-run-curr-updaters commondat 0) ;; force update of summary tab @@ -3923,11 +3932,11 @@ (define (sync-db-to-tmp tabdat) (let* ((db-file "./.mtdb/main.db")) (if (and (not (file-write-access? db-file)) ( > (current-seconds) (+ last-copy-time 5))) (begin - (db:multi-db-sync (db:setup #f) 'old2new) + (db:multi-db-sync (db:setup) 'old2new) (set! last-copy-time (current-seconds)) ) ) ) ) @@ -3944,11 +3953,14 @@ (exit 1)))) '("MT_RUN_AREA_HOME" "MT_MEGATEST" "MT_CMDINFO" "MT_TEST_RUN_DIR" "MT_LINKTREE" "MT_TESTSUITENAME")) ) ) -(setenv "MT_RUN_AREA_HOME" (get-environment-variable "PWD")) +;; This is NOT good +;; (setenv "MT_RUN_AREA_HOME" (get-environment-variable "PWD")) +;; This should be OK but it really should not be necessary +(setenv "MT_RUN_AREA_HOME" (current-directory)) (if (not (null? 
remargs)) (if remargs (begin (debug:print 0 *default-log-port* "Unrecognised arguments: " (string-intersperse remargs " ")) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -131,15 +131,15 @@ (debug:print-error 0 *default-log-port* " query " stmt " failed, params: " params ", error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn) (print-call-chain (current-error-port)) default))) (apply sqlite3:first-result db stmt params))) -(define (db:setup do-sync) +(define (db:setup) (assert *toppath* "FATAL: db:setup called before launch:setup has been run.") - (let* ((tmpdir (common:get-db-tmp-area))) + (let* ((tmpdir (common:make-tmpdir-name *toppath* ""))) (if (not *dbstruct-dbs*) - (dbfile:setup do-sync *toppath* tmpdir) + (dbfile:setup (conc *toppath* "/.mtdb") tmpdir) *dbstruct-dbs*))) ;; moved from dbfile ;; ;; ADD run-id SUPPORT @@ -267,17 +267,10 @@ ", arguments: " ((condition-property-accessor 'exn 'arguments) exn) ", location: " ((condition-property-accessor 'exn 'location) exn) )) -;; NB// #f => return dbdir only -;; (was planned to be; zeroth db with name=main.db) -;; -;; If run-id is #f return to create and retrieve the path where the db will live. 
-;; -(define db:dbfile-path common:get-db-tmp-area) - (define (db:set-sync db) (let ((syncprag (configf:lookup *configdat* "setup" "sychronous"))) (sqlite3:execute db (conc "PRAGMA synchronous = " (or syncprag 0) ";")))) @@ -467,11 +460,11 @@ (get-mtime shm-file)))) ;; (define (db:all-db-sync dbstruct) ;; (let* ((dbdat (db:open-db dbstruct #f db:initialize-main-db)) ;; (data-synced 0) ;; count of changed records -;; (tmp-area (common:get-db-tmp-area)) +;; (tmp-area (common:make-tmpdir-name *toppath*)) ;; (dbfiles (glob (conc tmp-area"/.mtdb/*.db"))) ;; (sync-durations (make-hash-table)) ;; (no-sync-db (db:open-no-sync-db))) ;; (for-each ;; (lambda (file) ;; tmp db file @@ -528,23 +521,63 @@ ;; (if dbdat (dbfile:add-dbdat dbstruct #f dbdat)) ;; ) ;; #t) (define (db:kill-servers) - (let* ((servers (server:choose-server *toppath* 'all-valid))) ;; (server:get-list *toppath*)) - (for-each - (lambda (server) - (handle-exceptions - exn - (begin - (debug:print-info 0 *default-log-port* "Unable to get host and/or port from " server ", exn=" exn) - #f) - (match-let (((mod-time host port start-time server-id pid) server)) - (if (and host pid) - (tasks:kill-server host pid))))) - servers) - (delete-file* (common:get-sync-lock-filepath)))) + (let* ((tl (launch:setup)) ;; need this to initialize *toppath* + (servdir (conc *toppath* "/.servinfo")) + (servfiles (glob (conc servdir "/*:*.db"))) + (fmtstr "~10a~22a~10a~25a~25a~8a\n") + (dbfiles (append (glob (conc *toppath* "/.mtdb/main.db")) (glob (conc *toppath* "/.mtdb/?.db"))(glob (conc *toppath* "/.mtdb/??.db")))) + (ttdat (make-tt areapath: *toppath*)) + ) + (format #t fmtstr "DB" "host:port" "PID" "age" "last mod" "state") + (for-each + (lambda (dbfile) + (let* ( + (dbfname (conc (pathname-file dbfile) ".db")) + (sfiles (tt:find-server *toppath* dbfname)) + ) + (for-each + (lambda (sfile) + (let ( + (sinfos (tt:get-server-info-sorted ttdat dbfname)) + ) + (for-each + (lambda (sinfo) + (let* ( + (db (list-ref sinfo 5)) + (pid 
(list-ref sinfo 4)) + (host (list-ref sinfo 0)) + (port (list-ref sinfo 1)) + (server-id (list-ref sinfo 3)) + (age (seconds->hr-min-sec (- (current-seconds) (list-ref sinfo 2)))) + (last-mod (seconds->string (list-ref sinfo 2))) + (killed (system (conc "ssh " host " kill " pid " > /dev/null"))) + (dummy2 (sleep 1)) + (state (if (> (system (conc "ssh " host " ps " pid " > /dev/null")) 0) "dead" "alive")) + ) + (format #t fmtstr db (conc host ":" port) pid age last-mod state) + (system (conc "rm " sfile)) + ) + ) + sinfos + ) + ) + ) + sfiles + ) + ) + ) + dbfiles + ) + ;; remove this db, because otherwise metadata contains records for old servers, and this causes a problem with db:no-sync-get-lock-with-id. + (if (file-exists? (conc *toppath* "/.mtdb/no-sync.db")) + (delete-file (conc *toppath* "/.mtdb/no-sync.db")) + ) + ) +) ;; options: ;; ;; 'killservers - kills all servers ;; 'dejunk - removes junk records @@ -556,34 +589,34 @@ ;; run-ids: '(1 2 3 ...) or #f (for all) ;; (define (db:multi-db-sync dbstruct . 
options) (let* (;; (dbdat (db:open-db dbstruct #f dbfile:db-init-proc)) (data-synced 0) ;; count of changed records - (tmp-area (common:get-db-tmp-area)) + (tmp-area (common:make-tmpdir-name *toppath* "")) (old2new (member 'old2new options)) (dejunk (member 'dejunk options)) (killservers (member 'killservers options)) (src-area (if old2new *toppath* tmp-area)) - (dest-area (if old2new tmp-area *toppath*)) + (dest-area (if old2new tmp-area (conc *toppath* "/.mtdb"))) (dbfiles (if old2new (glob (conc *toppath* "/.mtdb/*.db")) - (glob (conc tmp-area "/.mtdb/*.db")))) + (glob (conc tmp-area "/*.db")))) (keys (db:get-keys dbstruct)) (sync-durations (make-hash-table))) ;; kill servers - (if killservers (db:kill-servers)) + ;; (if killservers (db:kill-servers)) (if (not dbfiles) (debug:print-error 0 *default-log-port* "no dbfiles found in " (conc *toppath* "/.mtdb")) (for-each (lambda (srcfile) (debug:print-info 3 *default-log-port* "file: " srcfile) (let* ((fname (conc (pathname-file srcfile) ".db")) (basename (pathname-file srcfile)) (run-id (if (string= basename "main") #f (string->number basename))) - (destfile (conc dest-area "/.mtdb/" fname)) - (dest-directory (conc dest-area "/.mtdb/")) + (destfile (conc dest-area "/" fname)) + (dest-directory dest-area) (time1 (file-modification-time srcfile)) (time2 (if (file-exists? destfile) (begin (debug:print-info 2 *default-log-port* "destfile " destfile " exists") (file-modification-time destfile)) @@ -603,33 +636,37 @@ #t) ((and changed *time-to-exit*) ;; last sync #t) (else #f)))) - (if (or dejunk do-cp) + + (if (or dejunk do-cp) (let* ((start-time (current-milliseconds)) - ;; subdb is misnamed - should be dbdat (I think...) 
- (subdb (dbfile:open-db dbstruct run-id dbfile:db-init-proc)) - ;; (or (dbfile:get-subdb dbstruct run-id) - ;; (dbfile:init-subdb dbstruct run-id dbfile:db-init-proc))) + (subdb (or (dbfile:get-subdb dbstruct run-id) (dbfile:init-subdb dbstruct run-id dbfile:db-init-proc))) + (dbdat (or (dbfile:get-dbdat dbstruct run-id) (dbfile:open-db dbstruct run-id dbfile:db-init-proc))) (mtdb (dbr:subdb-mtdbdat subdb)) ;; ;; BUG: -mrw- I think this next line is wrong. run-id should be the path to .mtdb/.db ;; (tmpdb (dbfile:open-db dbstruct run-id dbfile:db-init-proc))) + (if dejunk + (begin + (debug:print 0 *default-log-port* "Cleaning tmp DB") + (db:clean-up run-id tmpdb) + (debug:print 0 *default-log-port* "Cleaning nfs DB") + (db:clean-up run-id mtdb) + ) + ) (debug:print-info 2 *default-log-port* "delta syncing file: " srcfile ", time diff: " (- time1 time2) " seconds") (if old2new (begin - (if dejunk (db:clean-up run-id mtdb)) (db:sync-tables (db:sync-all-tables-list - dbstruct (db:get-keys dbstruct)) #f mtdb tmpdb)) (begin - (if dejunk (db:clean-up run-id tmpdb)) - (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) #f tmpdb mtdb))) + (db:sync-tables (db:sync-all-tables-list (db:get-keys dbstruct)) #f tmpdb mtdb))) (hash-table-set! sync-durations (conc srcfile ".db") (- (current-milliseconds) start-time))) (debug:print-info 2 *default-log-port* "skipping delta sync. " srcfile " is up to date")))) dbfiles)) data-synced)) @@ -641,11 +678,11 @@ (for-each (lambda (subdb) (let* ((mtdb (dbr:subdb-mtdb subdb)) (tmpdb (db:get-subdb dbstruct run-id)) (refndb (dbr:subdb-refndb subdb)) - (newres (db:sync-tables (db:sync-all-tables-list dbstruct (db:get-keys dbstruct)) last-update tmpdb refndb mtdb))) + (newres (db:sync-tables (db:sync-all-tables-list (db:get-keys dbstruct)) last-update tmpdb refndb mtdb))) ;; (stack-push! (dbr:subdb-dbstack subdb) tmpdb) ;; BUG: verify this is really needed (dbfile:add-dbdat dbstruct run-id tmpdb) (set! 
res (cons newres res)))) subdbs) @@ -1149,16 +1186,19 @@ ;; 2. Look at run records ;; a. If have tests that are not deleted, set state='unknown' ;; b. .... ;; (define (db:clean-up run-id dbdat) - (debug:print 2 *default-log-port* "db:clean-up") - - (if run-id - (db:clean-up-rundb dbdat) - (db:clean-up-maindb dbdat) + (begin + (debug:print 0 *default-log-port* "Cleaning run DB " run-id) + (db:clean-up-rundb dbdat run-id) + ) + (begin + (debug:print 0 *default-log-port* "Cleaning main DB ") + (db:clean-up-maindb dbdat) + ) ) ) ;; Clean out old junk and vacuum the database @@ -1170,38 +1210,42 @@ ;; b. If test dir gone, delete the test record ;; 2. Look at run records ;; a. If have tests that are not deleted, set state='unknown' ;; b. .... ;; -(define (db:clean-up-rundb dbdat) +(define (db:clean-up-rundb dbdat run-id) ;; (debug:print 0 *default-log-port* "WARNING: db clean up not fully ported to v1.60, cleanup action will be on megatest.db") (let* ((db (dbr:dbdat-dbh dbdat)) - (count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM tests);")) + (test-count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM tests);")) + (step-count-stmt (sqlite3:prepare db "SELECT (SELECT count(id) FROM test_steps);")) (statements (map (lambda (stmt) (sqlite3:prepare db stmt)) (list - ;; delete all tests that belong to runs that are 'deleted' - ;; (conc "DELETE FROM tests WHERE run_id NOT IN (" (string-intersperse (map conc valid-runs) ",") ");") - ;; delete all tests that are 'DELETED' "DELETE FROM tests WHERE state='DELETED';" + "DELETE FROM test_steps WHERE status = 'DELETED';" + "DELETE FROM tests WHERE run_id IN (SELECT id FROM runs WHERE state = 'deleted');" )))) - ;; (db:delay-if-busy dbdat) (sqlite3:with-transaction db (lambda () (sqlite3:for-each-row (lambda (tot) - (debug:print-info 0 *default-log-port* "Records count before clean: " tot)) - count-stmt) + (debug:print-info 0 *default-log-port* "Test records count before clean: " tot)) + test-count-stmt) + 
(sqlite3:for-each-row (lambda (tot) + (debug:print-info 0 *default-log-port* "Test_step records count before clean: " tot)) + step-count-stmt) (map sqlite3:execute statements) (sqlite3:for-each-row (lambda (tot) - (debug:print-info 0 *default-log-port* "Records count after clean: " tot)) - count-stmt))) + (debug:print-info 0 *default-log-port* "Test records count after clean: " tot)) + test-count-stmt) + (sqlite3:for-each-row (lambda (tot) + (debug:print-info 0 *default-log-port* "Test_step records count after clean: " tot)) + step-count-stmt))) (map sqlite3:finalize! statements) - (sqlite3:finalize! count-stmt) - ;; (db:find-and-mark-incomplete db) - ;; (db:delay-if-busy dbdat) + (sqlite3:finalize! test-count-stmt) + (sqlite3:finalize! step-count-stmt) (sqlite3:execute db "VACUUM;"))) ;; Clean out old junk and vacuum the database ;; ;; Ultimately do something like this: @@ -1235,15 +1279,15 @@ ;; (db:delay-if-busy dbdat) (sqlite3:with-transaction db (lambda () (sqlite3:for-each-row (lambda (tot) - (debug:print-info 0 *default-log-port* "Records count before clean: " tot)) + (debug:print-info 0 *default-log-port* "Run records count before clean: " tot)) count-stmt) (map sqlite3:execute statements) (sqlite3:for-each-row (lambda (tot) - (debug:print-info 0 *default-log-port* "Records count after clean: " tot)) + (debug:print-info 0 *default-log-port* "Run records count after clean: " tot)) count-stmt))) (map sqlite3:finalize! statements) (sqlite3:finalize! 
count-stmt) ;; (db:find-and-mark-incomplete db) ;; (db:delay-if-busy dbdat) @@ -1254,11 +1298,11 @@ ;; no-sync.db - small bits of data to be shared between servers ;;====================================================================== (define (db:get-dbsync-path) (case (rmt:transport-mode) - ((http)(common:get-db-tmp-area)) + ((http)(common:make-tmpdir-name *toppath* "")) ((tcp) (conc *toppath*"/.mtdb")) ((nfs) (conc *toppath*"/.mtdb")) (else "/tmp/dunno-this-gonna-exist"))) ;; This is needed for api.scm @@ -1419,62 +1463,85 @@ #f (simple-run-id (car runs))))) ;; called with run-id=#f so will operate on main.db ;; -(define (db:insert-run dbstruct target runname run-meta) +(define (db:insert-run dbstruct run-id target runname run-meta) (let* ((keys (db:get-keys dbstruct)) (runs (db:simple-get-runs dbstruct runname #f #f target #f))) ;; runpatt count offset target last-update ;; need to insert run based on target and runname (let* ((targvals (string-split target "/")) (keystr (string-intersperse keys ",")) (key?str (string-intersperse (make-list (length targvals) "?") ",")) - (qrystr (conc "INSERT INTO runs (runname,"keystr") VALUES (?,"key?str")")) + (qrystr (conc "INSERT INTO runs (id,runname,"keystr") VALUES (?,?,"key?str")")) (get-var (lambda (db qrystr) (let* ((res #f)) (sqlite3:for-each-row (lambda row (set res (car row))) - db qrystr runname) + db qrystr run-id runname) res)))) (if (null? 
runs) - (db:create-initial-run-record dbstruct runname target)) - (let* ((run-id (db:get-run-id dbstruct runname target))) - (db:with-db + (begin + (db:create-initial-run-record dbstruct run-id runname target) + ) + ) + (let* () + ;;(debug:print 0 *default-log-port* "db:insert-run: Calling db:with-db to update the run record") + (debug:print 0 *default-log-port* "db:insert-run: runid = " run-id) +#; (db:with-db dbstruct #f #t (lambda (dbdat db) + (debug:print 0 *default-log-port* "In the lambda proc for " dbdat " " db) (for-each (lambda (keyval) + (debug:print 0 *default-log-port* "In the lambda proc for " keyval) (let* ((fieldname (car keyval)) (getqry (conc "SELECT "fieldname" FROM runs WHERE id=?;")) (setqry (conc "UPDATE runs SET "fieldname"=? WHERE id=?;")) (val (cdr keyval)) (valnum (if (number? val) val (if (string? val) (string->number val) #f)))) + (debug:print 0 *default-log-port* "fieldname " fieldname " val " val " valnum " valnum) (if (not (member fieldname (cons "runname" keys))) ;; don't attempt to tweak these (let* ((curr-val (get-var db getqry)) (have-it (or (equal? curr-val val) (equal? 
curr-val valnum)))) + (debug:print 0 *default-log-port* "have-it = " have-it) (if (not have-it) - (sqlite3:execute db setqry (or valnum val) run-id)))))) + (begin + (debug:print 0 *default-log-port* "Do sqlite3:execute") + ;; (sqlite3:execute db setqry (or valnum val) run-id) + ) + ) + ) + ) + (debug:print 0 *default-log-port* "Done with update") + ) + (debug:print 0 *default-log-port* "next keyval") + ) run-meta))) run-id)))) -(define (db:create-initial-run-record dbstruct runname target) +(define (db:create-initial-run-record dbstruct run-id runname target) (let* ((keys (db:get-keys dbstruct)) (targvals (string-split target "/")) (keystr (string-intersperse keys ",")) - (key?str (string-intersperse (make-list (length targvals) "?") ",")) - (qrystr (conc "INSERT INTO runs (runname,"keystr") VALUES (?,"key?str")"))) + (key?str (string-intersperse (make-list (length targvals) "?") ",")) ;; a string with the same length as targvals, where each element is "?" and interspersed with commas. + (qrystr (conc "INSERT INTO runs (id,runname,"keystr") VALUES (?,?,"key?str")"))) + (debug:print 0 *default-log-port* "db:create-initial-run-record") + (debug:print 0 *default-log-port* "qrystr = " qrystr) + (db:with-db - dbstruct #f #t + dbstruct #f #t ;; run-id writable (lambda (dbdat db) - (apply sqlite3:execute db qrystr runname targvals))))) + (debug:print 0 *default-log-port* "lambda proc: dbdat: " dbdat " db: " db) + (apply sqlite3:execute db qrystr run-id runname targvals))))) (define (db:insert-test dbstruct run-id test-rec) (let* ((testname (alist-ref "testname" test-rec equal?)) (item-path (alist-ref "item_path" test-rec equal?)) (id (db:get-test-id dbstruct run-id testname item-path)) @@ -1484,11 +1551,11 @@ (conc (car dat)"=?")) fieldvals) ",")" WHERE id=?;")) (insqry (conc "INSERT INTO tests ("(string-intersperse (map (lambda (x) (car x)) fieldvals) ",") ") VALUES ("(string-intersperse (make-list (length fieldvals) "?") ",")");"))) - (debug:print 0 *default-log-port* 
"id: "id"\nset: "setqry"\ninsqry: "insqry) + ;; (debug:print 0 *default-log-port* "id: "id"\nset: "setqry"\ninsqry: "insqry) (db:with-db dbstruct run-id #t (lambda (dbdat db) (if id @@ -1580,11 +1647,11 @@ ;; TODO: Switch this to use max(update_time) from each run db? Then if using a server there is no disk traffic (using cachedb db) ??? ;; ;; NOTE: This DOESN'T (necessarily) get the real run ids, but the number of the .db!! (define (db:get-changed-run-ids since-time) - (let* ((dbdir (db:dbfile-path)) ;; (configf:lookup *configdat* "setup" "dbdir")) + (let* ((dbdir (common:make-tmpdir-name *toppath* "")) ;; (configf:lookup *configdat* "setup" "dbdir")) (alldbs (glob (conc *toppath* "/.mtdb/[0-9]*.db*"))) (changed (filter (lambda (dbfile) (> (file-modification-time dbfile) since-time)) alldbs))) (delete-duplicates @@ -2234,21 +2301,24 @@ qry run-id (or last-update 0)))))) (define (db:get-testinfo-state-status dbstruct run-id test-id) - (let ((res #f)) - (db:with-db dbstruct run-id #f - (lambda (dbdat db) - (sqlite3:for-each-row - (lambda (run-id testname item-path state status) - ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment - (set! res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-"))) - db - "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;" - test-id run-id))) - res)) + (db:with-db + dbstruct run-id #f + (lambda (dbdat db) + (let* ((res #f) + (stmth (db:get-cache-stmth dbdat db "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;"))) + (sqlite3:for-each-row + (lambda (run-id testname item-path state status) + ;; id,run_id,testname,state,status,event_time,host,cpuload,diskfree,uname,rundir,item_path,run_duration,final_logf,comment + (set! 
res (vector test-id run-id testname state status -1 "" -1 -1 "" "-" item-path -1 "-" "-"))) + ;; db + ;; "SELECT run_id,testname,item_path,state,status FROM tests WHERE id=? and run_id=?;" + stmth + test-id run-id) + res)))) ;; get a useful subset of the tests data (used in dashboard ;; use db:mintest-get-{id ,run_id,testname ...} ;; (define (db:get-tests-for-run-mindata dbstruct run-id testpatt states statuses not-in) @@ -2279,25 +2349,37 @@ dbstruct run-id #t (lambda (dbdat db) (sqlite3:execute db "UPDATE tests SET state='DELETED',status='n/a',comment='' WHERE id=?;" test-id)))) ;; -(define (db:delete-old-deleted-test-records dbstruct) - (let ((targtime (- (current-seconds) - (or (configf:lookup-number *configdat* "setup" "keep-deleted-records") - (* 30 24 60 60))))) ;; one month in the past +(define (db:delete-old-deleted-test-records dbstruct run-id) + (let* ((targtime (- (current-seconds) + (or (configf:lookup-number *configdat* "setup" "keep-deleted-records") + (* 7 24 60 60)))) ;; cleanup if over one week old + (mtdbfile (dbmod:run-id->full-dbfname dbstruct run-id)) + (qry1 "DELETE FROM test_steps WHERE test_id IN (SELECT id FROM tests WHERE state='DELETED' AND event_timealist res))))) +;; testmeta doesn't change, we can cache it for up too an hour + +(define *db:testmeta-cache* (make-hash-table)) +(define *db:testmeta-last-update* 0) + ;; read the record given a testname (define (db:testmeta-get-record dbstruct testname) - (let ((res #f)) - (db:with-db - dbstruct - #f - #f - (lambda (dbdat db) - (sqlite3:for-each-row - (lambda (id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup) - (set! 
res (vector id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup))) - db - "SELECT id,testname,author,owner,description,reviewed,iterated,avg_runtime,avg_disk,tags,jobgroup FROM test_meta WHERE testname=?;" - testname) - res)))) + (if (and (< (- (current-seconds) *db:testmeta-last-update*) 600) + (hash-table-exists? *db:testmeta-cache* testname)) + (hash-table-ref *db:testmeta-cache* testname) + (let ((res #f)) + (db:with-db + dbstruct + #f + #f + (lambda (dbdat db) + (sqlite3:for-each-row + (lambda (id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup) + (set! res (vector id testname author owner description reviewed iterated avg_runtime avg_disk tags jobgroup))) + db + "SELECT id,testname,author,owner,description,reviewed,iterated,avg_runtime,avg_disk,tags,jobgroup FROM test_meta WHERE testname=?;" + testname))) + (hash-table-set! *db:testmeta-cache* testname res) + (set! *db:testmeta-last-update* (current-seconds)) + res))) ;; create a new record for a given testname (define (db:testmeta-add-record dbstruct testname) (db:with-db dbstruct #f #t (lambda (dbdat db) @@ -4314,11 +4406,11 @@ )))) ;; sync for filesystem local db writes ;; (define (db:run-lock-and-sync no-sync-db) - (let* ((tmp-area (common:get-db-tmp-area)) + (let* ((tmp-area (common:make-tmpdir-name *toppath* "")) (dbfiles (glob (conc tmp-area"/.mtdb/*.db"))) (sync-durations (make-hash-table))) ;; (debug:print-info 0 *default-log-port* "lock-and-sync, dbfiles: "dbfiles) (for-each (lambda (file) @@ -4370,11 +4462,11 @@ (sync-stale-seconds (configf:lookup-number *configdat* "server" "sync-stale-seconds" default: 300)) (debug-mode (debug:debug-mode 1)) (last-time (current-seconds)) ;; last time through the sync loop (no-sync-db (db:open-no-sync-db)) (sync-duration 0) ;; run time of the sync in milliseconds - (tmp-area (common:get-db-tmp-area))) + (tmp-area (common:make-tmpdir-name *toppath* ""))) ;; Sync moved to http-transport 
keep-running loop (debug:print-info 2 *default-log-port* "Periodic copy-based sync thread started. syncer is copy-sync, tmp-area is " tmp-area) (debug:print-info 3 *default-log-port* "watchdog starting. syncer is copy-sync pid="(current-process-id));; " this-wd-num="this-wd-num) (if (and legacy-sync (not *time-to-exit*)) @@ -4478,11 +4570,11 @@ (for-each (lambda (subdb) (let* (;;(dbstruct (db:setup)) (mtdb (dbr:subdb-mtdb subdb)) (mtpath (db:dbdat-get-path mtdb)) - (tmp-area (common:get-db-tmp-area)) + (tmp-area (common:make-tmpdir-name *toppath* "")) (res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive (set! sync-duration (- (current-milliseconds) sync-start)) (if (> res 0) ;; some records were transferred, keep the db alive (begin (mutex-lock! *heartbeat-mutex*) @@ -4526,11 +4618,10 @@ ;; (db:no-sync-close-db no-sync-db stmt-cache) (if (common:low-noise-print 30) (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id) )))) )) - (define (std-exit-procedure) ;;(common:telemetry-log-close) (on-exit (lambda () 0)) ;; why is this here? ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) (let ((no-hurry (if *time-to-exit* ;; hurry up @@ -4539,11 +4630,11 @@ (set! *time-to-exit* #t) #t)))) (debug:print-info 4 *default-log-port* "starting exit process, finalizing databases.") (if (and no-hurry (debug:debug-mode 18)) - (rmt:print-db-stats)) + (dbmod:print-db-stats)) (let ((th1 (make-thread (lambda () ;; thread for cleaning up, give it five seconds (if *dbstruct-dbs* (db:close-all *dbstruct-dbs*)) ;; one second allocated (if (list? 
*on-exit-procs*) (for-each (lambda (proc) Index: dbfile.scm ================================================================== --- dbfile.scm +++ dbfile.scm @@ -24,13 +24,16 @@ (declare (uses debugprint)) (declare (uses commonmod)) (module dbfile * - - (import scheme - chicken +(import scheme) + +(cond-expand + (chicken-4 + + (import chicken data-structures extras matchable (prefix sqlite3 sqlite3:) @@ -45,11 +48,54 @@ hostinfo commonmod debugprint ) + ) + (chicken-5 + (import (prefix sqlite3 sqlite3:) + ;; data-structures + ;; extras + ;; files + ;; posix + ;; posix-extras + chicken.base + chicken.condition + chicken.file + chicken.file.posix + chicken.format + chicken.io + chicken.pathname + chicken.port + chicken.process + chicken.process-context + chicken.process-context.posix + chicken.sort + chicken.string + chicken.time + chicken.time.posix + + matchable + md5 + message-digest + pathname-expand + regex + regex-case + srfi-1 + srfi-18 + srfi-69 + typed-records + stack + system-information + commonmod + debugprint + ) + (define file-write-access? file-writable?) + (define file-move move-file) + )) + ;; parameters ;; (define dbfile:testsuite-name (make-parameter #f)) (define keep-age-param (make-parameter 10)) ;; qif file age, if over move to attic @@ -242,11 +288,12 @@ #f ) ) (define (dbfile:make-tmpdir-name areapath tmpadj) - (let* ((dname (conc "/tmp/"(current-user-name)"/" (string-translate areapath "/" ".") tmpadj))) + (let* ((area (pathname-file areapath)) + (dname (conc "/tmp/"(current-user-name)"/megatest_localdb/" area "/" (string-translate areapath "/" ".") tmpadj "/.mtdb"))) (unless (directory-exists? 
dname) (create-directory dname #t)) dname)) (define (dbfile:run-id->path apath run-id) @@ -267,18 +314,18 @@ (define (dbfile:run-id->dbfname run-id) (conc (dbfile:run-id->dbnum run-id)".db")) ;; the path in MTRAH with the filename (define (dbfile:run-id->dbname run-id) - (conc ".mtdb/"(dbfile:run-id->dbfname run-id))) + (conc (dbfile:run-id->dbfname run-id))) ;; Make the dbstruct, setup up auxillary db's and call for main db at least once ;; ;; called in http-transport and replicated in rmt.scm for *local* access. ;; -(define (dbfile:setup do-sync areapath tmppath) +(define (dbfile:setup areapath tmppath) (cond (*dbstruct-dbs* (dbfile:print-err "WARNING: dbfile:setup called when *dbstruct-dbs* is already initialized") *dbstruct-dbs*) ;; TODO: when multiple areas are supported, this optimization will be a hazard (else @@ -358,11 +405,12 @@ (if dbdat dbdat (let* ((tmppath (dbr:dbstruct-tmppath dbstruct)) (tmpdbpath (dbfile:run-id->path tmppath run-id)) (dbdat (dbfile:open-sqlite3-db tmpdbpath init-proc sync-mode: 0 journal-mode: "WAL"))) - ;; the following line short-circuits the "one db handle per thread" model + + ;; the following line short-circuits the "one db handle per thread" model ;; ;; (dbfile:add-dbdat dbstruct run-id dbdat) ;; dbdat)))))) @@ -445,19 +493,20 @@ (if journal-mode (sqlite3:execute db (conc "PRAGMA journal_mode = "journal-mode";"))) (if (and init-proc (or force-init (not db-exists))) (init-proc db)) - db))) + db)) + expire-time: 5) (begin (if (file-exists? fname ) (let ((db (sqlite3:open-database fname))) ;; pragmas synchronous not needed because this db is used read-only ;; (sqlite3:execute db (conc "PRAGMA synchronous = "mode";") (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 30000)) ;; read-only but still need timeout db ) - (print "file doesn't exist: " fname)))) + (print "cautious-open-database: file doesn't exist: " fname)))) (exn (io-error) (dbfile:print-err exn "ERROR: i/o error with " fname ". 
Check permissions, disk space etc. and try again.") (retry)) (exn (corrupt) (dbfile:print-err exn "ERROR: database " fname " is corrupt. Repair it to proceed.") @@ -487,11 +536,11 @@ ;; NOTE: this is already protected by mutex *no-sync-db-mutex* ;; (define (dbfile:raw-open-no-sync-db dbpath) (if (not (file-exists? dbpath)) (create-directory dbpath #t)) - (debug:print-info 0 *default-log-port* "Opening "dbpath"/no-sync.db") + (debug:print-info 2 *default-log-port* "(dbfile:raw-open-no-sync-db: Opening "dbpath"/no-sync.db") (let* ((dbname (conc dbpath "/no-sync.db")) (db-exists (file-exists? dbname)) (init-proc (lambda (db) (sqlite3:with-transaction db @@ -525,18 +574,19 @@ reason TEXT DEFAULT 'none', CONSTRAINT no_sync_processes UNIQUE (host,pid));" )))))) (on-tmp (equal? (car (string-split dbpath "/")) "tmp")) (db (if on-tmp - (dbfile:cautious-open-database dbname init-proc 0 "WAL" force-init: #t) - (dbfile:cautious-open-database dbname init-proc 0 #f force-init: #t) + (dbfile:cautious-open-database dbname init-proc 1 "WAL" force-init: #t) ;; WAL MODE should use syncronous=1 + ;; (dbfile:cautious-open-database dbname init-proc 0 #f force-init: #t) + (dbfile:cautious-open-database dbname init-proc 0 "MEMORY" force-init: #t) ;; Journal mode = memory is fastest? ;; (sqlite3:open-database dbname) ))) - (if on-tmp ;; done in cautious-open-database - (begin - (sqlite3:execute db "PRAGMA synchronous = 0;") - (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)))) + ;; (if on-tmp ;; done in cautious-open-database + ;; (begin + ;; (sqlite3:execute db "PRAGMA synchronous = 0;") ;; why was this here when is is handled by cautious-open-database? + (sqlite3:set-busy-handler! 
db (sqlite3:make-busy-timeout 136000)) ;; )) db)) ;; mtest processes registry calls (define (dbfile:insert-or-update-process nsdb dat) @@ -580,18 +630,20 @@ host port pid starttime endtime status purpose dbname mtversion)) (define (dbfile:set-process-status nsdb host pid newstatus) (sqlite3:execute nsdb "UPDATE processes SET status=? WHERE host=? AND pid=?;" newstatus host pid)) +;; as sorted should be stable. can use to choose "winner" +;; (define (dbfile:get-process-options nsdb purpose dbname) (sqlite3:fold-row ;; host port pid starttime status mtversion (lambda (res . row) (cons row res)) '() nsdb - "SELECT host,port,pid,starttime,endtime,status,mtversion FROM processes WHERE purpose=? AND dbname LIKE ? AND status='alive';" + "SELECT host,port,pid,starttime,endtime,status,mtversion FROM processes WHERE purpose=? AND dbname LIKE ? AND status IN ('running','alive') ORDER BY starttime ASC,host,port;" purpose dbname)) (define (dbfile:get-process-info nsdb host pid) (let ((res (sqlite3:fold-row ;; host port pid starttime status mtversion @@ -602,17 +654,25 @@ "SELECT host,port,pid,starttime,endtime,status,purpose,dbname,mtversion FROM processes WHERE host=? AND pid=?;" host pid))) (if (null? res) #f (car res)))) + +(define (dbfile:row->procinf row) + (match row + ((host port pid starttime endtime status mtversion) + (make-procinf host: host port: port pid: pid starttime: starttime endtime: endtime status: status mtversion: mtversion)) + (else + (debug:print 0 *default-log-port* "ERROR: row "row" did not match host,port,pid,starttime,endtime,status,mtversion") + #f))) (define (dbfile:set-process-done nsdb host pid reason) - (sqlite3:execute nsdb "UPDATE processes SET status='ended',endtime=?,reason=? WHERE host=? AND pid=?;" (current-seconds) reason host pid) + (sqlite3:execute nsdb "UPDATE processes SET status='done',endtime=?,reason=? WHERE host=? 
AND pid=?;" (current-seconds) reason host pid) (dbfile:cleanup-old-entries nsdb)) (define (dbfile:cleanup-old-entries nsdb) - (sqlite3:execute nsdb "DELETE FROM process WHERE status='ended' AND endtimetimestamp, identifier ((timestamp . ident) (cons (equal? ident identifier) timestamp)) - (else (cons #f 'malformed-lock))) ;; lock malformed + (else + (debug:print 2 *default-log-port* "db:no-sync-get-lock-with-id: malformed lock") + (cons #f 'malformed-lock) + ) + ) ;; lock malformed (let ((curr-sec (current-seconds)) (lock-value (if identifier (conc (current-seconds)"+"identifier) (current-seconds)))) (sqlite3:execute db "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-value) @@ -1572,7 +1638,15 @@ ;; (db:hoh-set! stmt-cache db stmt newstmth) (hash-table-set! stmt-cache stmt newstmth) newstmth)))) (mutex-unlock! *get-cache-stmth-mutex*) result)) + +;; (define *mutex-stmth-call* (make-mutex)) +;; +;; (define (db:with-mutex-for-stmth proc) +;; (mutex-lock! *mutex-stmth-call*) +;; (let* ((res (proc))) +;; (mutex-unlock! *mutex-stmth-call*) +;; res)) ) Index: dbmod.scm ================================================================== --- dbmod.scm +++ dbmod.scm @@ -25,28 +25,48 @@ (declare (uses debugprint)) (module dbmod * -(import scheme - chicken - data-structures - extras - files +(import scheme) + +(cond-expand + (chicken-4 + (import chicken + data-structures + extras + files + + posix + + )) + (chicken-5 + (import chicken.base + chicken.condition + chicken.file + chicken.pathname + chicken.process + chicken.sort + chicken.string + chicken.time + + ) + (define file-read-access? file-readable?) 
+ (define file-copy copy-file) + )) +(import format (prefix sqlite3 sqlite3:) matchable - posix typed-records srfi-1 srfi-18 srfi-69 commonmod dbfile - debugprint - ) + debugprint) ;; NOTE: This returns only the name "1.db", "main.db", not the path ;; (define (dbmod:run-id->dbfname run-id) (conc (dbfile:run-id->dbnum run-id)".db")) @@ -58,15 +78,12 @@ (not (file-exists? dbdir))) (create-directory dbdir)) dbdir)) (define (dbmod:run-id->full-dbfname dbstruct run-id) - (conc (dbmod:get-dbdir dbstruct - - run-id - - )"/"(dbmod:run-id->dbfname run-id))) + (conc (dbmod:get-dbdir dbstruct) + "/"(dbmod:run-id->dbfname run-id))) ;;====================================================================== ;; Read-only cachedb cached direct from disk method ;;====================================================================== @@ -87,19 +104,19 @@ ;; The cachedb one-db file per server method goes in here ;;====================================================================== ;; NOTE: the r/w is now w/r, #t=db modified by query, #f=db NOT modified by query (define (dbmod:with-db dbstruct run-id w/r proc params) - (let* ((use-mutex (or (and w/r ;; use the mutex on queries that modify the db and for sync to disk - (> *api-process-request-count* 5)) ;; when writes are happening throttle more - (> *api-process-request-count* 50))) + (let* ((use-mutex w/r) ;; (or (and w/r ;; use the mutex on queries that modify the db and for sync to disk + ;; (> *api-process-request-count* 5)) ;; when writes are happening throttle more + ;; (> *api-process-request-count* 50))) (dbdat (dbmod:open-db dbstruct run-id (dbfile:db-init-proc))) (dbh (dbr:dbdat-dbh dbdat)) ;; this will be the cachedb handle (dbfile (dbr:dbdat-dbfile dbdat))) ;; if nfs mode do a sync if delta > 2 - (let* ((last-update (dbr:dbstruct-last-update dbstruct)) - (sync-proc (dbr:dbstruct-sync-proc dbstruct)) + #;(let* ((last-update (dbr:dbstruct-last-update dbstruct)) + ;; (sync-proc (dbr:dbstruct-sync-proc dbstruct)) (curr-secs 
(current-seconds))) (if (> (- curr-secs last-update) 5) (begin (sync-proc last-update) @@ -119,11 +136,11 @@ (loop (- count 1))) (begin (debug:print-info 0 *default-log-port* "dbmod:with-db, database is busy, giving up.") (exit 1)))) (exn () - (dbfile:print-err exn "ERROR: Unknown error with database for run-id "run-id", message: " + (dbfile:print-err exn "ERROR: dbmod:with-db: Unknown error with database for run-id "run-id", message: " ((condition-property-accessor 'exn 'message) exn)) (exit 2)))))) (if use-mutex (mutex-unlock! *db-with-db-mutex*)) res))) @@ -198,11 +215,11 @@ (let* ((dbstruct (or dbstruct-in (make-dbr:dbstruct areapath: areapath))) (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))) (dbpath (dbmod:get-dbdir dbstruct)) ;; directory where all the .db files are kept (dbfullname (conc dbpath"/"dbfname)) ;; (dbmod:run-id->full-dbfname dbstruct run-id)) (dbexists (file-exists? dbfullname)) - (tmpdir (dbfile:make-tmpdir-name areapath tmpadj)) + (tmpdir (common:make-tmpdir-name areapath tmpadj)) (tmpdb (let* ((fname (conc tmpdir"/"dbfname))) fname)) (cachedb (dbmod:open-cachedb-db init-proc ;; (if (eq? (dbfile:cache-method) 'cachedb) ;; #f @@ -224,51 +241,22 @@ (dbr:dbstruct-dbtmpname-set! dbstruct tmpdb) (dbr:dbstruct-dbfname-set! dbstruct dbfname) (dbr:dbstruct-sync-proc-set! dbstruct (lambda (last-update) (if *sync-in-progress* - (debug:print 3 *default-log-port* "WARNING: overlapping calls to sync to disk") - (let* ((syncer-logfile (conc areapath"/logs/"dbfname"-syncer.log")) - (sync-cmd (if (eq? syncdir 'todisk) - (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "tmpdb" -to "dbfullname" -period 5 -timeout 10 > /dev/null 2&>1)&") - (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "dbfullname" -to "tmpdb" -period 5 -timeout 10 > /dev/null 2&>1)&"))) - (synclock-file (conc dbfullname".lock")) - (syncer-running-file (conc dbfullname"-sync-running")) - (synclock-mod-time (if (file-exists? 
synclock-file) - (handle-exceptions - exn - #f - (file-modification-time synclock-file)) - #f)) - (thethread (lambda () - (thread-start! - (make-thread - (lambda () - (set! *sync-in-progress* #t) - (debug:print-info "Running "sync-cmd) - (if (file-exists? syncer-running-file) - (debug:print-info 0 *default-log-port* "Syncer still running, skipping syncer start.") - (system sync-cmd)) - (set! *sync-in-progress* #f))))))) - (if ((if (eq? syncdir 'todisk) < >) ;; use less than for todisk, greater than for from disk - (file-modification-time tmpdb) - (file-modification-time dbfullname)) - (debug:print 4 *default-log-port* "Skipping sync, "tmpdb" older than "dbfullname) - (if synclock-mod-time - (if (> (- (current-seconds) synclock-mod-time) 20) ;; something wrong with sync, remove file - (begin - (handle-exceptions - exn - #f - (begin - (debug:print 0 *default-log-port* "Sync lock file " synclock-file "is older than 20 seconds (" synclock-mod-time " seconds). Removing it") - (delete-file synclock-file) - ) - ) - (thethread)) - (debug:print 0 *default-log-port* "Skipping sync, lockfile "synclock-file" found.")) - (thethread))))))) + (debug:print 0 *default-log-port* "WARNING: overlapping calls to sync to disk") + (begin + ;; turn off writes - send busy or block? + ;; call db2db internally + ;; turn writes back on + ;; + (set! *api-halt-writes* #t) ;; do we need a mutex? + ;; (dbmod:db-to-db-sync src-db dest-db last-update (dbfile:db-init-proc) keys) + (debug:print-info 2 *default-log-port* "Internal sync running from "tmpdb" to "dbfullname) + (dbmod:db-to-db-sync tmpdb dbfullname last-update (dbfile:db-init-proc) keys) + (set! *api-halt-writes* #f) + )))) ;; (dbmod:sync-tables tables #f db cachedb) ;; (thread-sleep! 1) ;; let things settle before syncing in needed data (dbmod:sync-gasket tables #f cachedb db dbfullname 'fromdest keys) ;; ) ;; load into cachedb (dbr:dbstruct-last-update-set! 
dbstruct (+ (current-seconds) -10)) ;; should this be offset back in time by one second? @@ -474,47 +462,63 @@ (set! has-last #t))) dbh (conc "SELECT name FROM pragma_table_info('"tablename"') as tblInfo;")) has-last)) +(define (replace-question-marks-with-number str num) + (define (replace-helper str index result) + (if (>= index (string-length str)) + result + (let ((char (string-ref str index))) + (if (char=? char #\?) + (replace-helper str (+ index 1) (string-append result (number->string num))) + (replace-helper str (+ index 1) (string-append result (string char))))))) + + (replace-helper str 0 "")) + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; tbls is ( ("tablename" ( "field1" [#f|proc1] ) ( "field2" [#f|proc2] ) .... ) ) ;; ;; direction = fromdest, todisk ;; mode = 'full, 'incr ;; ;; Idea: youngest in dest is last_update time -;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + (define (dbmod:attach-sync tables dbh destdbfile direction #!key (mode 'full) (no-update '("keys")) ;; do ) - (let* ((num-changes 0) + (debug:print-info 2 *default-log-port* "dbmod:attach-sync") + (let* ((num-changes 0) (update-changed (lambda (num-changed table qryname) (if (> num-changed 0) (begin (debug:print-info 0 *default-log-port* "Changed "num-changed" rows for table "table", qry "qryname) (set! num-changes (+ num-changes num-changed))))))) - (debug:print 0 *default-log-port* "Doing sync "direction" "destdbfile) + (debug:print 2 *default-log-port* "Doing sync "direction" "destdbfile) (if (not (sqlite3:auto-committing? dbh)) (debug:print 0 *default-log-port* "Skipping sync due to transaction in flight.") (let* ((table-names (map car tables)) (dest-exists (file-exists? destdbfile))) (assert dest-exists "FATAL: sync called with non-existant file, "destdbfile) ;; attach the destdbfile ;; for each table ;; insert into dest. select * from src.
where last_update>last_update ;; done - (debug:print 0 *default-log-port* "Attaching "destdbfile" as auxdb") + (debug:print 2 *default-log-port* "Attaching "destdbfile" as auxdb") (handle-exceptions exn (begin (debug:print 0 "ATTACH failed, exiting. exn="(condition->list exn)) (exit 1)) (sqlite3:execute dbh (conc "ATTACH '"destdbfile"' AS auxdb;"))) (for-each (lambda (table) - (let* ((tbldat (alist-ref table tables equal?)) + (let* ((dummy (debug:print 2 *default-log-port* "Doing table " table)) + (tbldat (alist-ref table tables equal?)) (fields (map car tbldat)) (no-id-fields (filter (lambda (x)(not (equal? x "id"))) fields)) (fields-str (string-intersperse fields ",")) (no-id-fields-str (string-intersperse no-id-fields ",")) (dir (eq? direction 'todisk)) @@ -529,27 +533,38 @@ " SELECT * FROM "fromdb table";")) (stmt2 (conc "INSERT OR IGNORE INTO "todb table " SELECT * FROM "fromdb table" WHERE "fromdb table".id=?;")) (stmt8 (conc "UPDATE "todb table" SET ("no-id-fields-str") = (SELECT "no-id-fields-str" FROM "fromdb table" WHERE "todb table".id="fromdb table".id" (conc " AND "fromdb table".last_update > "todb table".last_update);") - ");")) - (stmt9 (conc "UPDATE "todb table" SET ("no-id-fields-str") = " + ");")) + (update-string (conc "UPDATE "todb table" SET ")) + (split-update + (let () + (for-each + (lambda (column) + (set! update-string (conc update-string column" = (SELECT "column" FROM "fromdb table" WHERE "fromdb table".id=?), ")) + ) + no-id-fields + ) + ;; drop the last ", " + (conc (substring update-string 0 (-(string-length update-string) 2)) " WHERE "todb table".id=? 
") + ) + ) + + + (stmt9 (conc "UPDATE "todb table" SET ("no-id-fields-str") = " "(SELECT "no-id-fields-str" FROM "fromdb table" WHERE "fromdb table".id=?)" " WHERE "todb table".id=?")) (newrec (conc "SELECT id FROM "fromdb table" WHERE id NOT IN (SELECT id FROM "todb table");")) - #;(changedrec (conc "SELECT id FROM "fromdb table" WHERE "fromdb table".last_update > "todb table".last_update AND " - fromdb table".id="todb table".id;")) ;; main = fromdb (changedrec (conc "SELECT "fromdb table".id FROM "fromdb table" join "todb table" on "fromdb table".id="todb table".id WHERE "fromdb table".last_update > "todb table".last_update;")) - ;; SELECT main.tests.id FROM main.tests join auxdb.tests on main.tests.id=auxdb.tests.id WHERE main.tests.last_update > auxdb.tests.last_update;" (start-ms (current-milliseconds)) (new-ids (sqlite3:fold-row (lambda (res id)(cons id res)) '() dbh newrec))) - ;; (debug:print 0 *default-log-port* "Got "(length aux-ids)" in aux-ids and "(length main-ids)" in main-ids") (update-changed (length new-ids) table "new records") (mutex-lock! 
*db-transaction-mutex*) (handle-exceptions exn - (debug:print 0 *default-log-port* "Transaction update of "table" failed.") + (debug:print 0 *default-log-port* "Transaction update of id fields in "table" failed.") (sqlite3:with-transaction dbh (lambda () (for-each (lambda (id) (sqlite3:execute dbh stmt2 id)) @@ -556,23 +571,40 @@ new-ids)))) (if (member "last_update" fields) (handle-exceptions exn - (debug:print 0 *default-log-port* "Transaction update of "table" failed.") + (debug:print 0 *default-log-port* "Transaction update of non id fields in "table" failed.") (sqlite3:with-transaction dbh (lambda () - (let* ((changed-ids (sqlite3:fold-row (lambda (res id)(cons id res)) '() dbh changedrec))) + (let* ((changed-ids (sqlite3:fold-row (lambda (res id)(cons id res)) '() dbh changedrec)) + (sql-query "") + ) (update-changed (length changed-ids) table "changed records") (for-each (lambda (id) - (sqlite3:execute dbh stmt9 id id)) - changed-ids)))))) - + (let* ((update-with-ids (replace-question-marks-with-number split-update id)) + ) + (debug:print 2 *default-log-port* "about to do sqlite3:execute " dbh " " update-with-ids ) + (handle-exceptions + exn + (debug:print 0 *default-log-port* "update from " fromdb table " to " todb table " failed: " ((condition-property-accessor 'exn 'message) exn)) + (sqlite3:execute dbh update-with-ids) + ) + (debug:print 2 *default-log-port* "after sqlite3:execute") + ) + ) + changed-ids + ) + ) + ) + ) + ) + ) (mutex-unlock! *db-transaction-mutex*) - (debug:print 0 *default-log-port* "Synced table "table + (debug:print 2 *default-log-port* "Synced table "table " in "(- (current-milliseconds) start-ms)"ms") )) table-names) (sqlite3:execute dbh "DETACH auxdb;"))) @@ -627,11 +659,11 @@ (debug:print 0 *default-log-port* "stmt3="stmt3) (if (sqlite3:auto-committing? 
dbh1) (begin (handle-exceptions exn - (debug:print 0 *default-log-port* "Transaction update of "table" failed.") + (debug:print 0 *default-log-port* "Transaction update of "table" failed. "(condition->list exn)) (sqlite3:with-transaction dbh1 (lambda () (sqlite3:execute dbh1 stmt1) ;; get all new rows @@ -856,6 +888,94 @@ (res (dbmod:sync-gasket tables last-update sdb ddb dest-db 'todisk keys))) (sqlite3:finalize! sdb) (sqlite3:finalize! ddb) res))) #f)) + +;; ====================================================================== +;; dbstats +;;====================================================================== + +;; (define *dbstruct-dbs* #f) ;; used to cache the dbstruct in db:setup. Goal is to remove this. +;; db stats +(define *db-stats* (make-hash-table)) ;; hash of vectors < count duration-total > +(define *db-stats-mutex* (make-mutex)) + +(define (dbmod:print-db-stats) + (let ((fmtstr "~40a~8-d~20-d~20,2-f")) ;; "~20,2-f" + (debug:print 0 *default-log-port* "DB Stats\n========") + (debug:print 0 *default-log-port* (format #f "~40a~8a~20a~10a" "Cmd" "Count" "TotTime" "Avg")) + (for-each (lambda (cmd) + (let* ((dat (hash-table-ref *db-stats* cmd)) + (count (dbstat-cnt dat)) + (tottime (dbstat-tottime dat))) + (debug:print 0 *default-log-port* + (format #f fmtstr cmd count tottime + (/ tottime count))))) + (sort (hash-table-keys *db-stats*) + (lambda (a b) + (> (dbstat-tottime (hash-table-ref *db-stats* a)) + (dbstat-tottime (hash-table-ref *db-stats* b)))))))) + +(defstruct dbstat + (cnt 0) + (tottime 0)) + +(define (db:add-stats cmd run-id params delta) + (let* ((modified-cmd (if (eq? cmd 'general-call) + (string->symbol (conc "general-call-" (car params))) + cmd)) + (rec (hash-table-ref/default *db-stats* modified-cmd #f))) + (if (not rec) + (let ((new-rec (make-dbstat))) + (hash-table-set! *db-stats* modified-cmd new-rec) + (set! rec new-rec))) + (dbstat-cnt-set! rec (+ (dbstat-cnt rec) 1)) + (dbstat-tottime-set! 
rec (+ (dbstat-tottime rec) delta)))) + + + ) + + +;; ATTIC + + #;(let* ((syncer-logfile (conc areapath"/logs/"dbfname"-syncer.log")) + (sync-cmd (if (eq? syncdir 'todisk) + (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "tmpdb" -to "dbfullname" -period 5 -timeout 10 > /dev/null 2&>1)&") + (conc "(NBFAKE_LOG="syncer-logfile" nbfake megatest -db2db -from "dbfullname" -to "tmpdb" -period 5 -timeout 10 > /dev/null 2&>1)&"))) + (synclock-file (conc dbfullname".lock")) + (syncer-running-file (conc dbfullname"-sync-running")) + (synclock-mod-time (if (file-exists? synclock-file) + (handle-exceptions + exn + #f + (file-modification-time synclock-file)) + #f)) + (thethread (lambda () + (thread-start! + (make-thread + (lambda () + (set! *sync-in-progress* #t) + (debug:print-info "Running "sync-cmd) + (if (file-exists? syncer-running-file) + (debug:print-info 0 *default-log-port* "Syncer still running, skipping syncer start.") + (system sync-cmd)) + (set! *sync-in-progress* #f))))))) + (if ((if (eq? syncdir 'todisk) < >) ;; use less than for todisk, greater than for from disk + (file-modification-time tmpdb) + (file-modification-time dbfullname)) + (debug:print 4 *default-log-port* "Skipping sync, "tmpdb" older than "dbfullname) + (if synclock-mod-time + (if (> (- (current-seconds) synclock-mod-time) 20) ;; something wrong with sync, remove file + (begin + (handle-exceptions + exn + #f + (begin + (debug:print 0 *default-log-port* "Sync lock file " synclock-file "is older than 20 seconds (" synclock-mod-time " seconds). 
Removing it") + (delete-file synclock-file) + ) + ) + (thethread)) + (debug:print 0 *default-log-port* "Skipping sync, lockfile "synclock-file" found.")) + (thethread)))) Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -59,10 +59,11 @@ update-mutex updaters updating uidat ;; needs to move to tabdat at some time hide-not-hide-tabs + target ) (define (dboard:commondat-make) (make-dboard:commondat curr-tab-num: 0 @@ -70,19 +71,41 @@ please-update: #t update-mutex: (make-mutex) updaters: (make-hash-table) updating: #f hide-not-hide-tabs: #f + target: "" )) ;; RADT => Matrix defstruct addition (defstruct dboard:graph-dat ((id #f) : string) ((color #f) : vector) ((flag #t) : boolean) ((cell #f) : number) ) + +;; RA => returns the tabdat stored at hashkey passed in commondat-tabdats table (e.g. 0 gives summary) +;; +(define (dboard:common-get-tabdat commondat #!key (tab-num #f)) + (let* ((tnum (or tab-num + (dboard:commondat-curr-tab-num commondat) + 0)) ;; tab-num value is curr-tab-num value in passed commondat + (ht (dboard:commondat-tabdats commondat)) + (res (hash-table-ref/default ht tnum #f))) + (or res + (let ((new-tabdat (dboard:tabdat-make-data))) + (hash-table-set! ht tnum new-tabdat) + new-tabdat)))) + +;; RA => sets the tabdat passed to the hashkey at commondat:tabdats hash table +;; +(define (dboard:common-set-tabdat! commondat tabnum tabdat) + (hash-table-set! + (dboard:commondat-tabdats commondat) + tabnum + tabdat)) ;; data for runs, tests etc. was used in run summary? ;; (defstruct dboard:runsdat ;; new system @@ -1138,14 +1161,28 @@ #:readonly "YES" #:font "Courier New, -12" ))) (dboard:tabdat-command-tb-set! 
data tb) tb) + (iup:button "Execute" #:size "50x" #:action (lambda (obj) - ;; (let ((cmd (conc ;; "xterm -geometry 180x20 -e \"" - (common:run-a-command (iup:attribute (dboard:tabdat-command-tb data) "VALUE"))))))) + (let ((cmd (iup:attribute (dboard:tabdat-command-tb data) "VALUE"))) + (if (substring-index "no-runname-specified" cmd) + (debug:print 0 *default-log-port* "ERROR: no runname specified") + (begin + (if (substring-index "no-target-selected" cmd) + (debug:print 0 *default-log-port* "ERROR: no target selected") + (begin + (if (not (substring-index "-run" cmd)) + (debug:print 0 *default-log-port* "ERROR: No target selected") + (common:run-a-command (iup:attribute (dboard:tabdat-command-tb data) "VALUE")) + ) + ) + ) + ) + ))))))) ;; ";echo Press any key to continue;bash -c 'read -n 1 -s'\" &"))) ;; (system cmd))))))) (define (dcommon:command-action-selector commondat tabdat #!key (tab-num #f)) (iup:frame @@ -1170,11 +1207,10 @@ (let* ((default-run-name (seconds->work-week/day (current-seconds))) (tb (iup:textbox #:expand "HORIZONTAL" #:action (lambda (obj val txt) (debug:catch-and-dump (lambda () - ;; (print "obj: " obj " val: " val " unk: " unk) (dboard:tabdat-run-name-set! tabdat txt) ;; (iup:attribute obj "VALUE")) (dashboard:update-run-command tabdat)) "command-runname-selector tb action")) #:value (or default-run-name (dboard:tabdat-run-name tabdat)))) (lb (iup:listbox #:expand "HORIZONTAL" @@ -1196,11 +1232,10 @@ (runs-dat (vector-ref runs-for-targ 1)) (run-names (cons default-run-name (map (lambda (x) (db:get-value-by-header x runs-header "runname")) runs-dat)))) - ;; (print "DEBUGINFO: run-names=" run-names) ;; (iup:attribute-set! lb "REMOVEITEM" "ALL") (iuplistbox-fill-list lb run-names selected-item: default-run-name)))))) ;; (dboard:tabdat-updater-for-runs-set! 
tabdat refresh-runs-list) (dboard:commondat-add-updater commondat refresh-runs-list tab-num: tab-num) ;; (refresh-runs-list) Index: docs/manual/Makefile ================================================================== --- docs/manual/Makefile +++ docs/manual/Makefile @@ -37,10 +37,13 @@ # dos2unix megatest_manual.html megatest_manual.pdf : megatest_manual.txt *.txt *png *.dot a2x -a toc -f pdf megatest_manual.txt +%.pdf : %.dot + dot -Tpdf $*.dot -o$*.pdf + server.ps : server.dot dot -Tps server.dot > server.ps client.ps : client.dot dot -Tps client.dot > client.ps Index: docs/manual/megatest_manual.pdf ================================================================== --- docs/manual/megatest_manual.pdf +++ docs/manual/megatest_manual.pdf cannot compute difference between binary files Index: docs/manual/server.dot ================================================================== --- docs/manual/server.dot +++ docs/manual/server.dot @@ -12,67 +12,68 @@ // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with Megatest. If not, see . + digraph G { - subgraph cluster_1 { - node [style=filled,shape=box]; - - check_available_queue -> remove_entries_over_10s_old; - remove_entries_over_10s_old -> set_available [label="num_avail < 3"]; - remove_entries_over_10s_old -> exit [label="num_avail > 2"]; - - set_available -> delay_2s; - delay_2s -> check_place_in_queue; - - check_place_in_queue -> "http:transport-launch" [label="at head"]; - check_place_in_queue -> exit [label="not at head"]; - - "client:login" -> "server:shutdown" [label="login failed"]; - "server:shutdown" -> exit; - - subgraph cluster_2 { - "http:transport-launch" -> "http:transport-run"; - "http:transport-launch" -> "http:transport-keep-running"; - - "http:transport-keep-running" -> "tests running?"; - "tests running?" -> "client:login" [label=yes]; - "tests running?" 
-> "server:shutdown" [label=no]; - "client:login" -> delay_5s [label="login ok"]; - delay_5s -> "http:transport-keep-running"; - } - - // start_server -> "server_running?"; - // "server_running?" -> set_available [label="no"]; - // "server_running?" -> delay_2s [label="yes"]; - // delay_2s -> "still_running?"; - // "still_running?" -> ping_server [label=yes]; - // "still_running?" -> set_available [label=no]; - // ping_server -> exit [label=alive]; - // ping_server -> remove_server_record [label=dead]; - // remove_server_record -> set_available; - // set_available -> avail_delay [label="delay 3s"]; - // avail_delay -> "first_in_queue?"; - // - // "first_in_queue?" -> set_running [label=yes]; - // set_running -> get_next_port -> handle_requests; - // "first_in_queue?" -> "dead_entry_in_queue?" [label=no]; - // "dead_entry_in_queue?" -> "server_running?" [label=no]; - // "dead_entry_in_queue?" -> "remove_dead_entries" [label=yes]; - // remove_dead_entries -> "server_running?"; - // - // handle_requests -> start_shutdown [label="no traffic\nno running tests"]; - // handle_requests -> shutdown_request; - // start_shutdown -> shutdown_delay; - // shutdown_request -> shutdown_delay; - // shutdown_delay -> exit; - - label = "server:launch"; - color=brown; - } - -// client_start_server -> start_server; -// handle_requests -> read_write; -// read_write -> handle_requests; -} + label = "Server Start Sequences"; + color=brown; + rankdir="TB"; + + subgraph cluster_1 { + label="Find Prime Main Server"; + + node [style=filled,shape=box]; + + START; + HaveServ [label="Look at .servinfo\nfiles for prime main"]; + AskPrime [label="Ask Prime for main"]; + PingPrime [label="Ping Prime"]; + AskPrime [label="Ask .servinfo prime for server"]; + StartServ [label="Launch Server Process for main.db"]; + + START -> HaveServ; + HaveServ -> PingPrime; + PingPrime -> AskPrime [label="Got response"]; + PingPrime -> StartServ [label="No reponse"]; + HaveServ -> StartServ [label="No files"]; + 
StartServ -> "Delay 2s" -> START; + AskPrime -> DONE; + } + + subgraph cluster_2 { + label="Starting non-prime server" + node [style=filled,shape=box]; + StartTCPServer [label="Start tcp server"]; + FindPrimeMain [label="Find Prime Main Server"]; + RegisterProcessViaPrime [label="Register process via prime server"]; + + StartTCPServer -> FindPrimeMain -> START; + DONE -> RegisterProcessViaPrime -> READY; + } + + subgraph cluster_3 { + label="Start Prime Main" + node [style=filled,shape=box]; + StartTCPServer_prime [label="Start tcp server"]; + GetServInfoFiles [label="Get servinfo files"]; + CreateServInfoFile [label="Create servinfo file"]; + RegisterProcess [label="Register process in no-sync (direct access)"]; + ValidateServInfoFiles [label="Validate servinfo files with ping\nremove any files which do not respond to ping"]; + + CheckHost [label="Verify that current host matches\nexisting servinfo files host"] + StartTCPServer_prime -> GetServInfoFiles; + GetServInfoFiles -> CreateServInfoFile [label="No servinfo\nfiles"]; + GetServInfoFiles -> ValidateServInfoFiles; + ValidateServInfoFiles -> CreateServInfoFile [label="No valid files"]; + CreateServInfoFile -> GetServInfoFiles [label="servinfo file created"]; + KeepRunning [label="READY"]; + + ValidateServInfoFiles -> CheckHost; + CheckHost -> RegisterProcess [label="Have valid\nservinfo files and same host"]; + RegisterProcess -> KeepRunning; + CheckHost -> EXIT [label="Not same host"]; + } +} + Index: docs/manual/server.png ================================================================== --- docs/manual/server.png +++ docs/manual/server.png cannot compute difference between binary files Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -236,11 +236,10 @@ (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) - ;; 
(common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load @@ -256,15 +255,16 @@ (do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-state-status-by-id run-id test-id)) (state (car test-info));; (db:test-get-state test-info)) (status (cdr test-info));; (db:test-get-status test-info)) + (killreq (equal? state "KILLREQ")) (kill-reason "no kill reason specified") (kill-job? #f)) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond - ((test-get-kill-request run-id test-id) + (killreq (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? 
#t)) @@ -276,16 +276,11 @@ (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (if (common:low-noise-print 600 "run zombie") ;; every five minutes is plenty (launch:handle-zombie-tests run-id)) (when do-sync - ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) - ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) - ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) - ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))) - ) + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) @@ -331,20 +326,64 @@ ))) (mutex-unlock! m) ;; no point in sticking around. Exit now. But run end of run before exiting? (launch:end-of-run-check run-id) (exit))) - (if (hash-table-ref/default misc-flags 'keep-going #f) + (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (begin - (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses - (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta - (loop (calc-minutes) - (or new-cpu-load cpu-load) - (or new-disk-free disk-free) - (if do-sync (current-seconds) last-sync))))))) - (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional - + (thread-sleep! 
6) ;; was 3 + (loop (calc-minutes) + (or new-cpu-load cpu-load) + (or new-disk-free disk-free) + (if do-sync (current-seconds) last-sync)))))) + (tests:update-central-meta-info run-id test-id (commonmod:get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional + + +;; read testconfig and create .logpro and script files +;; - use #f for tconfigreg to re-read the testconfigs from disk +;; +(define (launch:extract-scripts-logpro test-dir test-name item-path tconfigreg-in) + (let* ((tconfigreg (or tconfigreg-in + (tests:get-all))) + (tconfig-fname (conc test-dir "/.testconfig")) + (tconfig-tmpfile (conc tconfig-fname ".tmp")) + (tconfig (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs))) + (scripts (configf:get-section tconfig "scripts")) + (logpros (configf:get-section tconfig "logpro"))) + ;; create .testconfig file + (configf:write-alist tconfig tconfig-tmpfile) + (file-move tconfig-tmpfile tconfig-fname #t) + (delete-file* ".final-status") + + ;; extract scripts from testconfig and write them to files in test run dir + (for-each + (lambda (scriptdat) + (match scriptdat + ((name content) + (debug:print-info 2 *default-log-port* "Creating script "(current-directory)"/"name) + (with-output-to-file name + (lambda () + (print content))) + (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu))) + (else + (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. 
\"" scriptdat "\"")))) + scripts) + + ;; extract logpro from testconfig and write them to files in test run dir + (for-each + (lambda (logprodat) + (match logprodat + ((name content) + (debug:print-info 2 *default-log-port* "Creating logpro file "(current-directory)"/"name".logpro") + (with-output-to-file (conc name".logpro") + (lambda () + (print content) + ;; (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu)) + ))) + (else + (debug:print-info 0 "Invalid logpro definiton found in [logpro] section of testconfig. \"" logprodat "\"")))) + logpros))) (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) (setenv "MT_CMDINFO" encoded-cmd) @@ -616,12 +655,11 @@ (if mt-bindir-path (setenv "PATH" (conc tmppath":"mt-bindir-path)))) ;;(bb-check-path msg: "launch:execute post block 4") ;; (change-directory top-path) ;; Can setup as client for server mode now ;; (client:setup) - - + ;; environment overrides are done *before* the remaining critical envars. (alist->env-vars env-ovrd) ;;(bb-check-path msg: "launch:execute post block 41") (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;;(bb-check-path msg: "launch:execute post block 42") @@ -647,37 +685,39 @@ (set! fullrunscript "xterm") (if (and fullrunscript (common:file-exists? fullrunscript) (not (file-execute-access? 
fullrunscript))) (system (conc "chmod ug+x " fullrunscript)))) - - ;; We are about to actually kick off the test - ;; so this is a good place to remove the records for - ;; any previous runs - ;; (db:test-remove-steps db run-id testname itemdat) - ;; now is also a good time to write the .testconfig file - (let* ((tconfig-fname (conc work-area "/.testconfig")) - (tconfig-tmpfile (conc tconfig-fname ".tmp")) - (tconfig (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs))) - (scripts (configf:get-section tconfig "scripts"))) - ;; create .testconfig file - (configf:write-alist tconfig tconfig-tmpfile) - (file-move tconfig-tmpfile tconfig-fname #t) - (delete-file* ".final-status") - - ;; extract scripts from testconfig and write them to files in test run dir - (for-each - (lambda (scriptdat) - (match scriptdat - ((name content) - (with-output-to-file name - (lambda () - (print content) - (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu))))) - (else - (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. 
\"" scriptdat "\"")))) - scripts)) + (launch:extract-scripts-logpro work-area test-name item-path tconfigreg) + +;;;;; ;; We are about to actually kick off the test +;;;;; ;; so this is a good place to remove the records for +;;;;; ;; any previous runs +;;;;; ;; (db:test-remove-steps db run-id testname itemdat) +;;;;; ;; now is also a good time to write the .testconfig file +;;;;; (let* ((tconfig-fname (conc work-area "/.testconfig")) +;;;;; (tconfig-tmpfile (conc tconfig-fname ".tmp")) +;;;;; (tconfig (tests:get-testconfig test-name item-path tconfigreg #t force-create: #t)) ;; 'return-procs))) +;;;;; (scripts (configf:get-section tconfig "scripts")) +;;;;; (precmd (configf:lookup tconfig ) +;;;;; ;; create .testconfig file +;;;;; (configf:write-alist tconfig tconfig-tmpfile) +;;;;; (file-move tconfig-tmpfile tconfig-fname #t) +;;;;; (delete-file* ".final-status") +;;;;; +;;;;; ;; extract scripts from testconfig and write them to files in test run dir +;;;;; (for-each +;;;;; (lambda (scriptdat) +;;;;; (match scriptdat +;;;;; ((name content) +;;;;; (with-output-to-file name +;;;;; (lambda () +;;;;; (print content) +;;;;; (change-file-mode name (bitwise-ior perm/irwxg perm/irwxu))))) +;;;;; (else +;;;;; (debug:print-info 0 "Invalid script definiton found in [scripts] section of testconfig. \"" scriptdat "\"")))) +;;;;; scripts)) ;; (let* ((m (make-mutex)) (kill-job? #f) (exit-info (make-launch:einf pid: #t exit-status: #t exit-code: #t rollup-status: 0)) ;; pid exit-status exit-code (i.e. 
process was successfully run) rollup-status @@ -694,11 +734,15 @@ (th2 (make-thread runit "run job")) (tconfig (tests:get-testconfig test-name item-path tconfigreg #t)) (propagate-exit-code (configf:lookup *configdat* "setup" "propagate-exit-code")) (propagate-status-list '("FAIL" "KILLED" "ABORT" "DEAD" "CHECK" "SKIP" "WAIVED")) (test-status "not set") - ) + (precmd (configf:lookup tconfig "setup" "precmd")) + (postcmd (configf:lookup tconfig "setup" "postcmd"))) + ;; first, if set, run the precmd + (if precmd ;; (file-exists? precmd)(file-execute-access? precmd)) + (system precmd)) ;; up to test author to put nbfake if desired. (set! job-thread th2) (thread-start! th1) (thread-start! th2) (thread-join! th2) (debug:print-info 0 *default-log-port* "Megatest execute of test " test-name ", item path " item-path " complete. Notifying the db ...") @@ -762,10 +806,13 @@ (set! test-status (db:test-get-status (rmt:get-testinfo-state-status run-id test-id))) ;; If the propagate-exit-code option has been set in the megatest config, and the test status matches the list, set the exit code to 1. + (if postcmd + (system postcmd)) + (if (and propagate-exit-code (string=? propagate-exit-code "yes") (member test-status propagate-status-list)) (begin (debug:print 1 *default-log-port* "Setting exit status to 1 because of test status of " test-status) (set! *globalexitstatus* 1) ) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -18,6 +18,6 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. 
;; (declare (unit megatest-version)) -(define megatest-version 1.8017) +(define megatest-version 1.8028) Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -55,10 +55,12 @@ (declare (uses dbmod.import)) (declare (uses portlogger)) (declare (uses portlogger.import)) (declare (uses tcp-transportmod)) (declare (uses tcp-transportmod.import)) +(declare (uses apimod)) +(declare (uses apimod.import)) (declare (uses rmtmod)) (declare (uses rmtmod.import)) ;; (declare (uses debugprint)) ;; (declare (uses debugprint.import)) @@ -72,10 +74,11 @@ commonmod dbfile portlogger tcp-transportmod rmtmod + apimod ) (define *db* #f) ;; this is only for the repl, do not use in general!!!! (include "common_records.scm") @@ -257,11 +260,13 @@ -debug N|N,M,O... : enable debug 0-N or N and M and O ... -debug-noprop N|M,M,O...: enable debug but do not propagate to subprocesses via MT_DEBUG -config fname : override the megatest.config file with fname -append-config fname : append fname to the megatest.config file -import-sexpr fname : import a sexpr file (use -list-runs % -dumpmode sexpr to create) - + -remove-dbs all : remove Megatest DBs before importing sexpr. (Use only with -import-sexpr) + -regen-testfiles : regenerate scripts and logpro files from testconfig, run in test context + Utilities -env2file fname : write the environment to fname.csh and fname.sh -envcap a : save current variables labeled as context 'a' in file envdat.db -envdelta a-b : output enviroment delta from context a to context b to -o fname set the output mode with -dumpmode csh, bash or ini @@ -378,10 +383,11 @@ "-envcap" "-envdelta" "-setvars" "-set-state-status" "-import-sexpr" + "-remove-dbs" ;; to be used only with -import-sexpr to remove megatest dbs first. 
"-period" ;; sync period in seconds "-timeout" ;; exit sync if timeout in seconds exceeded since last change ;; move runs stuff here "-remove-keep" @@ -466,10 +472,11 @@ "-local" ;; run some commands using local db access "-generate-html" "-generate-html-structure" "-list-run-time" "-list-test-time" + "-regen-testfiles" ;; misc queries "-list-disks" "-list-targets" "-list-db-targets" @@ -968,12 +975,14 @@ (tl (launch:setup)) (keys (keys:config-get-fields *configdat*))) (case (rmt:transport-mode) ((tcp) (let* ((timeout (server:expiration-timeout))) - (debug:print 0 *default-log-port* "INFO: Running using tcp method with server timeout of "timeout) + (debug:print 0 *default-log-port* "INFO: megatest -server starting on " (get-host-name) " for " dbfname " using tcp method with timeout of "timeout) (tt-server-timeout-param timeout) + (api:queue-processor) + (thread-start! (make-thread api:print-db-stats "print-db-stats")) (if dbfname (tt:start-server tl #f dbfname api:tcp-dispatch-request-make-handler keys) (begin (debug:print 0 *default-log-port* "ERROR: transport mode is tcp - -db is required.") (exit 1))))) @@ -1044,16 +1053,16 @@ (if (args:get-arg "-kill-servers") - + (let* ((tl (launch:setup)) ;; need this to initialize *toppath* (servdir (tt:get-servinfo-dir *toppath*)) (servfiles (glob (conc servdir "/*:*.db"))) (fmtstr "~10a~22a~10a~25a~25a~8a\n") - (dbfiles (append (glob (conc *toppath* "/.mtdb/main.db")) (glob (conc *toppath* "/.mtdb/?.db"))(glob (conc *toppath* "/.mtdb/??.db")))) + (dbfiles (if (file-exists? 
(conc *toppath* "/.mtdb/main.db")) (append (glob (conc *toppath* "/.mtdb/main.db")) (glob (conc *toppath* "/.mtdb/?.db"))(glob (conc *toppath* "/.mtdb/??.db"))) '())) (ttdat (make-tt areapath: *toppath*)) ) (format #t fmtstr "DB" "host:port" "PID" "age" "last mod" "state") (for-each (lambda (dbfile) @@ -1091,10 +1100,14 @@ sfiles ) ) ) dbfiles + ) + ;; remove this db, because otherwise metadata contains records for old servers, and this causes a problem with db:no-sync-get-lock-with-id. + (if (file-exists? (conc *toppath* "/.mtdb/no-sync.db")) + (delete-file (conc *toppath* "/.mtdb/no-sync.db")) ) (set! *didsomething* #t) (exit) ) ) @@ -2115,10 +2128,25 @@ (paths (tests:test-get-paths-matching keys target (args:get-arg "-test-files")))) (for-each (lambda (path) (print path)) paths)))))) +;;====================================================================== +;; Utils for test areas +;;====================================================================== + +(if (args:get-arg "-regen-testfiles") + (if (getenv "MT_TEST_RUN_DIR") + (begin + (launch:setup) + (change-directory (getenv "MT_TEST_RUN_DIR")) + (let* ((testname (getenv "MT_TEST_NAME")) + (itempath (getenv "MT_ITEMPATH"))) + (launch:extract-scripts-logpro (getenv "MT_TEST_RUN_DIR") testname itempath #f)) + (set! *didsomething* #t)) + (debug:print 0 *default-log-port* "ERROR: Must run -regen-testfiles in a test environment (i.e. test xterm from dashboard)"))) + ;;====================================================================== ;; Archive tests ;;====================================================================== ;; Archive tests matching target, runname, and testpatt (if (equal? (args:get-arg "-archive") "replicate-db") @@ -2132,13 +2160,13 @@ (exit 1))) (if (common:file-exists? (conc *toppath* "/megatest.db")) (begin (debug:print-info 1 *default-log-port* "File " (conc *toppath* "/megatest.db") " already exists. 
Please remove it before trying to replicate db") (exit 1))) - (if (and (common:get-db-tmp-area) (> (length (directory (common:get-db-tmp-area) #f)) 0)) + (if (and (common:make-tmpdir-name *toppath* "") (> (length (directory (common:make-tmpdir-name *toppath* "") #f)) 0)) (begin - (debug:print-info 1 *default-log-port* (common:get-db-tmp-area) " not empty. Please remove it before trying to replicate db") + (debug:print-info 1 *default-log-port* (common:make-tmpdir-name *toppath* "") " not empty. Please remove it before trying to replicate db") (exit 1))) ;; check if timestamp (let* ((source (args:get-arg "-source")) (src (if (not (equal? (substring source 0 1) "/")) (conc (current-directory) "/" source) @@ -2429,11 +2457,11 @@ (begin (debug:print 0 *default-log-port* "Failed to setup, exiting") (exit 1))) ;; keep this one local ;; (open-run-close patch-db #f) - (let ((dbstructs (db:setup #f))) + (let ((dbstructs (db:setup))) (common:cleanup-db dbstructs full: #t)) (set! *didsomething* #t))) (if (args:get-arg "-cleanup-db") (begin @@ -2445,11 +2473,11 @@ ;; (if (not (server:choose-server *toppath* 'home?)) ;; (begin ;; (debug:print 0 *default-log-port* "Servers are not running on this host or no servers alive. Cannot run cleanup-db") ;; (exit 1))) - (let ((dbstructs (db:setup #f))) + (let ((dbstructs (db:setup))) (common:cleanup-db dbstructs)) (set! *didsomething* #t))) (if (args:get-arg "-mark-incompletes") (begin @@ -2506,11 +2534,11 @@ (let* ((toppath (launch:setup)) (dbstructs (if (and toppath ;; NOTE: server:choose-server is starting a server ;; either add equivalent for tcp mode or ???? 
#;(server:choose-server toppath 'home?)) - (db:setup #t) + (db:setup) #f))) ;; make-dbr:dbstruct path: toppath local: (args:get-arg "-local")) #f))) (if *toppath* (cond ((getenv "MT_RUNSCRIPT") ;; How to run megatest scripts @@ -2597,27 +2625,44 @@ (if (args:get-arg "-import-megatest.db") (begin (launch:setup) (db:multi-db-sync - (db:setup #f) + (db:setup) 'killservers 'dejunk 'adj-testids 'old2new ) (set! *didsomething* #t))) (if (args:get-arg "-import-sexpr") - (begin - (launch:setup) - (rmt:import-sexpr (args:get-arg "-import-sexpr")) - (set! *didsomething* #t))) + (let*( + (toppath (launch:setup)) + (tmppath (common:make-tmpdir-name toppath ""))) + (if (file-exists? (conc toppath "/.mtdb")) + (if (args:get-arg "-remove-dbs") + (let* ((dbfiles (conc toppath "/.mtdb/* " tmppath "/*"))) + (debug:print 0 *default-log-port* "Removing db files: " dbfiles) + (system (conc "rm -rvf " dbfiles)) + ) + (begin + (debug:print 0 *default-log-port* "ERROR: Cannot import sexpr with an existing DB present.") + (debug:print 0 *default-log-port* "Add '-remove-dbs all' to remove the current Megatest DBs.") + (set! *didsomething* #t) + (exit) + ) + ) + (debug:print 0 *default-log-port* "Did not find " (conc toppath "/.mtdb")) + ) + (db:setup) + (rmt:import-sexpr (args:get-arg "-import-sexpr")) + (set! *didsomething* #t))) (if (args:get-arg "-sync-to-megatest.db") (let* ((duh (launch:setup)) - (dbstruct (db:setup #t)) + (dbstruct (db:setup)) (tmpdbpth (dbr:dbstruct-tmppath dbstruct)) (lockfile (conc tmpdbpth ".lock")) (locked (common:simple-file-lock lockfile)) (res (if locked (db:multi-db-sync @@ -2633,10 +2678,11 @@ (if (args:get-arg "-sync-to") (let ((toppath (launch:setup))) (tasks:sync-to-postgres *configdat* (args:get-arg "-sync-to")) (set! 
*didsomething* #t))) + ;; use with -from and -to ;; (if (args:get-arg "-db2db") (let* ((duh (launch:setup)) @@ -2650,18 +2696,19 @@ (sync-timeout (if sync-timeout-in (string->number sync-timeout-in) #f)) (lockfile (conc dest-db".sync-lock")) (keys (db:get-keys #f)) (thesync (lambda (last-update) (debug:print-info 0 *default-log-port* "Attempting to sync data from "src-db" to "dest-db"...") + (debug:print-info 0 *default-log-port* "PID = " (current-process-id)) (if (not (file-exists? dest-db)) (begin (debug:print 0 *default-log-port* "Using copy to create "dest-db" from "src-db) (file-copy src-db dest-db) 1) (let ((res (dbmod:db-to-db-sync src-db dest-db last-update (dbfile:db-init-proc) keys))) (if res - (debug:print-info 0 *default-log-port* "Synced " res " records from "src-db" to "dest-db) + (debug:print-info 2 *default-log-port* "Synced " res " records from "src-db" to "dest-db) (debug:print-info 0 *default-log-port* "No sync due to permissions or other issue.")) res)))) (start-time (current-seconds)) (synclock-mod-time (if (file-exists? lockfile) (handle-exceptions @@ -2673,11 +2720,17 @@ ) (if (and src-db dest-db) (if (file-exists? src-db) (if (and (file-exists? lockfile) (< age 20)) (debug:print 0 *default-log-port* "Lock "lockfile" exists, skipping sync...") - (begin + (begin + (if (file-exists? lockfile) + (begin + (debug:print 0 *default-log-port* "Deleting old lock file " lockfile) + (delete-file lockfile) + ) + ) (dbfile:with-simple-file-lock lockfile (lambda () (let loop ((last-changed (current-seconds)) (last-update 0)) @@ -2694,11 +2747,11 @@ (> sync-timeout (- now-time last-changed))) (begin (if sync-period (thread-sleep! 
sync-period)) (loop (if (> changes 0) now-time last-changed) now-time)))))))) (debug:print 0 *default-log-port* "Releasing lock file " lockfile) - ) + ) ) (debug:print 0 *default-log-port* "No sync due to unreadble or non-existant source file"src-db)) (debug:print 0 *default-log-port* "Usage for -db2db; -to and -from must be specified")) (set! *didsomething* #t))) Index: mt.scm ================================================================== --- mt.scm +++ mt.scm @@ -15,11 +15,13 @@ ;; You should have received a copy of the GNU General Public License ;; along with Megatest. If not, see . ;; -(use sqlite3 srfi-1 posix regex regex-case srfi-69 dot-locking (srfi 18) posix-extras directory-utils call-with-environment-variables) +(use sqlite3 srfi-1 posix regex regex-case srfi-69 dot-locking (srfi 18) posix-extras directory-utils + call-with-environment-variables) + (import (prefix sqlite3 sqlite3:)) (declare (unit mt)) (declare (uses debugprint)) (declare (uses db)) Index: portlogger.scm ================================================================== --- portlogger.scm +++ portlogger.scm @@ -23,15 +23,47 @@ (declare (uses dbmod)) (module portlogger * -(import scheme chicken data-structures) -(import srfi-1 posix srfi-69 hostinfo dot-locking z3 - (srfi 18) extras s11n) +(import scheme) + +(cond-expand + (chicken-4 + (import chicken data-structures) + (import posix + ;; hostinfo + ;; dot-locking + extras + ) + + (import (prefix sqlite3 sqlite3:)) + (import debugprint dbmod) + ) + (chicken-5 + (import chicken.base + chicken.condition + chicken.file + chicken.pathname + chicken.process-context.posix + chicken.process + chicken.sort + chicken.string + chicken.time + chicken.random + + system-information + ) + (define file-write-access? file-writable?) 
+ (define random pseudo-random-integer) + )) + +(import srfi-1 srfi-69 z3 + (srfi 18) s11n) (import (prefix sqlite3 sqlite3:)) (import debugprint dbmod) + ;; lsof -i (define (portlogger:open-db fname) (let* ((avail (tasks:wait-on-journal fname 5 remove: #t)) ;; wait up to about 10 seconds for the journal to go away (exists (file-exists? fname)) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -67,83 +67,73 @@ (else #f))) ;;====================================================================== (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id - -;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) -;; -(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected - (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.") - - (if (not (eq? (rmt:transport-mode) 'nfs)) - (begin - (if (> attemptnum 2) - (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum)) - - (cond - ((> attemptnum 2) (thread-sleep! 0.05)) - ((> attemptnum 10) (thread-sleep! 0.5)) - ((> attemptnum 20) (thread-sleep! 1))) - - ;; I'm turning this off, it may make sense to move it - ;; into http-transport-handler - (if (and (> attemptnum 5) (= 0 (modulo attemptnum 15))) - (begin - (debug:print 0 *default-log-port* "ERROR: can't connect to server, trying to start a server.") - (case (rmt:transport-mode) - ((http) - (server:run *toppath*) - (thread-sleep! 3)) - (else - (thread-sleep! 1) ;; for tcp the server is started by routines in tcp-transportmod. For nfs there is no server - )))))) - - ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote - ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. - ;; 3. 
do the query, if on homehost use local access - ;; - (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value - (areapath *toppath*);; TODO - resolve from dbstruct to be compatible with multiple areas - (runremote (or area-dat - *runremote*)) - (attemptnum (+ 1 attemptnum)) - (readonly-mode (rmtmod:calc-ro-mode runremote *toppath*)) - (testsuite (common:get-testsuite-name)) - (mtexe (common:find-local-megatest))) - - (case (rmt:transport-mode) - ((http)(http-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode)) - ((tcp) (tcp-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe)) - ((nfs) (nfs-transport-handler runremote cmd rid params attemptnum area-dat areapath readonly-mode testsuite mtexe)) - ))) - -(define (nfs-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe) - (let* ((keys (common:get-fields *configdat*)) - (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath tmpadj: "/dashboard"))) - (api:dispatch-request dbstruct cmd run-id params))) - -(define (tcp-transport-handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode testsuite mtexe) - (if (not runremote) - (let* ((newremote (make-and-init-remote areapath))) - (set! *runremote* newremote) - (set! 
runremote newremote))) - (let* ((dbfname (conc (dbfile:run-id->dbnum run-id)".db"))) ;;(dbfile:run-id->path areapath run-id))) - (tt:handler runremote cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe))) - -(define (rmt:print-db-stats) - (let ((fmtstr "~40a~7-d~9-d~20,2-f")) ;; "~20,2-f" - (debug:print 18 *default-log-port* "DB Stats\n========") - (debug:print 18 *default-log-port* (format #f "~40a~8a~10a~10a" "Cmd" "Count" "TotTime" "Avg")) - (for-each (lambda (cmd) - (let ((cmd-dat (hash-table-ref *db-stats* cmd))) - (debug:print 18 *default-log-port* (format #f fmtstr cmd (vector-ref cmd-dat 0) (vector-ref cmd-dat 1) (/ (vector-ref cmd-dat 1)(vector-ref cmd-dat 0)))))) - (sort (hash-table-keys *db-stats*) - (lambda (a b) - (> (vector-ref (hash-table-ref *db-stats* a) 0) - (vector-ref (hash-table-ref *db-stats* b) 0))))))) - +(define *ttdat* #f) +;; how to make area-dat +(define (rmt:set-ttdat areapath ttdat) + (if ttdat + ttdat + (if *ttdat* + *ttdat* + (begin + (debug:print-info 2 *default-log-port* "rmt:set-ttdat: Initialize new ttdat") + (let* ((newremote (make-and-init-remote areapath))) + (set! *ttdat* newremote) + newremote + ) + ) + ) + ) +) + +;; NB// area-dat replaced by ttdat +;; +(define (rmt:send-receive cmd run-id params #!key (attemptnum 1)(ttdat #f)) + (assert (or (not run-id) (number? 
run-id)) "FATAL: run-id is required to be a number or #f") + (assert *toppath* "FATAL: rmt:send-receive called with *toppath* not set.") + (let* ((areapath *toppath*) ;; TODO - resolve from dbstruct to be compatible with multiple areas + (readonly-mode (rmtmod:calc-ro-mode ttdat *toppath*)) + (testsuite (common:get-testsuite-name))) + (case (rmt:transport-mode) + ((tcp) + (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value + (attemptnum (+ 1 attemptnum)) + (mtexe (common:find-local-megatest)) + (dbfname (conc (dbfile:run-id->dbnum run-id)".db")) + (ttdat (rmt:set-ttdat areapath ttdat)) + (conn (tt:get-conn ttdat dbfname)) + (is-main (equal? dbfname "main.db")) ;; why not (not run-id) ? + (server-start-proc (if is-main + #f + (lambda () + ;; (debug:print-info 0 *default-log-port* "starting server for dbfname: "dbfname) + (rmt:start-server ;; tt:server-process-run + areapath + testsuite ;; (dbfile:testsuite-name) + mtexe + run-id))))) + ;; here we look at ttdat, if dbfname is NOT main.db we check that a conn exists for it + ;; and if there is no conn we first send a request to the main.db server to start a + ;; server for the dbfname. + #;(if (and (not is-main)(not conn)) ;; no existing connection to non-main server, call in a start up request + (begin + (server-start-proc) + (thread-sleep! 
1))) + (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc))) + ((nfs) + (nfs-transport-handler cmd run-id params attemptnum areapath readonly-mode testsuite)) + (else + (debug:print-info 0 *default-log-port* "rmt:transport-mode is "(rmt:transport-mode)) + (assert #f "FATAL: rmt:transport-mode set to invalid value."))))) + +(define (nfs-transport-handler cmd run-id params attemptnum areapath readonly-mode testsuite) + (let* ((keys (common:get-fields *configdat*)) + (dbstruct (dbmod:nfs-get-dbstruct run-id keys (dbfile:db-init-proc) areapath))) + (api:dispatch-request dbstruct cmd run-id params))) + (define (rmt:get-max-query-average run-id) (mutex-lock! *db-stats-mutex*) (let* ((runkey (conc "run-id=" run-id " ")) (cmds (filter (lambda (x) (substring-index runkey x)) @@ -167,12 +157,12 @@ (mutex-unlock! *db-stats-mutex*) res)) (define (rmt:open-qry-close-locally cmd run-id params #!key (remretries 5)) (let* ((qry-is-write (not (member cmd api:read-only-queries))) - (db-file-path (db:dbfile-path)) ;; 0)) - (dbstructs-local (db:setup #t)) + (db-file-path (common:make-tmpdir-name *toppath* "")) ;; 0)) + (dbstructs-local (db:setup)) (read-only (not (file-write-access? db-file-path))) (start (current-milliseconds)) (resdat (if (not (and read-only qry-is-write)) (let ((v (api:execute-requests dbstructs-local (vector (symbol->string cmd) params)))) ;; (handle-exceptions ;; there has been a long history of receiving strange errors from values returned by the client when things go wrong.. @@ -204,11 +194,11 @@ ;; (rmt:update-db-stats run-id cmd params duration) ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it (if qry-is-write (let ((start-time (current-seconds))) (mutex-lock! *db-multi-sync-mutex*) -/ (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client) + (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? 
(we are on a client) (mutex-unlock! *db-multi-sync-mutex*))))) res)) ;;====================================================================== ;; @@ -221,12 +211,12 @@ ;;====================================================================== (define (rmt:kill-server run-id) (rmt:send-receive 'kill-server run-id (list run-id))) -(define (rmt:start-server run-id) - (rmt:send-receive 'start-server 0 (list run-id))) +(define (rmt:start-server areapath testsuite mtexe run-id) ;; run on main.db server + (rmt:send-receive 'start-server #f (list areapath testsuite mtexe run-id))) ;;====================================================================== ;; M I S C ;;====================================================================== @@ -235,16 +225,16 @@ ;; This login does no retries under the hood - it acts a bit like a ping. ;; Deprecated for nmsg-transport. ;; ;; (define (rmt:login-no-auto-client-setup runremote) -;; (rmt:send-receive-no-auto-client-setup runremote 'login 0 (list *toppath* megatest-version (client:get-signature)))) +;; (rmt:send-receive-no-auto-client-setup runremote 'login #f (list *toppath* megatest-version (client:get-signature)))) ;; given a hostname, return a pair of cpu load and update time representing latest intelligence from tests running on that host (define (rmt:get-latest-host-load hostname) - (rmt:send-receive 'get-latest-host-load 0 (list hostname))) + (rmt:send-receive 'get-latest-host-load #f (list hostname))) (define (rmt:sdb-qry qry val run-id) ;; add caching if qry is 'getid or 'getstr (rmt:send-receive 'sdb-qry run-id (list qry val))) @@ -502,12 +492,12 @@ (rmt:send-receive 'delete-run #f (list run-id))) (define (rmt:update-run-stats run-id stats) (rmt:send-receive 'update-run-stats #f (list run-id stats))) -(define (rmt:delete-old-deleted-test-records) - (rmt:send-receive 'delete-old-deleted-test-records #f '())) +(define (rmt:delete-old-deleted-test-records run-id) + (rmt:send-receive 'delete-old-deleted-test-records run-id (list 
run-id))) (define (rmt:get-runs runpatt count offset keypatts) (rmt:send-receive 'get-runs #f (list runpatt count offset keypatts))) (define (rmt:simple-get-runs runpatt count offset target last-update) Index: rmtmod.scm ================================================================== --- rmtmod.scm +++ rmtmod.scm @@ -86,33 +86,41 @@ (define (rmt:import-run target run-dat) (let* ((runname (car run-dat)) (all-dat (cdr run-dat)) (tests-data (alist-ref "data" all-dat equal?)) (run-meta (alist-ref "meta" all-dat equal?)) - (run-id (rmt:insert-run target runname run-meta))) + (run-id (string->number (alist-ref "id" run-meta equal?)))) + + (rmt:insert-run run-id target runname run-meta) (for-each (lambda (test-dat) (let* ((test-id (car test-dat)) (test-rec (cdr test-dat))) (rmt:insert-test run-id test-rec))) tests-data))) ;; insert run if not there, return id either way -(define (rmt:insert-run target runname run-meta) +(define (rmt:insert-run run-id target runname run-meta) ;; look for id, return if found (debug:print 0 *default-log-port* "Insert run: "target"/"runname) (let* ((runs (rmtmod:send-receive 'simple-get-runs #f ;; runpatt count offset target last-update) (list runname #f #f target #f)))) (if (null? 
runs) - (rmtmod:send-receive 'insert-run #f (list target runname run-meta)) - (simple-run-id (car runs))))) + (begin + (debug:print 0 *default-log-port* "inserting run for runname " runname " target " target) + (rmtmod:send-receive 'insert-run #f (list run-id target runname run-meta)) + ) + (begin + (debug:print 0 *default-log-port* "Found run-id " (simple-run-id (car runs)) " for runname " runname " target " target) + (simple-run-id (car runs) + ) + )))) (define (rmt:insert-test run-id test-rec) (let* ((testname (alist-ref "testname" test-rec equal?)) (item-path (alist-ref "item_path" test-rec equal?))) - (debug:print 0 *default-log-port* " Insert test in run "run-id": "testname"/"item-path) (rmtmod:send-receive 'insert-test run-id test-rec))) ;;====================================================================== ;; T E S T S ;;====================================================================== @@ -192,21 +200,28 @@ (define (rmt:get-toplevels-and-incompletes run-id running-deadtime remotehoststart-deadtime) (rmtmod:send-receive 'get-toplevels-and-incompletes run-id (list run-id running-deadtime remotehoststart-deadtime))) +;; .final-status file is two lines: +;; "state" +;; "status" +;; (define (rmt:get-status-from-final-status-file run-dir) (let ((infile (conc run-dir "/.final-status"))) - ;; first verify we are able to write the output file + ;; first verify we are able to read the output file (if (not (file-read-access? infile)) (begin (debug:print 2 *default-log-port* "ERROR: cannot read " infile) (debug:print 2 *default-log-port* "ERROR: run-dir is " run-dir) #f ) - (with-input-from-file infile read-lines) - ))) + (let ((res (with-input-from-file infile read-lines))) + (if (null? 
res) + #f + res))))) ;; (string-split (car res))))))) <== I would have preferred a single line STATE STATUS without "'s + ;; (string-split (car res))))))) ;; DUNNO WHICH IS CORRECT ;; select end_time-now from ;; (select testname,item_path,event_time+run_duration as ;; end_time,strftime('%s','now') as now from tests where state in ;; ('RUNNING','REMOTEHOSTSTART','LAUNCHED')); Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -346,11 +346,11 @@ (args:get-arg "-one-pass")) (exit 0)) (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) - (let* ((num-running (rmt:get-count-tests-running run-id)) + (let* ((num-running (rmt:get-count-tests-running-for-run-id run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) @@ -2064,11 +2064,11 @@ ;; ;; There is now a single call to runs:update-all-test_meta and this ;; per-test call is not needed. Given the delicacy of the move to ;; v1.55 this code is being left in place for the time being. ;; - (if (not (hash-table-ref/default *test-meta-updated* test-name #f)) + (if (not (hash-table-exists? *test-meta-updated* test-name)) (begin (hash-table-set! *test-meta-updated* test-name #t) (runs:update-test_meta test-name test-conf))) ;; itemdat => ((ripeness "overripe") (temperature "cool") (season "summer")) @@ -2434,10 +2434,12 @@ ((kill-runs) (tasks:kill-runner target run-name "%") (debug:print 1 *default-log-port* "Killing tests for run: " runkey " " (db:get-value-by-header run header "runname")) ) ((remove-runs) + ;; use this location to cleanup old DELETED records? No. 
See below for same call + ;; (rmt:delete-old-deleted-test-records run-id) ;; (if (tasks:need-server run-id)(tasks:start-and-wait-for-server tdbdat run-id 10)) ;; seek and kill in flight -runtests with % as testpatt here ;; (if (equal? testpatt "%") (tasks:kill-runner target run-name testpatt) ;; (debug:print 0 *default-log-port* "not attempting to kill any run launcher processes as testpatt is " testpatt)) @@ -2724,11 +2726,11 @@ (debug:print 1 *default-log-port* "Removing target " target "run: " run-name) (if (not keep-records) (begin (debug:print 1 *default-log-port* "Removing DB records for the run.") (rmt:delete-run run-id) - (rmt:delete-old-deleted-test-records)) + (rmt:delete-old-deleted-test-records run-id)) ) (if (not (equal? linkspath "/does/not/exist/I")) (begin (debug:print 1 *default-log-port* "Recursively removing links dir " linkspath) (runs:recursive-delete-with-error-msg linkspath))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -728,11 +728,11 @@ ;; #t ;; #f))) ;; timeout is hms string: 1h 5m 3s, default is 1 minute ;; This is currently broken. Just use the number of hours with no unit. -;; Default is 60 seconds. +;; Default is 600 seconds. ;; (define (server:expiration-timeout) (let* ((tmo (configf:lookup *configdat* "server" "timeout"))) (if (string? 
tmo) (let* ((num (string->number tmo))) Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -84,11 +84,11 @@ (tasks:open-db numretries (- numretries 1))) (begin (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 5 *default-log-port* " exn=" (condition->list exn)))) - (let* ((dbpath (db:dbfile-path )) ;; (tasks:get-task-db-path)) + (let* ((dbpath (common:make-tmpdir-name *toppath* "")) ;; (tasks:get-task-db-path)) (dbfile (conc dbpath "/monitor.db")) (avail (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away (exists (common:file-exists? dbpath)) (write-access (file-write-access? dbpath)) (mdb (cond ;; what the hek is *toppath* doing here? Index: tcp-transportmod.scm ================================================================== --- tcp-transportmod.scm +++ tcp-transportmod.scm @@ -28,25 +28,52 @@ (use address-info tcp) (module tcp-transportmod * - (import scheme - (prefix sqlite3 sqlite3:) - chicken - data-structures +(import scheme) - address-info - directory-utils +(cond-expand + (chicken-4 + (import (prefix sqlite3 sqlite3:) + chicken extras - files hostinfo + + ports + posix + files + data-structures + tcp + )) + (chicken-5 + (import chicken.base + chicken.condition + chicken.file + chicken.pathname + chicken.process-context.posix + chicken.process + chicken.sort + chicken.string + chicken.time + chicken.tcp + chicken.random + chicken.file.posix + chicken.pretty-print + chicken.io + chicken.port + chicken.process-context + + system-information) + (define unsetenv unset-environment-variable!) 
+ )) + + (import address-info + directory-utils matchable md5 message-digest - ports - posix regex regex-case s11n srfi-1 srfi-18 @@ -53,11 +80,10 @@ srfi-4 srfi-69 stack typed-records tcp-server - tcp debugprint commonmod dbfile dbmod @@ -110,11 +136,12 @@ ;; parameters ;; (define tt-server-timeout-param (make-parameter 600)) ;; make ttdat visible -(define *server-info* #f) +;; (define *server-info* #f) ;; get this from commonmod +(define *server-run* #t) (define (tt:make-remote areapath) (make-tt areapath: areapath)) ;; 1 ... or #f @@ -125,33 +152,46 @@ (and (or (number? run-id) (not run-id)) (equal? (dbfile:run-id->dbfname run-id) dbfname))) (tcp-buffer-size 2048) -;; (max-connections 4096) +;; (max-connections 4096) + +(define (tt:get-conn ttdat dbfname) + (hash-table-ref/default (tt-conns ttdat) dbfname #f)) ;; do all the busy work of finding and setting up conn for ;; connecting to a server ;; -(define (tt:client-connect-to-server ttdat dbfname run-id testsuite) +(define (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc) (assert (tt:valid-run-id run-id dbfname) "FATAL: invalid run-id "run-id) - (let* ((conn (hash-table-ref/default (tt-conns ttdat) dbfname #f)) - (server-start-proc (lambda () - (tt:server-process-run - (tt-areapath ttdat) - testsuite ;; (dbfile:testsuite-name) - (common:find-local-megatest) - run-id)))) + (debug:print-info 2 *default-log-port* "tt:client-connect-to-server " dbfname " " run-id) + (let* ((conn (tt:get-conn ttdat dbfname)) + (server-start-proc (or server-start-proc + (lambda () + (assert (equal? 
dbfname "main.db") ;; only main.db is started here + "FATAL: called server-start-proc for db other than main.db") + (tt:server-process-run + (tt-areapath ttdat) + testsuite ;; (dbfile:testsuite-name) + (common:find-local-megatest) + run-id))))) (if conn (begin - ; (debug:print-info 0 *default-log-port* "already connected to the server") + (debug:print-info 2 *default-log-port* "already connected to a server") conn) ;; we are already connected to the server - (let* ((sdat (tt:get-current-server-info ttdat dbfname))) - (match sdat + + ;; no conn + (let* ((sdats (tt:get-server-info-sorted ttdat dbfname)) + (sdat (if (null? sdats) + #f + (car sdats)))) + (debug:print-info 2 *default-log-port* "found sdat " sdat) + (match sdat ((host port start-time server-id pid dbfname2 servinffile) (assert (equal? dbfname dbfname2) "FATAL: read server info from wrong file.") - ;(debug:print-info 0 *default-log-port* "in match servinffile:" servinffile) + (debug:print-info 2 *default-log-port* "no conn - in match servinffile:" servinffile) (let* ((host-port (conc host":"port)) (conn (make-tt-conn host: host port: port host-port: host-port @@ -162,36 +202,52 @@ pid: pid))) ;; verify we can talk to this server (let* ((result (tt:timed-ping host port server-id)) (ping-res (car result)) (ping (cdr result))) - (debug:print-info 0 *default-log-port* "ping time: " ping) + (debug:print-info 2 *default-log-port* "host " host " port " port " ping time: " ping " result " ping-res) (case ping-res ((running) + (debug:print-info 2 *default-log-port* "Setting conn = " conn " in hash table") (hash-table-set! (tt-conns ttdat) dbfname conn) ;;; is this ok to save before validating that the connection is good? conn) ((starting) (thread-sleep! 
0.5) - (tt:client-connect-to-server ttdat dbfname run-id testsuite)) + (debug:print-info 0 *default-log-port* "server for " dbfname " is in starting state, retrying connect") + (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc)) (else (let* ((curr-secs (current-seconds))) ;; rm the (last server) would go here (if (> (- curr-secs (tt-last-serv-start ttdat)) 10) (begin + (debug:print-info 0 *default-log-port* "Unreachable server at " + host":"port" with servinfo file "servinffile", removing it") + (if (file-exists? servinffile) + (handle-exceptions + exn + #f + (delete-file servinffile))) (tt-last-serv-start-set! ttdat curr-secs) - (server-start-proc))) ;; start server if 30 sec since last attempt + (debug:print-info 0 *default-log-port* "Starting a new server on " (get-host-name)) + (server-start-proc))) ;; start server if 10 sec since last attempt (thread-sleep! 1) - (tt:client-connect-to-server ttdat dbfname run-id testsuite))))))) + (debug:print-info 0 *default-log-port* "Retrying connect") + (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) + (else ;; no good server found, if haven't started server in > 5 secs, start another - (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 5) ;; BUG - grow this number really do not want to swamp the machine with servers + (if (> (- (current-seconds) (tt-last-serv-start ttdat)) 3) ;; BUG - grow this number really do not want to swamp the machine with servers (begin - (debug:print-info 0 *default-log-port* "No server found. Starting one for run-id "run-id" in dbfile "dbfname) + (debug:print-info 0 *default-log-port* "Starting server for "dbfname " on " (get-host-name)) (server-start-proc) - (tt-last-serv-start-set! ttdat (current-seconds)))) + (tt-last-serv-start-set! ttdat (current-seconds)) + (thread-sleep! 6) + )) (thread-sleep! 
1) - (tt:client-connect-to-server ttdat dbfname run-id testsuite))))))) + (debug:print-info 0 *default-log-port* "Connect to server from " (get-host-name) " for " dbfname) + (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))))))) +;; returns ( result . ping_time ) (define (tt:timed-ping host port server-id) (let* ((start-time (current-milliseconds)) (result (tt:ping host port server-id))) (cons result (- (current-milliseconds) start-time)))) @@ -222,18 +278,18 @@ ;; client side handler ;; ;;(tt:handler # get-keys #f () 2 #f "/home/matt/data/megatest/ext-tests" #f "main.db" "ext-tests" "/home/matt/data/megatest/bin/.22.04/../megatest") ;; -(define (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe) - ;; NOTE: areapath is passed in and in tt struct. We'll use passed in value for now. - (let* ((conn (tt:client-connect-to-server ttdat dbfname run-id testsuite))) ;; (hash-table-ref/default (tt-conns ttdat) dbfname #f))) +(define (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc) + ;; connect-to-server will start a server if needed. + (let* ((areapath (tt-areapath ttdat)) + (conn (tt:client-connect-to-server ttdat dbfname run-id testsuite server-start-proc))) ;; looks up conn keyed by dbfname (if conn ;; have connection, call the server (let* ((res (tt:send-receive ttdat conn cmd run-id params))) ;; res is (status errmsg result meta) - ; (debug:print 0 *default-log-port* "conn:" conn " res: " res) (match res ((status errmsg result meta) (if (list? meta) (let* ((delay-wait (alist-ref 'delay-wait meta))) (if (and (number? delay-wait) @@ -241,35 +297,36 @@ (begin (debug:print 0 *default-log-port* "Server is loaded, delaying "delay-wait" seconds") (thread-sleep! delay-wait))))) (case status ((busy) ;; result will be how long the server wants you to delay - (let* ((dly (if (number? 
result) result 0.1))) - (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, will try again in "dly" seconds.") + (let* ((raw-dly (if (number? result) result 0.1)) + (dly (+ raw-dly (/ attemptnum 10)))) ;; (* raw-dly (/ attemptnum 2)))) + (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is busy, cmd is "cmd", will try again in "dly" seconds. This is attempt "(- attemptnum 1)) (thread-sleep! dly) - (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe))) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc))) ((loaded) (debug:print 0 *default-log-port* "WARNING: server for "dbfname" is loaded, slowing queries.") (tt:backoff-incr (tt-conn-host conn)(tt-conn-port conn)) - result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) + result) ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe)) (else result))) (else ;; did not receive properly formated result - (if (not res) ;; tt:handler is telling us that communication failed + (if (not res) ;; tt:send-receive telling us that communication failed (let* ((host (tt-conn-host conn)) (port (tt-conn-port conn)) ;; (dbfname (tt-conn-port conn)) ;; 192.168.0.127:4242-726924:4.db (pid (tt-conn-pid conn)) ;;(servinf (tt-conn-servinf-file conn))) (servinf (tt-servinf-file ttdat))) ;; (conc areapath"/.servinfo/"host":"port"-"pid":"dbfname))) ;; TODO, use (server:get-servinfo-dir areapath) - (hash-table-set! (tt-conns ttdat) dbfname #f) + (hash-table-set! (tt-conns ttdat) dbfname #f) ;; clear out the conn for this dbfname to force finding new server (if (and servinf (file-exists? servinf)) (begin (if (< attemptnum 10) (begin (thread-sleep! 
0.5) - (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)) (begin (debug:print 0 *default-log-port* "INFO: no response from server "host":"port" for "dbfname) (if (and (file-exists? servinf) (> (- (current-seconds)(file-modification-time servinf)) 60)) (begin @@ -276,33 +333,30 @@ (debug:print 0 *default-log-port* "INFO: "servinf" file seems old and no ping response, removing it.") (handle-exceptions exn #f (delete-file* servinf)) - (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)) (begin ;; start server - addressed in client-connect-to-server ;; delay - addressed in client-connect-to-server ;; try again (thread-sleep! 0.25) ;; dunno, I think this needs to be here - (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)) )))) (begin ;; no server file, delay and try again - (debug:print 0 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", but do not see servinf file "servinf) + (debug:print 2 *default-log-port* "INFO: connection to server "host":"port" broken for "dbfname", no servinf file. Server exited? ") (thread-sleep! 0.5) - (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe)))) + (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe server-start-proc)))) (begin ;; this case is where res is malformed. 
Probably should abort (assert #f "FATAL: tt:handler received bad data "res) ;; (debug:print 0 *default-log-port* "INFO: got corrupt data from server "host":"port", "res", for "dbfname", will try again.") - ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) area-dat areapath readonly-mode dbfname testsuite mtexe) + ;; (tt:handler ttdat cmd run-id params (+ attemptnum 1) readonly-mode dbfname testsuite mtexe) ))))) (begin (thread-sleep! 1) ;; no conn yet set up, give it a rest and try again - (tt:handler ttdat cmd run-id params attemptnum area-dat areapath readonly-mode dbfname testsuite mtexe))))) - -(define (tt:bid-for-servership run-id) - #f) + (tt:handler ttdat cmd run-id params attemptnum readonly-mode dbfname testsuite mtexe server-start-proc))))) ;; gets server info and appends path to server file ;; sorts by age, oldest first ;; ;; returns list of (host port startseconds server-id servinfofile) @@ -325,24 +379,10 @@ (debug:print 2 *default-log-port* "SERVER #"count": "(string-intersperse (map conc sorted) ", "))) (set! count (+ count 1))) sorted) sorted)) -(define (tt:get-current-server-info ttdat dbfname) - (assert (tt-areapath ttdat) "FATAL: areapath not set in ttdat.") - ;; - ;; TODO - replace most of below with tt;get-server-info-sorted - ;; - (let* ((areapath (tt-areapath ttdat)) - (sfiles (tt:find-server areapath dbfname)) - (sdats (filter car (map tt:server-get-info sfiles))) ;; first element is #f if the file disappeared while being read - (sorted (sort sdats (lambda (a b) - (< (list-ref a 2)(list-ref b 2)))))) - (if (null? 
sorted) - #f ;; we'll want to wait until extra servers have exited - (car sorted)))) - (define (tt:send-receive ttdat conn cmd run-id params) (let* ((host-port (tt-conn-host-port conn)) ;; (conc (tt-conn-host conn)":"(tt-conn-port conn))) (host (tt-conn-host conn)) (port (tt-conn-port conn)) (dat (list cmd run-id params #f))) ;; no meta data yet @@ -379,18 +419,18 @@ (- wait-delay adj)) 0))) (if (> new-wait 0) (begin (if (common:low-noise-print 10 "delay wait message") - (debug:print-info 0 *default-log-port* "Server loaded, DelayWait: "new-wait)) + (debug:print-info 0 *default-log-port* "Server on host " host " loaded, DelayWait: "new-wait)) (tt:backoff-wait-delay-set! bkoff new-wait) (tt:backoff-last-adj-t-set! bkoff (current-seconds)) (thread-sleep! new-wait)) (hash-table-delete! *tt:backoff-smoothing* host-port)))))) (define (tt:send-receive-direct host port dat #!key (ping-mode #f)(tries-remaining 25)) - (assert (number? port) "FATAL: tt:send-receive-direct called with port not a number "port) + (assert (number? port) "FATAL: tt:send-receive-direct called with a port that is not a number "port) (tt:backoff-decr-and-wait host port) (let* ((retry (lambda () (tt:send-receive-direct host port dat tries-remaining: (- tries-remaining 1)))) (full-err-print (lambda (exn msg) (if (condition? exn) @@ -457,244 +497,211 @@ ;; start the listener and start responding to requests ;; ;; NOTE: organise by dbfname, not run-id so we don't need ;; to pull in more modules ;; -;; This is the routine called in megatest.scm to start a server +;; This is the routine called in megatest.scm to start a server. NOTE: sequence is different for main.db vs. X.db ;; ;; Server viability is checked in keep-running. Blindly start and run here. ;; (define (tt:start-server areapath run-id dbfname-in handler keys) (assert areapath "FATAL: areapath not provided for tt:start-server") - ;; is there already a server for this dbfile? Then exit. 
(let* ((ttdat (make-tt areapath: areapath)) - (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id))) - (servers (tt:find-server areapath dbfname))) ;; should use tt:get-current-server-info instead - (if (> (length servers) 4) - (begin - (debug:print 0 *default-log-port* "INFO: found server(s) already running for db "dbfname", "(string-intersperse servers ",")" Exiting.") - (exit)) - (let* ((dbstruct (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys))) - (tt-handler-set! ttdat (handler dbstruct)) - (let* ((tcp-thread (make-thread - (lambda () - (tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data - "tcp-server-thread")) - (run-thread (make-thread - (lambda () - (tt:keep-running ttdat dbfname dbstruct))))) - (thread-start! tcp-thread) - (thread-start! run-thread) - - (let* ((areapath (tt-areapath ttdat)) - (nosyncdbpath (conc areapath"/.mtdb"))) - ;; this didn't seem to work, is port not available yet? - (let loop ((count 0)) - (if (tt-port ttdat) - (begin - (procinf-port-set! *procinf* (tt-port ttdat)) - (procinf-dbname-set! *procinf* dbfname) - (dbfile:with-no-sync-db - nosyncdbpath - (lambda (nsdb) - (dbfile:insert-or-update-process nsdb *procinf*)))) - (if (< count 5) - (begin - (thread-sleep! 0.5) - (loop (+ count 1))) - (debug:print 0 *default-log-port* "ERROR: (tt-port ttdat) no port set!")))) - - (thread-join! run-thread) ;; run thread will exit on timeout or other conditions - ;; replace with call to (dbfile:set-process-done nsdb host pid reason) - (procinf-status-set! *procinf* "done") - (procinf-end-set! *procinf* (current-seconds)) - (dbfile:with-no-sync-db - nosyncdbpath - (lambda (nsdb) - (dbfile:insert-or-update-process nsdb *procinf*))) - (debug:print 0 *default-log-port* "Exiting now.") - (exit))))))) + (dbfname (or dbfname-in (dbmod:run-id->dbfname run-id)))) + (set! 
*server-info* ttdat) + (let* ((dbstruct (dbmod:open-dbmoddb areapath run-id dbfname (dbfile:db-init-proc) keys))) + (tt-handler-set! ttdat (handler dbstruct)) + (let* ((servinf-created #f) + (tcp-thread (make-thread + (lambda () + ;; NOTE: tt-port and tt-host are set in connect-listener which is called under tt:start-tcp-server + (tt:start-tcp-server ttdat)) ;; start the tcp-server which applies handler to incoming data + "tcp-server-thread")) + (run-thread (make-thread + (lambda () + (tt:keep-running ttdat dbfname dbstruct))))) + (thread-start! tcp-thread) + + (let* ((areapath (tt-areapath ttdat)) + (nosyncdbpath (conc areapath"/.mtdb")) + (servers ;; (tt:find-server areapath dbfname))) + (tt:get-server-info-sorted ttdat dbfname)) ;; (host port startseconds server-id servinfofile) + (good-srvrs + ;; contact servers via ping, if no response remove the .servinfo file + (let loop ((servrs servers) + (prime-host #f) + (result '())) + (if (null? servrs) + (reverse result) + (let* ((servdat (car servrs))) + (match servdat + ((host port startseconds server-id servinfofile) + (let* ((ping-res (tt:timed-ping host port server-id)) + (good-ping (match ping-res + ((result . ping-time) + (not result)) ;; we couldn't reach the server or it was not a megatest server + (else #f))) ;; the ping failed completely? + (same-host (or (not prime-host) ;; i.e. this is the first host + (equal? prime-host host))) + (keep-srv (and good-ping same-host))) + (if keep-srv + (loop (cdr servrs) + host + (cons servdat result)) + (begin + (handle-exceptions + exn + (debug:print-info 0 *default-log-port* "Error removing server info file: "servinfofile", " + (condition->list exn)) + (delete-file* servinfofile)) + (loop (cdr servrs) prime-host result))))) + (else + ;; can't delete it as we don't have a filename. NOTE: Should really never get here. 
+ (debug:print-info 0 *default-log-port* "ERROR: bad servinfo record \""servdat"\"") + (loop (cdr servrs) prime-host result)) ;; drop + ))))) + (home-host (if (null? good-srvrs) + #f + (caar good-srvrs)))) + ;; by here we have a trustworthy list of servers and we have removed the .servinfo file for any unresponsive servers + ;; and the list is in good-srvrs + (cond + ((not home-host) ;; no servers yet, go ahead and start + (debug:print-info 0 *default-log-port* "No servers yet, starting on "(get-host-name))) + ((> (length good-srvrs) 2) ;; don't need more, just exit + (debug:print-info 0 *default-log-port* "Have "(length good-srvrs)", no need for more, exiting.") + (exit)) + ((not (equal? home-host (get-host-name))) ;; there is a home-host and we are not on it + (debug:print-info 0 *default-log-port* "Prime main server is on host "home-host", but we are on host "(get-host-name)", exiting.") + (exit)) + (else + (debug:print-info 0 *default-log-port* "Starting on host "(get-host-name)", along with "(length good-srvrs)" other servers."))) + + ;; this didn't seem to work, is port not available yet? + (let loop ((count 0)) + (if (tt-port ttdat) + (begin + (procinf-port-set! *procinf* (tt-port ttdat)) + (procinf-dbname-set! *procinf* dbfname) + (dbfile:with-no-sync-db + nosyncdbpath + (lambda (nsdb) + (dbfile:insert-or-update-process nsdb *procinf*)))) + (if (< count 10) + (begin + (thread-sleep! 0.25) + (loop (+ count 1))) + (begin + (debug:print 0 *default-log-port* "ERROR: (tt-port ttdat) no port set! Exiting.") + (exit))))) + + ;; create a servinfo file start keep-running + (debug:print 0 *default-log-port* "Creating servinfo file for " dbfname) + (tt:create-server-registration-file ttdat dbfname) + (procinf-status-set! *procinf* "running") + (tt-state-set! ttdat 'running) + (dbfile:with-no-sync-db + nosyncdbpath + (lambda (nsdb) + (dbfile:insert-or-update-process nsdb *procinf*))) + (thread-start! run-thread) + + (thread-join! 
run-thread) ;; run thread will exit on timeout or other conditions + + ;; (tcp-close (tt-socket ttdat)) ;; close up ports here + + ;; replace with call to (dbfile:set-process-done nsdb host pid reason) + (procinf-status-set! *procinf* "done") + (procinf-end-set! *procinf* (current-seconds)) + ;; either convert this to use set-process-done or get rid of set-process-done + (dbfile:with-no-sync-db + nosyncdbpath + (lambda (nsdb) + (dbfile:insert-or-update-process nsdb *procinf*))) + (debug:print 0 *default-log-port* "Exiting now.") + (exit)))))) (define (tt:keep-running ttdat dbfname dbstruct) - ;; verfiy conn for ready - ;; listener socket has been started by this stage - ;; wait for a port before creating the registration file - ;; - (let* ((db-locked-in #f) - (areapath (tt-areapath ttdat)) - (nosyncdbpath (conc areapath"/.mtdb")) - (cleanup (lambda () - (if (tt-cleanup-proc ttdat) - ((tt-cleanup-proc ttdat))) - (dbfile:with-no-sync-db nosyncdbpath - (lambda (db) - (let* ((dbtmpname (dbr:dbstruct-dbtmpname dbstruct))) - (debug:print-info 0 *default-log-port* "Running clean up, including removing db file "dbtmpname) - (db:no-sync-del! db dbfname) - #;(if dbtmpname - (delete-file dbtmpname)))))))) - (set! *server-info* ttdat) - (let loop ((count 0)) - (if (> count 240) - (begin - (debug:print 0 *default-log-port* "FATAL: Could not start a tcp server, giving up.") - (exit 1)) - (if (not (tt-port ttdat)) ;; no connection yet - (begin - (thread-sleep! 0.25) - (loop (+ count 1)))))) - - (tt:create-server-registration-file ttdat dbfname) - ;; now start watching the last-access, if it hasn't been touched - ;; in over ten seconds we exit - (thread-sleep! 0.05) ;; any real need for delay here? + + ;; at this point the server is running and responding to calls, we just monitor + ;; for db calls and exit if there are none. 
+ + ;; if I am not in the first 3 servers, exit + (let* ((start-time (current-seconds))) (let loop () - (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) - (ok (cond - ((null? servers) #f) ;; not ok - ((equal? (list-ref (car servers) 6) ;; compare the servinfofile - (tt-servinf-file ttdat)) - (let* ((res (if db-locked-in - #t - (let* ((lock-result ;; this is the primary lock - need to double verify that got it - (dbfile:with-no-sync-db - nosyncdbpath - (lambda (db) - (db:no-sync-lock-and-check db dbfname - (tt-servinf-file ttdat) - ;; (dbr:dbstruct-dbtmpname dbstruct) - )))) - (success (car lock-result))) - (if success - (begin - (tt-state-set! ttdat 'running) - (debug:print 0 *default-log-port* "Got server lock for " dbfname) - (set! db-locked-in #t) - #t) - (begin - (debug:print 0 *default-log-port* "Failed to get server lock for "dbfname) - #f)))))) - (if (and res (common:low-noise-print 120 "top server message")) - (debug:print-info 0 *default-log-port* "Keep running, I'm the top server for " - dbfname" on "(tt-host ttdat)":"(tt-port ttdat))) - res)) - (else - (debug:print-info 0 *default-log-port* "I'm not the lead server: "servers) - (let* ((leadsrv (car servers))) - (match leadsrv - ((host port startseconds server-id pid dbfname servinfofile) - (let* ((result (tt:timed-ping host port server-id)) - (res (car result)) - (ping (cdr result))) - (debug:print-info 0 *default-log-port* "Ping to "host":"port", with server-id "server-id - ", and file "servinfofile" returned "res) - (if res - #f ;; not the server, but all good, want to exit - (if (and (file-exists? 
servinfofile) - (> (- (current-seconds)(file-modification-time servinfofile)) 30)) - (begin - ;; can't ping and file has been on disk 15 seconds, go ahead and try to remove it - (debug:print-info 0 *default-log-port* "Removing apparently dead server info file: "servinfofile) - (handle-exceptions - exn - (debug:print-info 0 *default-log-port* "Error removing server info file: "servinfofile) - (delete-file* servinfofile) - ) - #t) ;; not the server but the server is not reachable - (begin - (debug:print 0 *default-log-port* "I'm not the server but could not ping "host":"port", will try again.") - (thread-sleep! 1) ;; just because - #t))))) - (else ;; should never get here - (debug:print 0 *default-log-port* "BAD SERVER RECORD: "leadsrv) - (assert #f "Bad server record "leadsrv)))))))) + (let* ((servers (tt:get-server-info-sorted ttdat dbfname)) + (home-host (if (null? servers) + #f + (caar servers))) + (my-index (list-index (lambda (x) + (equal? (list-ref x 6) + (tt-servinf-file ttdat))) + servers)) + (ok (cond + ((not (number? my-index)) + (debug:print 0 *default-log-port* "ERROR: bad server data in "servers", might be due to host misconfiguration such as bad IP address in /etc/hosts.") + #f) + ((not *server-run*) + (debug:print 0 *default-log-port* "WARNING: received a stop server from client by remote request.") + #f) + ((null? servers) + (debug:print 0 *default-log-port* "WARNING: no servinfo files found, this cannot be.") + #f) ;; not ok + ((> my-index 2) + (debug:print 0 *default-log-port* "WARNING: there are more than two servers ahead of me, I'm not needed, exiting.") + #f) ;; not ok to not be in first three + ((eq? (tt-state ttdat) 'running) #t) ;; we are good to keep going + ((> (- (current-seconds) start-time) 30) + (debug:print 0 *default-log-port* "WARNING: over 30 seconds and not yet in runnning mode. Exiting.") + #f) + (else #t)))) (if ok (tt-last-access-set! 
ttdat *db-last-access*) ;; bit silly, just use db-last-access (begin (debug:print 0 *default-log-port* "Exiting immediately") - (cleanup) + (tt:shutdown-server ttdat) (exit))) (let* ((last-update (dbr:dbstruct-last-update dbstruct)) (curr-secs (current-seconds))) (if (and (eq? (tt-state ttdat) 'running) - (> (- curr-secs last-update) 3)) ;; every 3-4 seconds update the db? - (begin - (set! (file-modification-time (tt-servinf-file ttdat)) (current-seconds)) + (> (- curr-secs last-update) 5)) ;; every 5 seconds update the db? + (let* ((sinfo-file (tt-servinf-file ttdat))) + ;; (debug:print 0 *default-log-port* "INFO: touching "sinfo-file) + (set! (file-modification-time sinfo-file) (current-seconds)) ((dbr:dbstruct-sync-proc dbstruct) last-update) (dbr:dbstruct-last-update-set! dbstruct curr-secs)))) - + (if (< (- (current-seconds) (tt-last-access ttdat)) (tt-server-timeout-param)) (begin (thread-sleep! 5) (loop))))) - (cleanup) + (tt:shutdown-server ttdat) (debug:print 0 *default-log-port* "INFO: Server timed out, exiting from tt:keep-running."))) - -;; ;; given an already set up uconn start the cmd-loop -;; ;; -;; (define (tt:cmd-loop ttdat) -;; (let* ((serv-listener (-socket uconn)) -;; (listener (lambda () -;; (let loop ((state 'start)) -;; (let-values (((inp oup)(tcp-accept serv-listener))) -;; ;; (mutex-lock! *send-mutex*) ;; DOESN'T SEEM TO HELP -;; (let* ((rdat (deserialize inp)) ;; '(my-host-port qrykey cmd params) -;; (resp (ulex-handler uconn rdat))) -;; (serialize resp oup) -;; (close-input-port inp) -;; (close-output-port oup) -;; ;; (mutex-unlock! *send-mutex*) ;; DOESN'T SEEM TO HELP -;; ) -;; (loop state)))))) -;; ;; start N of them -;; (let loop ((thnum 0) -;; (threads '())) -;; (if (< thnum 100) -;; (let* ((th (make-thread listener (conc "listener" thnum)))) -;; (thread-start! th) -;; (loop (+ thnum 1) -;; (cons th threads))) -;; (map thread-join! threads))))) -;; -;; -;; -;; (define (wait-and-close uconn) -;; (thread-join! 
(udat-cmd-thread uconn)) -;; (tcp-close (udat-socket uconn))) -;; -;; (define (tt:shutdown-server ttdat) - (let* ((cleanproc (tt-cleanup-proc ttdat)) - (port (tt-port ttdat))) + (let* ((host (tt-host ttdat)) + (port (tt-port ttdat)) + (sinf (tt-servinf-file ttdat))) (tt-state-set! ttdat 'shutdown) (portlogger:open-run-close portlogger:set-port port "released") - (if cleanproc (cleanproc)) - (tcp-close (tt-socket ttdat)) ;; close up ports here + (if (file-exists? sinf) + (delete-file* sinf)) )) -;; (define (wait-and-close uconn) -;; (thread-join! (tt-cmd-thread uconn)) -;; (tcp-close (tt-socket uconn))) - ;; return servid ;; side-effects: ;; ttdat-cleanup-proc is populated with function to remove the serverinfo file (define (tt:create-server-registration-file ttdat dbfname) (let* ((areapath (tt-areapath ttdat)) (servdir (tt:get-servinfo-dir areapath)) (host (tt-host ttdat)) (port (tt-port ttdat)) (servinf (conc servdir"/"host":"port"-"(current-process-id)":"dbfname)) - (serv-id (tt:mk-signature areapath)) - (clean-proc (lambda () - (delete-file* servinf) - ))) + (serv-id (tt:mk-signature areapath))) (assert (and host port) "FATAL: tt:create-server-registration-file called with no conn, dbfname="dbfname) - (tt-cleanup-proc-set! ttdat clean-proc) (tt-servinf-file-set! 
ttdat servinf) (with-output-to-file servinf (lambda () (print "SERVER STARTED: "host":"port" AT "(current-seconds)" server-id: "serv-id" pid: "(current-process-id)" dbfname: "dbfname))) serv-id)) @@ -704,12 +711,28 @@ ;; if more than one, wait one second and look again ;; future: ping oldest, if alive remove other : files ;; (define (tt:find-server areapath dbfname) (let* ((servdir (tt:get-servinfo-dir areapath)) - (sfiles (glob (conc servdir"/*:"dbfname)))) - sfiles)) + (sfiles (glob (conc servdir"/*:"dbfname))) + (goodfiles '())) + + ;; filter the files here by looking in processes table (if we are not main.db) + ;; and or look at the time stamp on the servinfo file, a running server will + ;; touch the file every minute (again, this will only apply for main.db) + (for-each (lambda (fname) + (let* ((age (- (current-seconds)(file-modification-time fname)))) + (if (> age 200) ;; can't trust it if over 200 seconds old + (begin + (debug:print 0 *default-log-port* "WARNING: removing stale servinfo file "fname", it is "age" seconds old") + (handle-exceptions + exn + (debug:print 0 *default-log-port* "WARNING: error attempting to remove stale servinfo file "fname) + (delete-file fname))) ;; + (set! goodfiles (cons fname goodfiles))))) + sfiles) + goodfiles)) ;; given a path to a server info file return: host port startseconds server-id pid dbfname logf ;; example of what it's looking for in the log file: ;; SERVER STARTED: 10.38.175.67:50216 AT 1616502350.0 server-id: 4907e90fc55c7a09694e3f658c639cf4 ;; @@ -719,12 +742,13 @@ (dbprep-found 0) (bad-dat (list #f #f #f #f #f #f logf))) (let ((fdat (handle-exceptions exn (begin - ;; WARNING: this is potentially dangerous to blanket ignore the errors - (debug:print-info 0 *default-log-port* "Unable to get server info from "logf", exn="(condition->list exn)) + ;; BUG, TODO: add err checking, for now blanket ignore the errors? 
+ (debug:print-info 0 *default-log-port* "Unable to get server info from "logf + ", exn="(condition->list exn)) '()) ;; no idea what went wrong, call it a bad server, return empty list (with-input-from-file logf read-lines)))) (if (null? fdat) ;; bad data, return bad-dat bad-dat (let loop ((inl (car fdat)) @@ -750,10 +774,17 @@ logf)) (else (debug:print 0 *default-log-port* "ERROR: did not recognise SERVER line info "mlst) bad-dat))))))))) +(define *last-server-start* (make-hash-table)) + +(define (tt:too-recent-server-start dbfname) + (let* ((last-run-time (hash-table-ref/default *last-server-start* dbfname #f))) + (and last-run-time + (< (- (current-seconds) last-run-time) 5)))) + ;; Given an area path, start a server process ### NOTE ### > file 2>&1 ;; if the target-host is set ;; try running on that host ;; incidental: rotate logs in logs/ dir. ;; @@ -760,51 +791,63 @@ (define (tt:server-process-run areapath testsuite mtexe run-id #!key (profile-mode "")) ;; areapath is *toppath* for a given testsuite area (assert areapath "FATAL: tt:server-process-run called without areapath defined.") (assert testsuite "FATAL: tt:server-process-run called without testsuite defined.") (assert mtexe "FATAL: tt:server-process-run called without mtexe defined.") ;; mtest -server - -m testsuite:ext-tests -db 6.db - (let* ((dbfname (dbmod:run-id->dbfname run-id)) - (load (get-normalized-cpu-load)) - (trying (length (tt:find-server areapath dbfname))) - (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname)))) - (cond - ((> load 2.0) - (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server.") - (thread-sleep! 1)) - ((> nrun 100) - (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.") - (thread-sleep! 1)) - ((> trying 4) - (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. 
not starting another.") - (thread-sleep! 1)) - (else - (if (not (file-exists? (conc areapath"/logs"))) - (create-directory (conc areapath"/logs") #t)) - (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log")) - (cmdln (conc - mtexe - " -startdir "areapath - " -server - ";; (or target-host "-") - " -m testsuite:"testsuite - " -db "dbfname ;; (dbmod:run-id->dbfname run-id) - " " profile-mode - (conc " >> " logfile " 2>&1 &")))) - ;; we want the remote server to start in *toppath* so push there - ;; (push-directory areapath) ;; use cd in the command line instead - (debug:print 0 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath) - ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) - - (system cmdln) - ;; ;; use below to go back to nbfake - nbfake does cause trouble ... - ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ... - ;; (setenv "NBFAKE_LOG" logfile) - ;; (system (conc "cd "areapath" ; nbfake " cmdln)) - ;; (unsetenv "NBFAKE_QUIET") - ;; (unsetenv "NBFAKE_LOG") - - ;;(pop-directory) - ))))) + (let* ((dbfname (dbmod:run-id->dbfname run-id))) + (if (tt:too-recent-server-start dbfname) + #f + (let* ((load (get-normalized-cpu-load)) + (srvrs (tt:find-server areapath dbfname)) + (trying (length srvrs)) + (nrun (number-of-processes-running (conc "mtest.*server.*"testsuite".*"dbfname)))) + (cond + ((> load 2.0) + (debug:print 0 *default-log-port* "Normalized load "load" on " (get-host-name) " is over the limit of 2.0. Not starting a server. Please reduce the load on "(get-host-name)" by killing some processes") + (thread-sleep! 1) + #f) + ((> nrun 100) + (debug:print 0 *default-log-port* nrun" servers running on " (get-host-name) ", not starting another.") + (thread-sleep! 1) + #f) + ((> trying 2) + (debug:print 0 *default-log-port* trying" servers registered in .servinfo dir. 
not starting another.") + (thread-sleep! 1) + #f) + (else + (if (not (file-exists? (conc areapath"/logs"))) + (create-directory (conc areapath"/logs") #t)) + (let* ((logfile (conc areapath "/logs/server-"dbfname"-"(current-process-id)".log")) ;; -" curr-pid "-" target-host ".log")) + (cmdln (conc + mtexe + " -startdir "areapath + " -server - ";; (or target-host "-") + " -m testsuite:"testsuite + " -db "dbfname ;; (dbmod:run-id->dbfname run-id) + " " profile-mode + #;(conc " >> " logfile " 2>&1 &")))) + ;; we want the remote server to start in *toppath* so push there + ;; (push-directory areapath) ;; use cd in the command line instead + (debug:print 2 *default-log-port* "INFO: Trying to start server in tcp mode (" cmdln ") at "(common:human-time)" for "areapath) + ;; (debug:print 0 *default-log-port* "INFO: starting server at " (common:human-time)) + + (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ... + (setenv "NBFAKE_LOG" logfile) + (system (conc "cd "areapath" ; nbfake " cmdln)) + (unsetenv "NBFAKE_QUIET") + (unsetenv "NBFAKE_LOG") + ;; (system cmdln) + (hash-table-set! *last-server-start* dbfname (current-seconds)) + ;; ;; use below to go back to nbfake - nbfake does cause trouble ... + ;; (setenv "NBFAKE_QUIET" "yes") ;; BUG: change to with-environment-variable ... + ;; (setenv "NBFAKE_LOG" logfile) + ;; (system (conc "cd "areapath" ; nbfake " cmdln)) + ;; (unsetenv "NBFAKE_QUIET") + ;; (unsetenv "NBFAKE_LOG") + + ;;(pop-directory) + #t))))))) ;;====================================================================== ;; tcp connection stuff ;;====================================================================== @@ -868,18 +911,23 @@ ;; (connect-listener uconn port))) (define (setup-listener-portlogger uconn) (let ((port (portlogger:open-run-close portlogger:find-port))) (assert (tt? 
uconn) "FATAL: setup-listener called with wrong struct "uconn) + (debug:print 2 *default-log-port* "setup-listener-portlogger got port " port) (handle-exceptions exn (if (< port 65535) (begin (portlogger:open-run-close portlogger:set-failed port) (thread-sleep! 0.25) (setup-listener-portlogger uconn)) - #f) + (begin + (debug:print 0 *default-log-port* "setup-listener-portlogger: could not get a port") + #f + ) + ) (connect-listener uconn port)))) (define (connect-listener uconn port) ;; (tcp-listener-socket LISTENER)(socket-name so) ;; sockaddr-address, sockaddr-port, sockaddr->string Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -1435,19 +1435,19 @@ (out-dir (db:test-get-rundir test-dat)) (status-file (conc out-dir "/.final-status")) ) ;; first verify we are able to write the output file (if (not (file-write-access? out-dir)) - (debug:print 0 *default-log-port* "ERROR: cannot write .final-status to " out-dir) - (let* - ((outp (open-output-file status-file)) + (debug:print 0 *default-log-port* "ERROR: cannot write .final-status to " out-dir) + (let* ((outp (open-output-file status-file)) (status (db:test-get-status test-dat)) - (state (db:test-get-state test-dat))) - (fprintf outp "~S\n" state) - (fprintf outp "~S\n" status) - (close-output-port outp))))) - + (state (db:test-get-state test-dat))) + (with-output-to-port outp + (lambda () + (print state) ;; printf was putting in ", not sure why but that was a hassle in other contexts + (print status))) + (close-output-port outp))))) ;; summarize test in to a file test-summary.html in the test directory ;; (define (tests:summarize-test run-id test-id) (let* ((test-dat (rmt:get-test-info-by-id run-id test-id)) @@ -1964,11 +1964,12 @@ ;; test steps ;;====================================================================== ;; teststep-set-status! 
used to be here -(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat) +;; NOT NEEDED +#;(define (test-get-kill-request run-id test-id) ;; run-id test-name itemdat) (let* ((testdat (rmt:get-test-state-status-by-id run-id test-id))) (and testdat (equal? (car testdat) "KILLREQ")))) (define (test:tdb-get-rundat-count tdb) @@ -1993,11 +1994,11 @@ ;; This one is for running with no db access (i.e. via rmt: internally) (define (tests:set-full-meta-info db test-id run-id minutes work-area remtries) ;; (define (tests:set-full-meta-info test-id run-id minutes work-area) ;; (let ((remtries 10)) - (let* ((cpuload (get-cpu-load)) + (let* ((cpuload (commonmod:get-cpu-load)) (diskfree (get-df (current-directory))) (uname (get-uname "-srvpio")) (hostname (get-host-name))) (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname))) ADDED transport-mode.scm Index: transport-mode.scm ================================================================== --- /dev/null +++ transport-mode.scm @@ -0,0 +1,22 @@ +;;====================================================================== +;; set up transport, db cache and sync methods +;; +;; sync-method: 'original, 'attach or 'none +;; cache-method: 'tmp 'none +;; rmt:transport-mode: 'http, 'tcp, 'nfs +;; +;; NOTE: NOT ALL COMBINATIONS WORK +;; +;;====================================================================== + +;; uncomment this block to test without tcp +;; (dbfile:sync-method 'none) +;; (dbfile:cache-method 'none) +;; (rmt:transport-mode 'nfs) + +;; uncomment this block to test with tcp +(dbfile:sync-method 'attach) ;; attach) ;; original +(dbfile:cache-method 'tmp) +(rmt:transport-mode 'tcp) + + Index: utils/mt_xterm ================================================================== --- utils/mt_xterm +++ utils/mt_xterm @@ -20,18 +20,16 @@ MT_TMPDISPLAY=$DISPLAY MT_TMPUSER=$USER MT_HOME=$HOME tmpfile=`mktemp` - -grep -v "export USER=" megatest.sh | grep -v "export HOME=" > $tmpfile 
-source $tmpfile -rm $tmpfile - -# if [ -e megatest.sh ];then -#source megatest.sh -#fi +if [[ -e megatest.sh ]]; then + grep -v "export USER=" megatest.sh | grep -v "export HOME=" > $tmpfile + source $tmpfile + rm $tmpfile +fi + export DISPLAY=$MT_TMPDISPLAY export USER=$USER export HOME=$MT_HOME if [ x"$MT_XTERM_CMD" == "x" ];then