Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -52,10 +52,44 @@ csc $(CSCOPTS) $(OFILES) $(GOFILES) newdashboard.scm -o ndboard mtut: $(OFILES) mtut.scm csc $(CSCOPTS) $(OFILES) mtut.scm -o mtut +TCMTOBJS = \ + api.o \ + archive.o \ + cgisetup/models/pgdb.o \ + client.o \ + common.o \ + configf.o \ + daemon.o \ + db.o \ + env.o \ + http-transport.o \ + items.o \ + keys.o \ + launch.o \ + lock-queue.o \ + margs.o \ + mt.o \ + megatest-version.o \ + ods.o \ + portlogger.o \ + process.o \ + rmt.o \ + rpc-transport.o \ + runconfig.o \ + runs.o \ + server.o \ + tasks.o \ + tdb.o \ + tests.o \ + + +tcmt : $(TCMTOBJS) tcmt.scm + csc $(CSCOPTS) $(TCMTOBJS) tcmt.scm -o tcmt + # install documentation to $(PREFIX)/docs # DOES NOT REBUILD DOCS # $(PREFIX)/share/docs/megatest_manual.html : docs/manual/megatest_manual.html mkdir -p $(PREFIX)/share/docs @@ -114,10 +148,17 @@ $(INSTALL) mtut $(PREFIX)/bin/.$(ARCHSTR)/mtut $(PREFIX)/bin/mtutil : $(PREFIX)/bin/.$(ARCHSTR)/mtut utils/mk_wrapper utils/mk_wrapper $(PREFIX) mtut $(PREFIX)/bin/mtutil chmod a+x $(PREFIX)/bin/mtutil + +$(PREFIX)/bin/.$(ARCHSTR)/tcmt : tcmt + $(INSTALL) tcmt $(PREFIX)/bin/.$(ARCHSTR)/tcmt + +$(PREFIX)/bin/tcmt : $(PREFIX)/bin/.$(ARCHSTR)/tcmt utils/mk_wrapper + utils/mk_wrapper $(PREFIX) tcmt $(PREFIX)/bin/tcmt + chmod a+x $(PREFIX)/bin/tcmt # $(PREFIX)/bin/.$(ARCHSTR)/mdboard : multi-dboard # $(INSTALL) multi-dboard $(PREFIX)/bin/.$(ARCHSTR)/mdboard # $(PREFIX)/bin/mdboard : $(PREFIX)/bin/.$(ARCHSTR)/mdboard utils/mk_wrapper @@ -193,11 +234,13 @@ install : $(PREFIX)/bin/.$(ARCHSTR) $(PREFIX)/bin/.$(ARCHSTR)/mtest $(PREFIX)/bin/megatest \ $(PREFIX)/bin/.$(ARCHSTR)/dboard $(PREFIX)/bin/dashboard $(HELPERS) $(PREFIX)/bin/nbfake \ $(PREFIX)/bin/nbfind $(PREFIX)/bin/loadrunner $(PREFIX)/bin/viewscreen $(PREFIX)/bin/mt_xterm \ $(PREFIX)/share/docs/megatest_manual.html $(PREFIX)/bin/remrun $(PREFIX)/bin/mtutil \ - $(PREFIX)/share/db/mt-pg.sql $(PREFIX)/bin/.$(ARCHSTR)/ndboard $(PREFIX)/bin/newdashboard + $(PREFIX)/share/db/mt-pg.sql $(PREFIX)/bin/.$(ARCHSTR)/ndboard $(PREFIX)/bin/tcmt + +# $(PREFIX)/bin/newdashboard $(PREFIX)/bin/.$(ARCHSTR) : mkdir -p $(PREFIX)/bin/.$(ARCHSTR) mkdir -p $(PREFIX)/bin/.$(ARCHSTR)/lib @@ -304,5 +347,9 @@ fi portlogger-example : portlogger-example.scm api.o archive.o client.o common.o configf.o daemon.o dashboard-tests.o db.o dcommon.o ezsteps.o filedb.o genexample.o gutils.o http-transport.o items.o keys.o launch.o lock-queue.o margs.o megatest-version.o mt.o ods.o portlogger.o process.o rmt.o rpc-transport.o runconfig.o runs.o sdb.o server.o synchash.o tasks.o tdb.o tests.o tree.o csc $(CSCOPTS) portlogger-example.scm api.o archive.o client.o common.o configf.o daemon.o dashboard-tests.o db.o dcommon.o ezsteps.o filedb.o genexample.o gutils.o http-transport.o items.o keys.o launch.o lock-queue.o margs.o megatest-version.o mt.o ods.o portlogger.o process.o rmt.o rpc-transport.o runconfig.o runs.o sdb.o server.o synchash.o tasks.o tdb.o tests.o tree.o +# create a pdf dot graphviz diagram from notations in rmt.scm +rmt.pdf : rmt.scm + grep ';;DOT' rmt.scm | sed -e 's/.*;;DOT //' > rmt.dot;dot -Tpdf rmt.dot -o rmt.pdf + Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -56,10 +56,11 @@ get-run-ids-matching-target get-runs-by-patt get-steps-data get-steps-for-test read-test-data + read-test-data* login tasks-get-last testmeta-get-record have-incompletes? synchash-get @@ -126,10 +127,11 @@ (cond ((not (vector? dat)) ;; it is an error to not receive a vector (vector #f (vector #f "remote must be called with a vector"))) ((> *api-process-request-count* 20) ;; 20) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") + (set! *server-overloaded* #t) (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! (else (let* ((cmd-in (vector-ref dat 0)) (cmd (if (symbol? cmd-in) cmd-in @@ -152,11 +154,26 @@ ;; SERVERS ((start-server) (apply server:kind-run params)) ((kill-server) (set! *server-run* #f)) ;; TESTS - ((test-set-state-status-by-id) (apply mt:test-set-state-status-by-id dbstruct params)) + + ;;((test-set-state-status-by-id) (apply mt:test-set-state-status-by-id dbstruct params)) + ;;BB - commented out above because it was calling below, eventually, incorrectly (dbstruct passed to mt:test-set-state-status-by-id, which previosly did more, but now only passes thru to db:set-state-status-and-roll-up-items. + ((test-set-state-status-by-id) + + ;; (define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment) + (db:set-state-status-and-roll-up-items + dbstruct + (list-ref params 0) ; run-id + (list-ref params 1) ; test-name + #f ; item-path + (list-ref params 2) ; state + (list-ref params 3) ; status + (list-ref params 4) ; comment + )) + ((delete-test-records) (apply db:delete-test-records dbstruct params)) ((delete-old-deleted-test-records) (apply db:delete-old-deleted-test-records dbstruct params)) ((test-set-state-status) (apply db:test-set-state-status dbstruct params)) ((test-set-top-process-pid) (apply db:test-set-top-process-pid dbstruct params)) ((set-state-status-and-roll-up-items) (apply db:set-state-status-and-roll-up-items dbstruct params)) @@ -193,10 +210,15 @@ ;; TASKS ((tasks-add) (apply tasks:add dbstruct params)) ((tasks-set-state-given-param-key) (apply tasks:set-state-given-param-key dbstruct params)) ((tasks-get-last) (apply tasks:get-last dbstruct params)) + ;; NO SYNC DB + ((no-sync-set) (apply db:no-sync-set *no-sync-db* params)) + ((no-sync-get/default) (apply db:no-sync-get/default *no-sync-db* params)) + ((no-sync-del!) (apply db:no-sync-del! *no-sync-db* params)) + ;; ARCHIVES ;; ((archive-get-allocations) ((archive-register-disk) (apply db:archive-register-disk dbstruct params)) ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params)) ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey)) @@ -257,10 +279,11 @@ ((get-steps-data) (apply db:get-steps-data dbstruct params)) ((get-steps-for-test) (apply db:get-steps-for-test dbstruct params)) ;; TEST DATA ((read-test-data) (apply db:read-test-data dbstruct params)) + ((read-test-data*) (apply db:read-test-data* dbstruct params)) ;; MISC ((get-latest-host-load) (apply db:get-latest-host-load dbstruct params)) ((have-incompletes?) (apply db:have-incompletes? dbstruct params)) ((login) (apply db:login dbstruct params)) Index: archive.scm ================================================================== --- archive.scm +++ archive.scm @@ -29,11 +29,11 @@ (flavor 'plain) ;; type of machine to run jobs on (maxload 1.5) ;; max allowed load for this work (adisks (archive:get-archive-disks))) ;; get testdir size ;; - hand off du to job mgr - (if (and (file-exists? testdir) + (if (and (common:file-exists? testdir) (file-is-writable? testdir)) (let* ((dused (jobrunner:run-job flavor ;; machine type maxload ;; max allowed load '() ;; prevars - environment vars to set for the job @@ -136,11 +136,11 @@ (> (rmt:test-toplevel-num-items run-id test-name) 0))) (test-partial-path (conc target "/" run-name "/" (db:test-make-full-name test-name item-path))) ;; note the trailing slash to get the dir inspite of it being a link (test-path (conc linktree "/" test-partial-path)) (mutex-lock! rp-mutex) - (test-physical-path (if (file-exists? test-path) + (test-physical-path (if (common:file-exists? test-path) (common:real-path test-path) #f)) (mutex-unlock! rp-mutex) (partial-path-index (if test-physical-path (substring-index test-partial-path test-physical-path) #f)) (test-base (if (and partial-path-index @@ -151,11 +151,11 @@ #f))) (cond (toplevel/children (debug:print 0 *default-log-port* "WARNING: cannot archive " test-name " with id " test-id " as it is a toplevel test with children")) - ((not (file-exists? test-path)) + ((not (common:file-exists? test-path)) (debug:print 0 *default-log-port* "WARNING: Cannot archive " test-name "/" item-path " as path " test-path " does not exist")) (else (debug:print 0 *default-log-port* "From test-dat=" test-dat " derived the following:\n" "test-partial-path = " test-partial-path "\n" @@ -179,13 +179,13 @@ (conc "-" compress) ;; or (conc "--compress=" compress) "-n" (conc (common:get-testsuite-name) "-" run-id) (conc "--strip-path=" disk-group)) test-paths)) (print-prefix #f)) ;; "Running: ")) ;; change to #f to turn off printing - (if (not (file-exists? archive-dir)) + (if (not (common:file-exists? archive-dir)) (create-directory archive-dir #t)) - (if (not (file-exists? (conc archive-dir "/HEAD"))) + (if (not (common:file-exists? (conc archive-dir "/HEAD"))) (begin ;; replace this with jobrunner stuff enventually (debug:print-info 0 *default-log-port* "Init bup in " archive-dir) ;; (mutex-lock! bup-mutex) (run-n-wait bup-exe params: bup-init-params print-cmd: print-prefix) @@ -232,11 +232,11 @@ (test-partial-path (conc target "/" run-name "/" (db:test-make-full-name test-name item-path))) ;; note the trailing slash to get the dir inspite of it being a link (test-path (conc linktree "/" test-partial-path)) ;; if the old path was not deleted then prev-test-physical-path will end up pointing to a real directory (mutex-lock! rp-mutex) - (prev-test-physical-path (if (file-exists? test-path) + (prev-test-physical-path (if (common:file-exists? test-path) ;; (read-symbolic-link test-path #t) (common:real-path test-path) #f)) (mutex-unlock! rp-mutex) (new-test-physical-path (conc best-disk "/" test-partial-path)) @@ -249,11 +249,11 @@ ;; some sanity checks, move an existing path out of the way - iif it is not a toplevel with children ;; (if (and (not toplevel/children) ;; special handling needed for toplevel with children prev-test-physical-path - (file-exists? prev-test-physical-path)) ;; what to do? abort or clean up or link it in? + (common:file-exists? prev-test-physical-path)) ;; what to do? abort or clean up or link it in? (let* ((base (pathname-directory prev-test-physical-path)) (dirn (pathname-file prev-test-physical-path)) (newn (conc base "/." dirn))) (debug:print-error 0 *default-log-port* "the old directory " prev-test-physical-path ", still exists! Moving it to " newn) (rename-file prev-test-physical-path newn))) Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -43,11 +43,11 @@ (debug:print-error 0 *default-log-port* "bad value for setenv, key=" key ", value=" val)))) (define home (getenv "HOME")) (define user (getenv "USER")) -;; GLOBAL GLETCHES +;; GLOBALS ;; CONTEXTS (defstruct cxt (taskdb #f) (cmutex (make-mutex))) @@ -108,10 +108,12 @@ (define *db-access-mutex* (make-mutex)) (define *db-transaction-mutex* (make-mutex)) (define *db-cache-path* #f) (define *db-with-db-mutex* (make-mutex)) (define *db-api-call-time* (make-hash-table)) ;; hash of command => (list of times) +;; no sync db +(define *no-sync-db* #f) ;; SERVER (define *my-client-signature* #f) (define *transport-type* 'http) ;; override with [server] transport http|rpc|nmsg (define *runremote* #f) ;; if set up for server communication this will hold @@ -126,10 +128,11 @@ (define *home-host* #f) ;; (define *total-non-write-delay* 0) (define *heartbeat-mutex* (make-mutex)) (define *api-process-request-count* 0) (define *max-api-process-requests* 0) +(define *server-overloaded* #f) ;; client (define *rmt-mutex* (make-mutex)) ;; remote access calls mutex ;; RPC transport @@ -146,10 +149,13 @@ (define *run-info-cache* (make-hash-table)) ;; run info is stable, no need to reget (define *launch-setup-mutex* (make-mutex)) ;; need to be able to call launch:setup often so mutex it and re-call the real deal only if *toppath* not set (define *homehost-mutex* (make-mutex)) +;; Miscellaneous +(define *triggers-mutex* (make-mutex)) ;; block overlapping processing of triggers + (defstruct remote (hh-dat (common:get-homehost)) ;; homehost record ( addr . hhflag ) (server-url (if *toppath* (server:check-if-running *toppath*))) ;; (server:check-if-running *toppath*) #f)) (last-server-check 0) ;; last time we checked to see if the server was alive (conndat #f) @@ -233,26 +239,27 @@ (not (equal? (common:get-last-run-version) (common:version-signature)))) (define (common:api-changed?) (not (equal? (substring (->string megatest-version) 0 4) - (substring (common:get-last-run-version) 0 4)))) + (substring (conc (common:get-last-run-version)) 0 4)))) ;; Move me elsewhere ... ;; RADT => Why do we meed the version check here, this is called only if version misma ;; -(define (common:cleanup-db dbstruct) - (db:multi-db-sync +(define (common:cleanup-db dbstruct #!key (full #f)) + (apply db:multi-db-sync dbstruct 'schema ;; 'new2old 'killservers - 'dejunk 'adj-target ;; 'old2new 'new2old - ) + (if full + '(dejunk) + '())) (if (common:api-changed?) (common:set-last-run-version))) ;; Rotate logs, logic: ;; if > 500k and older than 1 week: @@ -273,11 +280,11 @@ (> (file-size fullname) 200000)) (and (string-match "^server-.*.log" file) (> (- (current-seconds) (file-modification-time fullname)) (* 8 60 60)))) (let ((gzfile (conc fullname ".gz"))) - (if (file-exists? gzfile) + (if (common:file-exists? gzfile) (begin (debug:print-info 0 *default-log-port* "removing " gzfile) (delete-file gzfile))) (debug:print-info 0 *default-log-port* "compressing " file) (system (conc "gzip " fullname))) @@ -303,11 +310,11 @@ "WARNING: Version mismatch!\n" " expected: " (common:version-signature) "\n" " got: " (common:get-last-run-version)) (cond ((get-environment-variable "MT_SKIP_DB_MIGRATE") #t) - ((and (file-exists? mtconf) (file-exists? dbfile) (not read-only) + ((and (common:file-exists? mtconf) (common:file-exists? dbfile) (not read-only) (eq? (current-user-id)(file-owner mtconf))) ;; safe to run -cleanup-db (debug:print 0 *default-log-port* " I see you are the owner of megatest.config, attempting to cleanup and reset to new version") (handle-exceptions exn (begin @@ -314,14 +321,14 @@ (debug:print 0 *default-log-port* "Failed to switch versions.") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (print-call-chain (current-error-port)) (exit 1)) (common:cleanup-db dbstruct))) - ((not (file-exists? mtconf)) + ((not (common:file-exists? mtconf)) (debug:print 0 *default-log-port* " megatest.config does not exist in this area. Cannot proceed with megatest version migration.") (exit 1)) - ((not (file-exists? dbfile)) + ((not (common:file-exists? dbfile)) (debug:print 0 *default-log-port* " megatest.db does not exist in this area. Cannot proceed with megatest version migration.") (exit 1)) ((not (eq? (current-user-id)(file-owner mtconf))) (debug:print 0 *default-log-port* " You do not own megatest.db in this area. Cannot proceed with megatest version migration.") (exit 1)) @@ -328,14 +335,14 @@ (read-only (debug:print 0 *default-log-port* " You have read-only access to this area. Cannot proceed with megatest version migration.") (exit 1)) (else (debug:print 0 *default-log-port* " to switch versions you can run: \"megatest -cleanup-db\"") - (exit 1))))) - (begin - (debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.") - (exit 1)))) + (exit 1))))))) +;; (begin +;; (debug:print 0 *default-log-port* "ERROR: cannot migrate version unless on homehost. Exiting.") +;; (exit 1)))) ;;====================================================================== ;; S P A R S E A R R A Y S ;;====================================================================== @@ -431,11 +438,11 @@ ;; (define (common:simple-file-lock fname #!key (expire-time 300)) (handle-exceptions exn #f ;; don't really care what went wrong right now. NOTE: I have not seen this one actually fail. - (if (file-exists? fname) + (if (common:file-exists? fname) (if (> (- (current-seconds)(file-modification-time fname)) expire-time) (begin (delete-file* fname) (common:simple-file-lock fname expire-time: expire-time)) #f) @@ -442,11 +449,11 @@ (let ((key-string (conc (get-host-name) "-" (current-process-id)))) (with-output-to-file fname (lambda () (print key-string))) (thread-sleep! 0.25) - (if (file-exists? fname) + (if (common:file-exists? fname) (with-input-from-file fname (lambda () (equal? key-string (read-line)))) #f))))) @@ -634,23 +641,10 @@ ;; (let ((ohh (common:on-homehost?)) ;; (srv (args:get-arg "-server"))) ;; (and ohh srv))) ;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv) - -;;;; run-ids -;; if #f use *db-local-sync* : or 'local-sync-flags -;; if #t use timestamps : or 'timestamps -(define (common:sync-to-megatest.db dbstruct) - (let ((start-time (current-seconds)) - (res (db:multi-db-sync dbstruct 'new2old))) - (let ((sync-time (- (current-seconds) start-time))) - (debug:print-info 3 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id)) - (if (common:low-noise-print 30 "sync new to old") - (debug:print-info 0 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id)))) - res)) - (define *wdnum* 0) (define *wdnum*mutex (make-mutex)) @@ -677,102 +671,34 @@ (< duration-since-last-sync sync-cool-off-duration)) (thread-sleep! (- sync-cool-off-duration duration-since-last-sync))) (if (not *time-to-exit*) (let ((golden-mtdb-mtime (file-modification-time golden-mtpath)) (tmp-mtdb-mtime (file-modification-time tmp-mtpath))) - (if (> golden-mtdb-mtime tmp-mtdb-mtime) - (let ((res (db:multi-db-sync dbstruct 'old2new))) - (debug:print-info 13 *default-log-port* "rosync called, " res " records transferred."))) + (if (> golden-mtdb-mtime tmp-mtdb-mtime) + (if (< golden-mtdb-mtime (- (current-seconds) 3)) ;; file has NOT been touched in past three seconds, this way multiple servers won't fight to sync back + (let ((res (db:multi-db-sync dbstruct 'old2new))) + (debug:print-info 13 *default-log-port* "rosync called, " res " records transferred.")))) (loop (current-seconds))) #t))) (debug:print-info 0 *default-log-port* "Exiting readonly-watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" mtpath="golden-mtpath))) - - -(define (common:writable-watchdog dbstruct) - (thread-sleep! 0.05) ;; delay for startup - (let ((legacy-sync (common:run-sync?)) - (debug-mode (debug:debug-mode 1)) - (last-time (current-seconds)) - (this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x)))) - (debug:print-info 2 *default-log-port* "Periodic sync thread started.") - (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num) - (if (and legacy-sync (not *time-to-exit*)) - (let* (;;(dbstruct (db:setup)) - (mtdb (dbr:dbstruct-mtdb dbstruct)) - (mtpath (db:dbdat-get-path mtdb))) - (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") - (let loop () - ;; sync for filesystem local db writes - ;; - (mutex-lock! *db-multi-sync-mutex*) - (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write - (sync-in-progress *db-sync-in-progress*) - (should-sync (and (not *time-to-exit*) - (> (- (current-seconds) *db-last-sync*) 5))) ;; sync every five seconds minimum - (start-time (current-seconds)) - (mt-mod-time (file-modification-time mtpath)) - (recently-synced (< (- start-time mt-mod-time) 4)) - (will-sync (and (or need-sync should-sync) - (not sync-in-progress) - (not recently-synced)))) - (debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop. need-sync="need-sync" sync-in-progress="sync-in-progress" should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync) - ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced)) - ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) - (if will-sync (set! *db-sync-in-progress* #t)) - (mutex-unlock! *db-multi-sync-mutex*) - (if will-sync - (let ((res (common:sync-to-megatest.db dbstruct))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive - (if (> res 0) ;; some records were transferred, keep the db alive - (begin - (mutex-lock! *heartbeat-mutex*) - (set! *db-last-access* (current-seconds)) - (mutex-unlock! *heartbeat-mutex*) - (debug:print-info 0 *default-log-port* "sync called, " res " records transferred.")) - (debug:print-info 2 *default-log-port* "sync called but zero records transferred")))) - (if will-sync - (begin - (mutex-lock! *db-multi-sync-mutex*) - (set! *db-sync-in-progress* #f) - (set! *db-last-sync* start-time) - (mutex-unlock! *db-multi-sync-mutex*))) - (if (and debug-mode - (> (- start-time last-time) 60)) - (begin - (set! last-time start-time) - (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*)))))) - - ;; keep going unless time to exit - ;; - (if (not *time-to-exit*) - (let delay-loop ((count 0)) - ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) - - (if (and (not *time-to-exit*) - (< count 4)) ;; was 11, changing to 4. - (begin - (thread-sleep! 1) - (delay-loop (+ count 1)))) - (if (not *time-to-exit*) (loop)))) - (if (common:low-noise-print 30) - (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" this-wd-num="this-wd-num))))))) - ;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage (define (common:watchdog) (debug:print-info 13 *default-log-port* "common:watchdog entered.") - (if (common:on-homehost?) - (let ((dbstruct (db:setup #t))) - (debug:print-info 13 *default-log-port* "after db:setup with dbstruct="dbstruct) - (cond - ((dbr:dbstruct-read-only dbstruct) - (debug:print-info 13 *default-log-port* "loading read-only watchdog") - (common:readonly-watchdog dbstruct)) - (else - (debug:print-info 13 *default-log-port* "loading writable-watchdog.") - (common:writable-watchdog dbstruct))) - (debug:print-info 13 *default-log-port* "watchdog done.")) - (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost"))) + (if (launch:setup) + (if (common:on-homehost?) + (let ((dbstruct (db:setup #t))) + (debug:print-info 13 *default-log-port* "after db:setup with dbstruct=" dbstruct) + (cond + ((dbr:dbstruct-read-only dbstruct) + (debug:print-info 13 *default-log-port* "loading read-only watchdog") + (common:readonly-watchdog dbstruct)) + (else + (debug:print-info 13 *default-log-port* "loading writable-watchdog.") + (server:writable-watchdog dbstruct))) + (debug:print-info 13 *default-log-port* "watchdog done.")) + (debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost")))) (define (std-exit-procedure) (on-exit (lambda () 0)) ;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*) @@ -792,14 +718,15 @@ (begin (sqlite3:interrupt! db) (sqlite3:finalize! db #t) ;; (vector-set! *task-db* 0 #f) (set! *task-db* #f))))) - (if (and *runremote* - (remote-conndat *runremote*)) - (begin - (http-client#close-all-connections!))) ;; for http-client + (http-client#close-all-connections!) + ;; (if (and *runremote* + ;; (remote-conndat *runremote*)) + ;; (begin + ;; (http-client#close-all-connections!))) ;; for http-client (if (not (eq? *default-log-port* (current-error-port))) (close-output-port *default-log-port*)) (set! *default-log-port* (current-error-port))) "Cleanup db exit thread")) (th2 (make-thread (lambda () (debug:print 4 *default-log-port* "Attempting clean exit. Please be patient and wait a few seconds...") @@ -874,19 +801,19 @@ #f (let loop ((hed (car cmds)) (tal (cdr cmds))) (let ((res (with-input-from-pipe (conc "which " hed) read-line))) (if (and (string? res) - (file-exists? res)) + (common:file-exists? res)) res (if (null? tal) #f (loop (car tal)(cdr tal)))))))) (define (common:get-install-area) (let ((exe-path (car (argv)))) - (if (file-exists? exe-path) + (if (common:file-exists? exe-path) (handle-exceptions exn #f (pathname-directory (pathname-directory @@ -1123,11 +1050,11 @@ (begin (mutex-unlock! *homehost-mutex*) (debug:print 0 *default-log-port* "ERROR: Failed to read .homehost file after trying five times. Giving up and exiting, message: " ((condition-property-accessor 'exn 'message) exn)) (exit 1))) (let ((hhf (conc *toppath* "/.homehost"))) - (if (file-exists? hhf) + (if (common:file-exists? hhf) (with-input-from-file hhf read-line) (if (file-write-access? *toppath*) (begin (with-output-to-file hhf (lambda () @@ -1152,11 +1079,11 @@ ;; do we honor the caches of the config files? ;; (define (common:use-cache?) (let ((res #t)) ;; priority by order of evaluation - (if *configdat* + (if *configdat* ;; sillyness here. can't use setup/use-cache to know if we can use the cached files! (if (equal? (configf:lookup *configdat* "setup" "use-cache") "no") (set! res #f) (if (equal? (configf:lookup *configdat* "setup" "use-cache") "yes") (set! res #t)))) (if (args:get-arg "-no-cache")(set! res #f)) ;; overrides setting in "setup" @@ -1556,31 +1483,36 @@ (set! best-load load) (set! best-host hostname))))) hosts) best-host)) - - - (define (common:wait-for-cpuload maxload numcpus waitdelay #!key (count 1000) (msg #f)(remote-host #f)) (let* ((loadavg (common:get-cpu-load remote-host)) (first (car loadavg)) (next (cadr loadavg)) (adjload (* maxload numcpus)) (loadjmp (- first next))) (cond ((and (> first adjload) (> count 0)) - (debug:print-info 0 *default-log-port* "waiting " waitdelay " seconds due to load " first " exceeding max of " adjload (if msg msg "")) + (debug:print-info 0 *default-log-port* "waiting " waitdelay " seconds due to load " first " exceeding max of " adjload " " (if msg msg "")) (thread-sleep! waitdelay) (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1))) ((and (> loadjmp numcpus) (> count 0)) (debug:print-info 0 *default-log-port* "waiting " waitdelay " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) (thread-sleep! waitdelay) (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1)))))) +(define (common:wait-for-homehost-load maxload msg) + (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. + #f + (common:get-homehost))) + (hh (if hh-dat (car hh-dat) #f)) + (numcpus (common:get-num-cpus hh))) + (common:wait-for-normalized-load maxload msg: msg remote-host: hh))) + (define (common:get-num-cpus remote-host) (let ((proc (lambda () (let loop ((numcpu 0) (inl (read-line))) (if (eof-object? inl) @@ -1597,11 +1529,11 @@ ;; wait for normalized cpu load to drop below maxload ;; (define (common:wait-for-normalized-load maxload #!key (msg #f)(remote-host #f)) (let ((num-cpus (common:get-num-cpus remote-host))) - (common:wait-for-cpuload maxload num-cpus 15 msg: msg))) + (common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host))) (define (get-uname . params) (let* ((uname-res (process:cmd-run->list (conc "uname " (if (null? params) "-a" (car params))))) (uname #f)) (if (null? (car uname-res)) @@ -1755,11 +1687,12 @@ (let* ((key (car keyval)) (val (cdr keyval)) (delim (if (string-search whitesp val) "\"" ""))) - (print (if (member key ignorevars) + (print (if (or (member key ignorevars) + (string-search whitesp key)) "# setenv " "setenv ") key " " delim (mungeval val) delim))) envvars))) (with-output-to-file (conc fname ".sh") @@ -1769,10 +1702,11 @@ (val (cdr keyval)) (delim (if (string-search whitesp val) "\"" ""))) (print (if (or (member key ignorevars) + (string-search whitesp key) (string-search ":" key)) ;; internal only values to be skipped. "# export " "export ") key "=" delim (mungeval val) delim))) envvars))))) @@ -1786,11 +1720,11 @@ (let* ((var (car p)) (val (cadr p)) (prv (get-environment-variable var))) (set! res (cons (list var prv) res)) (if val - (setenv var (->string val)) + (safe-setenv var (->string val)) (unsetenv var)))) lst) res) '())) @@ -2138,21 +2072,29 @@ (number->string x 16)) (map string->number (string-split instr))) "/")) -(define (common:faux-lock keyname) - (if (rmt:get-var keyname) - #f +(define (common:faux-lock keyname #!key (wait-time 8)) + (if (rmt:no-sync-get/default keyname #f) ;; do not be tempted to compare to pid. locking is a one-shot action, if already locked for this pid it doesn't actually count + (if (> wait-time 0) + (begin + (thread-sleep! 1) + (if (eq? wait-time 1) ;; only one second left, steal the lock + (begin + (debug:print-info 0 *default-log-port* "stealing lock for " keyname) + (common:faux-unlock keyname force: #t))) + (common:faux-lock keyname wait-time: (- wait-time 1))) + #f) (begin - (rmt:set-var keyname (conc (current-process-id))) - (equal? (conc (current-process-id)) (conc (rmt:get-var keyname)))))) + (rmt:no-sync-set keyname (conc (current-process-id))) + (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))))) (define (common:faux-unlock keyname #!key (force #f)) - (if (or force (equal? (conc (current-process-id)) (conc (rmt:get-var keyname)))) + (if (or force (equal? (conc (current-process-id)) (conc (rmt:no-sync-get/default keyname #f)))) (begin - (if (rmt:get-var keyname) (rmt:del-var keyname)) + (if (rmt:no-sync-get/default keyname #f) (rmt:no-sync-del! keyname)) #t) #f)) (define (common:in-running-test?) @@ -2306,14 +2248,14 @@ ;; (define (common:load-views-config) (let* ((view-cfgdat (make-hash-table)) (home-cfgfile (conc (get-environment-variable "HOME") "/.mtviews.config")) (mthome-cfgfile (conc *toppath* "/.mtviews.config"))) - (if (file-exists? mthome-cfgfile) + (if (common:file-exists? mthome-cfgfile) (read-config mthome-cfgfile view-cfgdat #t)) ;; we load the home dir file AFTER the MTRAH file so the user can clobber settings when running the dashboard in read-only areas - (if (file-exists? home-cfgfile) + (if (common:file-exists? home-cfgfile) (read-config home-cfgfile view-cfgdat #t)) view-cfgdat)) ;;====================================================================== ;; Manage pkts, used in servers, tests and likely other contexts so put @@ -2341,50 +2283,62 @@ (pktsdirs (if pktsdirs-str (string-split pktsdirs-str " ") #f))) pktsdirs)) +;; use-lt is use linktree "lt" link to find pkts dir (define (common:with-queue-db mtconf proc #!key (use-lt #f)(toppath-in #f)) (let* ((pktsdirs (common:get-pkts-dirs mtconf use-lt)) (pktsdir (if pktsdirs (car pktsdirs) #f)) (toppath (or (configf:lookup mtconf "scratchdat" "toppath") toppath-in)) (pdbpath (or (configf:lookup mtconf "setup" "pdbpath") pktsdir))) - (if (not (and pktsdir toppath pdbpath)) - (begin - (print "ERROR: settings are missing in your megatest.config for area management.") - (print " you need to have pktsdir in the [setup] section.")) + (cond + ((not (and pktsdir toppath pdbpath)) + (debug:print 0 *default-log-port* "ERROR: settings are missing in your megatest.config for area management.") + (debug:print 0 *default-log-port* " you need to have pktsdir in the [setup] section.")) + ((not (common:file-exists? pktsdir)) + (debug:print 0 *default-log-port* "ERROR: pkts directory not found " pktsdir)) + ((not (equal? (file-owner pktsdir)(current-effective-user-id))) + (debug:print 0 *default-log-port* "ERROR: directory " pktsdir " is not owned by " (current-effective-user-name))) + (else (let* ((pdb (open-queue-db pdbpath "pkts.db" schema: '("CREATE TABLE groups (id INTEGER PRIMARY KEY,groupname TEXT, CONSTRAINT group_constraint UNIQUE (groupname));")))) (proc pktsdirs pktsdir pdb) - (dbi:close pdb))))) + (dbi:close pdb)))))) (define (common:load-pkts-to-db mtconf) (common:with-queue-db mtconf (lambda (pktsdirs pktsdir pdb) (for-each (lambda (pktsdir) ;; look at all - (if (and (file-exists? pktsdir) - (directory? pktsdir) - (file-read-access? pktsdir)) - (let ((pkts (glob (conc pktsdir "/*.pkt")))) - (for-each - (lambda (pkt) - (let* ((uuid (cadr (string-match ".*/([0-9a-f]+).pkt" pkt))) - (exists (lookup-by-uuid pdb uuid #f))) - (if (not exists) - (let* ((pktdat (string-intersperse - (with-input-from-file pkt read-lines) - "\n")) - (apkt (pkt->alist pktdat)) - (ptype (alist-ref 'T apkt))) - (add-to-queue pdb pktdat uuid (or ptype 'cmd) #f 0) - (debug:print 4 *default-log-port* "Added " uuid " of type " ptype " to queue")) - (debug:print 4 *default-log-port* "pkt: " uuid " exists, skipping...") - ))) - pkts)))) + (cond + ((not (common:file-exists? pktsdir)) + (debug:print 0 *default-log-port* "ERROR: packets directory " pktsdir " does not exist.")) + ((not (directory? pktsdir)) + (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not a directory.")) + ((not (file-read-access? pktsdir)) + (debug:print 0 *default-log-port* "ERROR: packets directory path " pktsdir " is not readable.")) + (else + (debug:print-info 0 *default-log-port* "Loading packets found in " pktsdir) + (let ((pkts (glob (conc pktsdir "/*.pkt")))) + (for-each + (lambda (pkt) + (let* ((uuid (cadr (string-match ".*/([0-9a-f]+).pkt" pkt))) + (exists (lookup-by-uuid pdb uuid #f))) + (if (not exists) + (let* ((pktdat (string-intersperse + (with-input-from-file pkt read-lines) + "\n")) + (apkt (pkt->alist pktdat)) + (ptype (alist-ref 'T apkt))) + (add-to-queue pdb pktdat uuid (or ptype 'cmd) #f 0) + (debug:print 4 *default-log-port* "Added " uuid " of type " ptype " to queue")) + (debug:print 4 *default-log-port* "pkt: " uuid " exists, skipping...") + ))) + pkts))))) pktsdirs)))) (define (common:get-pkt-alists pkts) (map (lambda (x) (alist-ref 'apkt x)) ;; 'pkta pulls out the alist from the read pkt Index: configf.scm ================================================================== --- configf.scm +++ configf.scm @@ -23,18 +23,18 @@ ;; return list (path fullpath configname) (define (find-config configname #!key (toppath #f)) (if toppath (let ((cfname (conc toppath "/" configname))) - (if (file-exists? cfname) + (if (common:file-exists? cfname) (list toppath cfname configname) (list #f #f #f))) (let* ((cwd (string-split (current-directory) "/"))) (let loop ((dir cwd)) (let* ((path (conc "/" (string-intersperse dir "/"))) (fullpath (conc path "/" configname))) - (if (file-exists? fullpath) + (if (common:file-exists? fullpath) (list path fullpath configname) (let ((remcwd (take dir (- (length dir) 1)))) (if (null? remcwd) (list #f #f #f) ;; #f #f) (loop remcwd))))))))) @@ -228,11 +228,11 @@ (define (read-config path ht allow-system #!key (environ-patt #f) (curr-section #f) (sections #f) (settings (make-hash-table)) (keep-filenames #f) (post-section-procs '()) (apply-wildcards #t)) (debug:print 9 *default-log-port* "START: " path) (if (and (not (port? path)) - (not (file-exists? path))) ;; for case where we are handed a port + (not (common:file-exists? path))) ;; for case where we are handed a port (begin (debug:print-info 1 *default-log-port* "read-config - file not found " path " current path: " (current-directory)) ;; WARNING: This is a risky change but really, we should not return an empty hash table if no file read? #f) ;; (if (not ht)(make-hash-table) ht)) (let ((inp (if (string? path) @@ -281,11 +281,11 @@ (common:nice-path (conc (if curr-conf-dir curr-conf-dir ".") "/" include-file))))) - (if (file-exists? full-conf) + (if (common:file-exists? full-conf) (begin ;; (push-directory conf-dir) (debug:print 9 *default-log-port* "Including: " full-conf) (read-config full-conf res allow-system environ-patt: environ-patt curr-section: curr-section-name sections: sections settings: settings keep-filenames: keep-filenames) ;; (pop-directory) @@ -297,11 +297,11 @@ (configf:script-rx ( x include-script params);; handle-exceptions ;; exn ;; (begin ;; (debug:print '(0 2 9) #f "INFO: include from script " include-script " failed.") ;; (loop (configf:read-line inp res (calc-allow-system allow-system curr-section-name sections) settings) curr-section-name #f #f)) - (if (and (file-exists? include-script)(file-execute-access? include-script)) + (if (and (common:file-exists? include-script)(file-execute-access? include-script)) (let* ((new-inp-port (open-input-pipe (conc include-script " " params)))) (debug:print '(2 9) *default-log-port* "Including from script output: " include-script) ;; (print "We got here, calling read-config next. Port is: " new-inp-port) (read-config new-inp-port res allow-system environ-patt: environ-patt curr-section: curr-section-name sections: sections settings: settings keep-filenames: keep-filenames) (close-input-port new-inp-port) @@ -510,11 +510,11 @@ (if (null? tal) newres (loop (car tal)(cdr tal) newres)))))) (define (configf:file->list fname) - (if (file-exists? fname) + (if (common:file-exists? fname) (let ((inp (open-input-file fname))) (let loop ((inl (read-line inp)) (res '())) (if (eof-object? inl) (begin @@ -614,11 +614,11 @@ ;; reads a refdb into an assoc array of assoc arrays ;; returns (list dat msg) (define (configf:read-refdb refdb-path) (let ((sheets-file (conc refdb-path "/sheet-names.cfg"))) - (if (not (file-exists? sheets-file)) + (if (not (common:file-exists? sheets-file)) (list #f (conc "ERROR: no refdb found at " refdb-path)) (if (not (file-read-access? sheets-file)) (list #f (conc "ERROR: refdb file not readable at " refdb-path)) (let* ((sheets (with-input-from-file sheets-file (lambda () @@ -686,36 +686,33 @@ #f (configf:alist->config (with-input-from-file fname read)))) (define (configf:write-alist cdat fname) - (if (common:faux-lock fname) - (let* ((dat (configf:config->alist cdat)) - (res - (begin - (with-output-to-file fname ;; first write out the file - (lambda () - (pp dat))) - - (if (common:file-exists? fname) ;; now verify it is readable - (if (configf:read-alist fname) - #t ;; data is good. - (begin - (handle-exceptions - exn - #f - (debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.") - (delete-file fname)) - #f)) - #f)))) - - (common:faux-unlock fname) - res) - (begin - (debug:print 0 *default-log-port* "WARNING: could not get faux-lock on " fname) - #f))) - + (if (not (common:faux-lock fname)) + (debug:print 0 *default-log-port* "INFO: Could not get lock on " fname)) + (let* ((dat (configf:config->alist cdat)) + (res + (begin + (with-output-to-file fname ;; first write out the file + (lambda () + (pp dat))) + + (if (common:file-exists? fname) ;; now verify it is readable + (if (configf:read-alist fname) + #t ;; data is good. + (begin + (handle-exceptions + exn + #f + (debug:print 0 *default-log-port* "WARNING: content " dat " for cache " fname " is not readable. Deleting generated file.") + (delete-file fname)) + #f)) + #f)))) + (common:faux-unlock fname) + res)) + ;; convert hierarchial list to ini format ;; (define (configf:config->ini data) (map (lambda (section) Index: dashboard-tests.scm ================================================================== --- dashboard-tests.scm +++ dashboard-tests.scm @@ -234,11 +234,11 @@ ;; if there is a submegatest create a button to launch dashboard in that area ;; (define (submegatest-panel dbstruct keydat testdat runname testconfig) (let* ((subarea (configf:lookup testconfig "setup" "submegatest")) - (area-exists (and subarea (file-exists? subarea)))) + (area-exists (and subarea (common:file-exists? subarea)))) ;; (debug:print-info 0 *default-log-port* "Megatest subarea=" subarea ", area-exists=" area-exists) (if subarea (iup:frame #:title "Megatest Run Info" ; #:expand "YES" (iup:button @@ -458,11 +458,11 @@ "/")) (item-path (db:test-get-item-path testdat)) ;; this next block was added to fix a bug where variables were ;; needed. Revisit this. (runconfig (let ((runconfigf (conc *toppath* "/runconfigs.config"))) ;; no rush but it would be good to convert this call to use runconfig:read - (if (file-exists? runconfigf) + (if (common:file-exists? runconfigf) (handle-exceptions exn #f ;; do nothing, just keep on trucking .... (setup-env-defaults runconfigf run-id (make-hash-table) keydat environ-patt: keystring)) (make-hash-table)))) @@ -472,18 +472,18 @@ (handle-exceptions exn ;; NOTE: I've no idea why this was written this way. Research, study and fix needed! (tests:get-testconfig (db:test-get-testname testdat) (db:test-get-item-path testdat) test-registry #f) (tests:get-testconfig (db:test-get-testname testdat) item-path test-registry #t)))) (viewlog (lambda (x) - (if (file-exists? logfile) + (if (common:file-exists? logfile) ;(system (conc "firefox " logfile "&")) (dcommon:run-html-viewer logfile) (message-window (conc "File " logfile " not found"))))) (view-a-log (lambda (lfile) (let ((lfilename (conc rundir "/" lfile))) ;; (print "lfilename: " lfilename) - (if (file-exists? lfilename) + (if (common:file-exists? lfilename) ;(system (conc "firefox " logfile "&")) (dcommon:run-html-viewer lfilename) (message-window (conc "File " lfilename " not found")))))) (xterm (lambda (x) (if (directory-exists? rundir) @@ -496,11 +496,11 @@ "MT_.*")) (message-window (conc "Directory " rundir " not found"))))) (widgets (make-hash-table)) (refreshdat (lambda () (let* ((curr-mod-time (file-modification-time db-path)) - ;; (max ..... (if (file-exists? testdat-path) + ;; (max ..... (if (common:file-exists? testdat-path) ;; (file-modification-time testdat-path) ;; (begin ;; (set! testdat-path (conc rundir "/testdat.db")) ;; 0)))) (need-update (or (and (>= curr-mod-time db-mod-time) Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -1922,11 +1922,11 @@ (let* ((success #t) ;; at any stage of the process set this flag to #f to skip downstream steps. Intention here is to recover gracefully if user provided tabs fail to load. (source (configf:lookup views-cfgdat view-name "source")) (viewgen (configf:lookup views-cfgdat view-name "viewgen")) (updater (configf:lookup views-cfgdat view-name "updater")) (result-child #f)) - (if (and (file-exists? source) + (if (and (common:file-exists? source) (file-read-access? source)) (handle-exceptions exn (begin (print-call-chain) @@ -2829,11 +2829,11 @@ (glob (conc dbdir "/*.db*")))))) (define (dashboard:monitor-changed? commondat tabdat) (let* ((run-update-time (current-seconds)) (monitor-db-path (dboard:tabdat-monitor-db-path tabdat)) - (monitor-modtime (if (and monitor-db-path (file-exists? monitor-db-path)) + (monitor-modtime (if (and monitor-db-path (common:file-exists? monitor-db-path)) (file-modification-time monitor-db-path) -1))) (if (and (eq? (dboard:commondat-curr-tab-num commondat) 0) (or (> monitor-modtime *last-monitor-update-time*) (> (- run-update-time *last-monitor-update-time*) 5))) ;; update every 1/2 minute just in case @@ -3546,11 +3546,11 @@ ;; The heavy lifting starts here ;;====================================================================== (define (main) (let ((mtdb-path (conc *toppath* "/megatest.db"))) ;; - (if (and (file-exists? mtdb-path) + (if (and (common:file-exists? mtdb-path) (file-write-access? mtdb-path)) (if (not (args:get-arg "-skip-version-check")) (common:exit-on-version-changed))) (let* ((commondat (dboard:commondat-make))) ;; Move this stuff to db.scm? I'm not sure that is the right thing to do... @@ -3605,12 +3605,12 @@ (thread-start! th2) (thread-join! th2))))) ;; ease debugging by loading ~/.dashboardrc (let ((debugcontrolf (conc (get-environment-variable "HOME") "/.dashboardrc"))) - (if (file-exists? debugcontrolf) + (if (common:file-exists? debugcontrolf) (load debugcontrolf))) (if (args:get-arg "-repl") (repl) (main)) Index: datashare.scm ================================================================== --- datashare.scm +++ datashare.scm @@ -226,11 +226,11 @@ (if (and path (directory? path) (file-read-access? path)) (let* ((dbpath (conc path "/datashare.db")) (writeable (file-write-access? dbpath)) - (dbexists (file-exists? dbpath)) + (dbexists (common:file-exists? dbpath)) (handler (make-busy-timeout 136000))) (handle-exceptions exn (begin (debug:print 2 *default-log-port* "ERROR: problem accessing db " dbpath @@ -413,11 +413,11 @@ paths)) ;; remove existing link and if possible ... ;; create path to next of tip of target, create link back to source (define (datashare:build-dir-make-link source target) - (if (file-exists? target)(datashare:backup-move target)) + (if (common:file-exists? target)(datashare:backup-move target)) (create-directory (pathname-directory target) #t) (create-symbolic-link source target)) (define (datashare:backup-move path) (let* ((trashdir (conc (pathname-directory path) "/.trash")) @@ -518,11 +518,11 @@ (define (datashare:path->lst path) (string-split path "/")) (define (datashare:pathdat-apply-heuristics configdat path) (cond - ((file-exists? path) "found") + ((common:file-exists? path) "found") (else (conc path " not installed")))) (define (datashare:get-view configdat) (iup:vbox (iup:hbox @@ -692,11 +692,11 @@ (define (datashare:find name paths) (if (null? paths) #f (let loop ((hed (car paths)) (tal (cdr paths))) - (if (file-exists? (conc hed "/" name)) + (if (common:file-exists? (conc hed "/" name)) hed (if (null? tal) #f (loop (car tal)(cdr tal))))))) @@ -706,11 +706,11 @@ (define (datashare:load-config exe-dir exe-name) (let* ((fname (conc exe-dir "/." exe-name ".config"))) (ini:property-separator-patt " * *") (ini:property-separator #\space) - (if (file-exists? fname) + (if (common:file-exists? fname) ;; (ini:read-ini fname) (read-config fname #f #t) (make-hash-table)))) (define (datashare:process-action configdat action . args) @@ -785,11 +785,11 @@ versions) (sqlite3:finalize! db))))) ;; ease debugging by loading ~/.dashboardrc - REMOVE FROM PRODUCTION! (let ((debugcontrolf (conc (get-environment-variable "HOME") "/.datasharerc"))) - (if (file-exists? debugcontrolf) + (if (common:file-exists? debugcontrolf) (load debugcontrolf))) (define (main) (let* ((args (argv)) (prog (car args)) Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -204,20 +204,20 @@ (define (db:lock-create-open fname initproc) (let* ((parent-dir (or (pathname-directory fname)(current-directory))) ;; no parent? go local (raw-fname (pathname-file fname)) (dir-writable (file-write-access? parent-dir)) - (file-exists (file-exists? fname)) + (file-exists (common:file-exists? fname)) (file-write (if file-exists (file-write-access? fname) dir-writable ))) ;;(mutex-lock! *db-open-mutex*) ;; tried this mutex, not clear it helped. (if file-write ;; dir-writable (condition-case (let* ((lockfname (conc fname ".lock")) (readyfname (conc parent-dir "/.ready-" raw-fname)) - (readyexists (file-exists? readyfname))) + (readyexists (common:file-exists? readyfname))) (if (not readyexists) (common:simple-file-lock-and-wait lockfname)) (let ((db (sqlite3:open-database fname))) (sqlite3:set-busy-handler! db (make-busy-timeout 136000)) (sqlite3:execute db "PRAGMA synchronous = 0;") @@ -264,11 +264,11 @@ ;; ;; This routine creates the db. It is only called if the db is not already opened ;; ;; ;; (define (db:open-rundb dbstruct run-id #!key (attemptnum 0)(do-not-open #f)) ;; (conc *toppath* "/megatest.db") (car *configinfo*))) ;; (let* ((dbfile (db:dbfile-path run-id)) ;; (conc toppath "/db/" run-id ".db")) -;; (dbexists (file-exists? dbfile)) +;; (dbexists (common:file-exists? dbfile)) ;; (db (db:lock-create-open dbfile (lambda (db) ;; (handle-exceptions ;; exn ;; (begin ;; ;; (release-dot-lock dbpath) @@ -304,14 +304,14 @@ (define (db:open-db dbstruct #!key (areapath #f)(do-sync #t)) ;; TODO: actually use areapath (let ((tmpdb-stack (dbr:dbstruct-dbstack dbstruct))) ;; RA => Returns the first reference in dbstruct (if (stack? tmpdb-stack) (db:get-db tmpdb-stack) ;; get previously opened db (will create new db handle if all in the stack are already used (let* ((dbpath (db:dbfile-path )) ;; path to tmp db area - (dbexists (file-exists? dbpath)) + (dbexists (common:file-exists? dbpath)) (tmpdbfname (conc dbpath "/megatest.db")) - (dbfexists (file-exists? tmpdbfname)) ;; (conc dbpath "/megatest.db"))) - (mtdbexists (file-exists? (conc *toppath* "/megatest.db"))) + (dbfexists (common:file-exists? tmpdbfname)) ;; (conc dbpath "/megatest.db"))) + (mtdbexists (common:file-exists? (conc *toppath* "/megatest.db"))) (mtdb (db:open-megatest-db)) (mtdbpath (db:dbdat-get-path mtdb)) (tmpdb (db:open-megatest-db path: dbpath)) ;; lock-create-open dbpath db:initialize-main-db)) (refndb (db:open-megatest-db path: dbpath name: "megatest_ref.db")) @@ -373,11 +373,11 @@ ;; NOTE: returns a dbdat not a dbstruct! ;; (define (db:open-megatest-db #!key (path #f)(name #f)) (let* ((dbdir (or path *toppath*)) (dbpath (conc dbdir "/" (or name "megatest.db"))) - (dbexists (file-exists? dbpath)) + (dbexists (common:file-exists? dbpath)) (db (db:lock-create-open dbpath (lambda (db) (db:initialize-main-db db) ;;(db:initialize-run-id-db db) ))) @@ -402,10 +402,25 @@ (mutex-lock! *db-multi-sync-mutex*) (set! *db-last-sync* start-t) (set! *db-last-access* start-t) (mutex-unlock! *db-multi-sync-mutex*) (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb))) + +(define (db:safely-close-sqlite3-db db #!key (try-num 3)) + (if (<= try-num 0) + #f + (handle-exceptions + exn + (begin + (thread-sleep! 3) + (sqlite3:interrupt! db) + (db:safely-close-sqlite3-db db try-num: (- try-num 1))) + (if (sqlite3:database? db) + (begin + (sqlite3:finalize! db) + #t) + #f)))) ;; close all opened run-id dbs (define (db:close-all dbstruct) (if (dbr:dbstruct? dbstruct) (handle-exceptions @@ -417,15 +432,16 @@ (let ((tdbs (map db:dbdat-get-db (stack->list (dbr:dbstruct-dbstack dbstruct)))) (mdb (db:dbdat-get-db (dbr:dbstruct-mtdb dbstruct))) (rdb (db:dbdat-get-db (dbr:dbstruct-refndb dbstruct)))) (map (lambda (db) - (if (sqlite3:database? db) - (sqlite3:finalize! db))) + (db:safely-close-sqlite3-db db)) +;; (if (sqlite3:database? db) +;; (sqlite3:finalize! db))) tdbs) - (if (sqlite3:database? mdb) (sqlite3:finalize! mdb)) - (if (sqlite3:database? rdb) (sqlite3:finalize! rdb)))))) + (db:safely-close-sqlite3-db mdb) ;; (if (sqlite3:database? mdb) (sqlite3:finalize! mdb)) + (db:safely-close-sqlite3-db rdb))))) ;; (if (sqlite3:database? rdb) (sqlite3:finalize! rdb)))))) ;; (let ((locdbs (dbr:dbstruct-locdbs dbstruct))) ;; (if (hash-table? locdbs) ;; (for-each (lambda (run-id) ;; (db:close-run-db dbstruct run-id)) @@ -529,11 +545,11 @@ (tmpname (conc fname "." (current-process-id))) (tmpjnl (conc fnamejnl "." (current-process-id)))) (debug:print-error 0 *default-log-port* "" fname " appears corrupted. Making backup \"old/" fname "\"") (system (conc "cd " dbdir ";mkdir -p old;cat " fname " > old/" tmpname)) (system (conc "rm -f " dbpath)) - (if (file-exists? fnamejnl) + (if (common:file-exists? fnamejnl) (begin (debug:print-error 0 *default-log-port* "" fnamejnl " found, moving it to old dir as " tmpjnl) (system (conc "cd " dbdir ";mkdir -p old;cat " fnamejnl " > old/" tmpjnl)) (system (conc "rm -f " dbdir "/" fnamejnl)))) ;; attempt to recreate database @@ -653,26 +669,42 @@ (numrecs (make-hash-table)) (start-time (current-milliseconds)) (tot-count 0)) (for-each ;; table (lambda (tabledat) - (let* ((tablename (car tabledat)) - (fields (cdr tabledat)) - (use-last-update (if last-update - (if (pair? last-update) - (member (car last-update) ;; last-update field name - (map car fields)) - (begin - (debug:print 0 *default-log-port* "ERROR: parameter last-update for db:sync-tables must be a pair, received: " last-update) ;; found in fields - #f)) - #f)) + (let* ((tablename (car tabledat)) + (fields (cdr tabledat)) + (has-last-update (member "last_update" fields)) + (use-last-update (cond + ((and has-last-update + (member "last_update" fields)) + #t) ;; if given a number, just use it for all fields + ((number? last-update) #f) ;; if not matched first entry then ignore last-update for this table + ((and (pair? last-update) + (member (car last-update) ;; last-update field name + (map car fields))) #t) + (last-update + (debug:print 0 *default-log-port* "ERROR: parameter last-update for db:sync-tables must be a pair or a number, received: " last-update) ;; found in fields + #f) + (else + #f))) + (last-update-value (if use-last-update ;; no need to check for has-last-update - it is already accounted for + (if (number? last-update) + last-update + (cdr last-update)) + #f)) + (last-update-field (if use-last-update + (if (number? last-update) + "last_update" + (car last-update)) + #f)) (num-fields (length fields)) (field->num (make-hash-table)) (num->field (apply vector (map car fields))) (full-sel (conc "SELECT " (string-intersperse (map car fields) ",") " FROM " tablename (if use-last-update ;; apply last-update criteria - (conc " " (car last-update) ">=" (cdr last-update)) + (conc " WHERE " last-update-field " >= " last-update-value) "") ";")) (full-ins (conc "INSERT OR REPLACE INTO " tablename " ( " (string-intersperse (map car fields) ",") " ) " " VALUES ( " (string-intersperse (make-list num-fields "?") ",") " );")) (fromdat '()) @@ -879,11 +911,11 @@ (define (db:cache-for-read-only source target #!key (use-last-update #f)) (if (and (hash-table-ref/default *global-db-store* target #f) (>= (file-modification-time target)(file-modification-time source))) (hash-table-ref *global-db-store* target) (let* ((toppath (launch:setup)) - (targ-db-last-mod (if (file-exists? target) + (targ-db-last-mod (if (common:file-exists? target) (file-modification-time target) 0)) (cache-db (or (hash-table-ref/default *global-db-store* target #f) (db:open-megatest-db path: target))) (source-db (db:open-megatest-db path: source)) @@ -893,41 +925,41 @@ (db:sync-tables (db:sync-main-list source-db) last-update source-db cache-db) (db:sync-tables db:sync-tests-only last-update source-db cache-db) (hash-table-set! *global-db-store* target cache-db) cache-db))) -;; call a proc with a cached db -;; -(define (db:call-with-cached-db proc . params) - ;; first cache the db in /tmp - (let* ((cname-part (conc "megatest_cache/" (common:get-testsuite-name))) - (fname (conc (common:get-area-path-signature) ".db")) - (cache-dir (common:get-create-writeable-dir - (list (conc "/tmp/" (current-user-name) "/" cname-part) - (conc "/tmp/" (current-user-name) "-" cname-part) - (conc "/tmp/" (current-user-name) "_" cname-part)))) - (megatest-db (conc *toppath* "/megatest.db"))) - ;; (debug:print-info 0 *default-log-port* "Using cache dir " cache-dir) - (if (not cache-dir) - (begin - (debug:print 0 *default-log-port* "ERROR: Failed to find an area to write the cache db") - (exit 1)) - (let* ((th1 (make-thread - (lambda () - (if (and (file-exists? megatest-db) - (file-write-access? megatest-db)) - (begin - (common:sync-to-megatest.db 'timestamps) ;; internally mutexes on *db-local-sync* - (debug:print-info 2 *default-log-port* "Done syncing to megatest.db")))) - "call-with-cached-db sync-to-megatest.db")) - (cache-db (db:cache-for-read-only - megatest-db - (conc cache-dir "/" fname) - use-last-update: #t))) - (thread-start! th1) - (apply proc cache-db params) - )))) +;; ;; call a proc with a cached db +;; ;; +;; (define (db:call-with-cached-db proc . params) +;; ;; first cache the db in /tmp +;; (let* ((cname-part (conc "megatest_cache/" (common:get-testsuite-name))) +;; (fname (conc (common:get-area-path-signature) ".db")) +;; (cache-dir (common:get-create-writeable-dir +;; (list (conc "/tmp/" (current-user-name) "/" cname-part) +;; (conc "/tmp/" (current-user-name) "-" cname-part) +;; (conc "/tmp/" (current-user-name) "_" cname-part)))) +;; (megatest-db (conc *toppath* "/megatest.db"))) +;; ;; (debug:print-info 0 *default-log-port* "Using cache dir " cache-dir) +;; (if (not cache-dir) +;; (begin +;; (debug:print 0 *default-log-port* "ERROR: Failed to find an area to write the cache db") +;; (exit 1)) +;; (let* ((th1 (make-thread +;; (lambda () +;; (if (and (common:file-exists? megatest-db) +;; (file-write-access? megatest-db)) +;; (begin +;; (db:sync-to-megatest.db dbstruct 'timestamps) ;; internally mutexes on *db-local-sync* +;; (debug:print-info 2 *default-log-port* "Done syncing to megatest.db")))) +;; "call-with-cached-db sync-to-megatest.db")) +;; (cache-db (db:cache-for-read-only +;; megatest-db +;; (conc cache-dir "/" fname) +;; use-last-update: #t))) +;; (thread-start! th1) +;; (apply proc cache-db params) +;; )))) ;; options: ;; ;; 'killservers - kills all servers ;; 'dejunk - removes junk records @@ -937,71 +969,100 @@ ;; 'closeall - close all opened dbs ;; 'schema - attempt to apply schema changes ;; run-ids: '(1 2 3 ...) or #f (for all) ;; (define (db:multi-db-sync dbstruct . options) - (if (not (launch:setup)) - (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.") - (let* ((mtdb (dbr:dbstruct-mtdb dbstruct)) - (tmpdb (db:get-db dbstruct)) - (refndb (dbr:dbstruct-refndb dbstruct)) - (allow-cleanup #t) ;; (if run-ids #f #t)) - (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat))) - (data-synced 0)) ;; count of changed records (I hope) - - (for-each - (lambda (option) - - (case option - ;; kill servers - ((killservers) - (for-each - (lambda (server) - (match-let (((mod-time host port start-time pid) server)) - (if (and host pid) - (tasks:kill-server host pid)))) - servers)) - - ;; clear out junk records - ;; - ((dejunk) - (db:delay-if-busy mtdb) ;; ok to delay on mtdb - (db:clean-up mtdb) - (db:clean-up tmpdb) - (db:clean-up refndb)) - - ;; sync runs, test_meta etc. - ;; - ((old2new) - (set! data-synced - (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb tmpdb refndb) - data-synced))) - - ;; now ensure all newdb data are synced to megatest.db - ;; do not use the run-ids list passed in to the function - ;; - ((new2old) - (set! data-synced - (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f tmpdb refndb mtdb) - data-synced))) - - ((adj-target) - (db:adj-target (db:dbdat-get-db mtdb)) - (db:adj-target (db:dbdat-get-db tmpdb)) - (db:adj-target (db:dbdat-get-db refndb))) - - ((schema) - (db:patch-schema-maindb (db:dbdat-get-db mtdb)) - (db:patch-schema-maindb (db:dbdat-get-db tmpdb)) - (db:patch-schema-maindb (db:dbdat-get-db refndb)) - (db:patch-schema-rundb (db:dbdat-get-db mtdb)) - (db:patch-schema-rundb (db:dbdat-get-db tmpdb)) - (db:patch-schema-rundb (db:dbdat-get-db refndb)))) - - (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb)) - options) - data-synced))) + ;; (if (not (launch:setup)) + ;; (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.") + (let* ((mtdb (dbr:dbstruct-mtdb dbstruct)) + (tmpdb (db:get-db dbstruct)) + (refndb (dbr:dbstruct-refndb dbstruct)) + (allow-cleanup #t) ;; (if run-ids #f #t)) + (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat))) + (data-synced 0)) ;; count of changed records (I hope) + + (for-each + (lambda (option) + + (case option + ;; kill servers + ((killservers) + (for-each + (lambda (server) + (match-let (((mod-time host port start-time pid) server)) + (if (and host pid) + (tasks:kill-server host pid)))) + servers)) + + ;; clear out junk records + ;; + ((dejunk) + (db:delay-if-busy mtdb) ;; ok to delay on mtdb + (db:clean-up mtdb) + (db:clean-up tmpdb) + (db:clean-up refndb)) + + ;; sync runs, test_meta etc. + ;; + ((old2new) + (set! data-synced + (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb tmpdb refndb) + data-synced))) + + ;; now ensure all newdb data are synced to megatest.db + ;; do not use the run-ids list passed in to the function + ;; + ((new2old) + (set! data-synced + (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f tmpdb refndb mtdb) + data-synced))) + + ((adj-target) + (db:adj-target (db:dbdat-get-db mtdb)) + (db:adj-target (db:dbdat-get-db tmpdb)) + (db:adj-target (db:dbdat-get-db refndb))) + + ((schema) + (db:patch-schema-maindb (db:dbdat-get-db mtdb)) + (db:patch-schema-maindb (db:dbdat-get-db tmpdb)) + (db:patch-schema-maindb (db:dbdat-get-db refndb)) + (db:patch-schema-rundb (db:dbdat-get-db mtdb)) + (db:patch-schema-rundb (db:dbdat-get-db tmpdb)) + (db:patch-schema-rundb (db:dbdat-get-db refndb)))) + + (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb)) + options) + data-synced)) + +(define (db:tmp->megatest.db-sync dbstruct last-update) + (let* ((mtdb (dbr:dbstruct-mtdb dbstruct)) + (tmpdb (db:get-db dbstruct)) + (refndb (dbr:dbstruct-refndb dbstruct))) + (db:sync-tables (db:sync-all-tables-list dbstruct) last-update tmpdb refndb mtdb))) + +;;;; run-ids +;; if #f use *db-local-sync* : or 'local-sync-flags +;; if #t use timestamps : or 'timestamps +(define (db:sync-to-megatest.db dbstruct #!key (no-sync-db #f)) + (let* ((start-time (current-seconds)) + (last-update (if no-sync-db + (db:no-sync-get/default no-sync-db "LAST_UPDATE" 0) + 0)) ;; (or (db:get-var dbstruct "LAST_UPDATE") 0)) + (sync-needed (> (- start-time last-update) 6)) + (res (if sync-needed ;; don't sync if a sync already occurred in the past 6 seconds + (begin + (if no-sync-db + (db:no-sync-set no-sync-db "LAST_UPDATE" start-time)) + (db:tmp->megatest.db-sync dbstruct last-update)) + 0)) + (sync-time (- (current-seconds) start-time))) + (debug:print-info 3 *default-log-port* "Sync of newdb to olddb completed in " sync-time " seconds pid="(current-process-id)) + (if (common:low-noise-print 30 "sync new to old") + (if sync-needed + (debug:print-info 0 *default-log-port* "Sync of " res " records from newdb to olddb completed in " sync-time " seconds pid="(current-process-id)) + (debug:print-info 0 *default-log-port* "No sync needed, last updated " (- start-time last-update) " seconds ago"))) + res)) ;; keeping it around for debugging purposes only (define (open-run-close-no-exception-handling proc idb . params) (debug:print-info 11 *default-log-port* "open-run-close-no-exception-handling START given a db=" (if idb "yes " "no ") ", params=" params) (print "I don't work anymore. open-run-close-no-exception-handling needs fixing or removing...") @@ -1394,11 +1455,11 @@ ;; L O G G I N G D B ;;====================================================================== (define (open-logging-db) (let* ((dbpath (conc (if *toppath* (conc *toppath* "/") "") "logging.db")) ;; fname) - (dbexists (file-exists? dbpath)) + (dbexists (common:file-exists? dbpath)) (db (sqlite3:open-database dbpath)) (handler (make-busy-timeout (if (args:get-arg "-override-timeout") (string->number (args:get-arg "-override-timeout")) 136000)))) ;; 136000))) (sqlite3:set-busy-handler! db handler) @@ -1550,11 +1611,11 @@ ;; ;; (db:delay-if-busy dbdat) (let* (;; (min-incompleted (filter (lambda (x) ;; (let* ((testpath (cadr x)) ;; (tdatpath (conc testpath "/testdat.db")) - ;; (dbexists (file-exists? tdatpath))) + ;; (dbexists (common:file-exists? tdatpath))) ;; (or (not dbexists) ;; if no file then something wrong - mark as incomplete ;; (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim ;; incompleted)) (min-incompleted-ids (map car incompleted)) ;; do 'em all (all-ids (append min-incompleted-ids (map car oldlaunched)))) @@ -1768,10 +1829,60 @@ (define (db:del-var dbstruct var) (db:with-db dbstruct #f #t (lambda (db) (sqlite3:execute db "DELETE FROM metadat WHERE var=?;" var)))) + +;;====================================================================== +;; no-sync.db - small bits of data to be shared between servers +;;====================================================================== + +(define (db:open-no-sync-db) + (let* ((dbpath (db:dbfile-path)) + (dbname (conc dbpath "/no-sync.db")) + (db (sqlite3:open-database dbname))) + (sqlite3:set-busy-handler! db (make-busy-timeout 136000)) + (sqlite3:execute db "PRAGMA synchronous = 0;") + (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));") + db)) + +;; if we are not a server create a db handle. this is not finalized +;; so watch for problems. I'm still not clear if it is needed to manually +;; finalize sqlite3 dbs with the sqlite3 egg. +;; +(define (db:no-sync-db db-in) + (if db-in + db-in + (let ((db (db:open-no-sync-db))) + (set! *no-sync-db* db) + db))) + +(define (db:no-sync-set db var val) + (sqlite3:execute (db:no-sync-db db) "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES (?,?);" var val)) + +(define (db:no-sync-del! db var) + (sqlite3:execute (db:no-sync-db db) "DELETE FROM no_sync_metadat WHERE var=?;" var)) + +(define (db:no-sync-get/default db var default) + (let ((res default)) + (sqlite3:for-each-row + (lambda (val) + (set! res val)) + (db:no-sync-db db) + "SELECT val FROM no_sync_metadat WHERE var=?;" + var) + (if res + (let ((newres (if (string? res) + (string->number res) + #f))) + (if newres + newres + res)) + res))) + +(define (db:no-sync-close-db db) + (db:safely-close-sqlite3-db db)) ;; use a global for some primitive caching, it is just silly to ;; re-read the db over and over again for the keys since they never ;; change @@ -1794,14 +1905,19 @@ ;; look up values in a header/data structure (define (db:get-value-by-header row header field) (if (or (null? header) (not row)) #f (let loop ((hed (car header)) - (tal (cdr header)) - (n 0)) - (if (equal? hed field) - (vector-ref row n) + (tal (cdr header)) + (n 0)) + (if (equal? hed field) + (handle-exceptions + exn + (begin + (debug:print 0 *default-log-port* "WARNING: attempt to read non-existant field, row=" row " header=" header " field=" field) + #f) + (vector-ref row n)) (if (null? tal) #f (loop (car tal)(cdr tal)(+ n 1))))))) ;; Accessors for the header/data structure ;; get rows and header from (define (db:get-header vec)(vector-ref vec 0)) @@ -3023,12 +3139,12 @@ (configf:lookup dat entry-name "message") ;; 4 ;; Comment (configf:lookup dat entry-name "exit-status") ;; 5 ;; Status "logpro" ;; 6 ;; Type )))) (let* ((value (or (configf:lookup dat entry-name "measured") "n/a")) - (expected (or (configf:lookup dat entry-name "expected") "n/a")) - (tolerance (or (configf:lookup dat entry-name "tolerance") "n/a")) + (expected (or (configf:lookup dat entry-name "expected") 0.0)) + (tolerance (or (configf:lookup dat entry-name "tolerance") 0.0)) (comment (or (configf:lookup dat entry-name "comment") (configf:lookup dat entry-name "desc") "n/a")) (status (or (configf:lookup dat entry-name "status") "n/a")) (type (or (configf:lookup dat entry-name "expected") "n/a"))) (set! res (append @@ -3129,10 +3245,25 @@ (lambda (id test_id category variable value expected tol units comment status type) (set! res (cons (vector id test_id category variable value expected tol units comment status type) res))) db "SELECT id,test_id,category,variable,value,expected,tol,units,comment,status,type FROM test_data WHERE test_id=? AND category LIKE ? ORDER BY category,variable;" test-id categorypatt) (reverse res))))) + +;; This routine moved from tdb.scm, tdb:read-test-data +;; +(define (db:read-test-data* dbstruct run-id test-id categorypatt varpatt) + (let* ((res '())) + (db:with-db + dbstruct #f #f + (lambda (db) + (sqlite3:for-each-row + (lambda (id test_id category variable value expected tol units comment status type) + (set! res (cons (vector id test_id category variable value expected tol units comment status type) res))) + db + "SELECT id,test_id,category,variable,value,expected,tol,units,comment,status,type FROM test_data WHERE test_id=? AND category LIKE ? AND variable LIKE ? ORDER BY category,variable;" test-id categorypatt varpatt) + (reverse res))))) + ;;====================================================================== ;; Misc. test related queries ;;====================================================================== @@ -3259,11 +3390,13 @@ (test-name (if (number? test-name) (db:test-get-testname testdat) test-name)) (item-path (db:test-get-item-path testdat)) (tl-testdat (db:get-test-info dbstruct run-id test-name "")) - (tl-test-id (db:test-get-id tl-testdat))) + (tl-test-id (if tl-testdat + (db:test-get-id tl-testdat) + #f))) (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) (db:general-call dbstruct 'set-test-start-time (list test-id))) (mutex-lock! *db-transaction-mutex*) (db:with-db dbstruct #f #f @@ -3325,11 +3458,12 @@ "STARTED" (car all-curr-statuses)))) ;; (print "bad-not-supported: " bad-not-support " all-curr-states: " all-curr-states " all-curr-statuses: " all-curr-states) ;; " newstate: " newstate " newstatus: " newstatus) ;; NB// Pass the db so it is part of the transaction - (db:test-set-state-status db run-id tl-test-id newstate newstatus #f))))))) + (if tl-test-id + (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)))))))) (mutex-unlock! *db-transaction-mutex*) (if (and test-id state status (equal? status "AUTO")) (db:test-data-rollup dbstruct run-id test-id status)) tr-res))))) ;; BBnote: db:get-all-state-status-counts-for-test returns dbr:counts object aggregating state and status of items of a given test, *not including rollup state/status* @@ -3680,11 +3814,11 @@ exn (begin (debug:print-info 0 *default-log-port* "WARNING: failed to test for existance of " dbfj) (thread-sleep! 1) (db:delay-if-busy count (- count 1))) - (file-exists? dbfj)) + (common:file-exists? dbfj)) (case count ((6) (thread-sleep! 0.2) (db:delay-if-busy count: 5)) ((5) @@ -4045,12 +4179,12 @@ (testname (vector-ref vb (+ 2 numkeys))) (item-path (vector-ref vb (+ 3 numkeys))) (final-log (vector-ref vb (+ 7 numkeys))) (run-dir (vector-ref vb (+ 18 numkeys))) (log-fpath (conc run-dir "/" final-log))) ;; (string-intersperse keyvals "/") "/" testname "/" item-path "/" - (debug:print 4 *default-log-port* "log: " log-fpath " exists: " (file-exists? log-fpath)) - (vector-set! vb (+ 7 numkeys) (if (file-exists? log-fpath) + (debug:print 4 *default-log-port* "log: " log-fpath " exists: " (common:file-exists? log-fpath)) + (vector-set! vb (+ 7 numkeys) (if (common:file-exists? log-fpath) (let ((newpath (conc pathmod "/" (string-intersperse keyvals "/") "/" runname "/" testname "/" (if (string=? item-path "") "" (conc "/" item-path)) final-log))) Index: docs/manual/megatest_manual.html ================================================================== --- docs/manual/megatest_manual.html +++ docs/manual/megatest_manual.html @@ -1510,10 +1510,19 @@
In megatest.config
[setup]
 reruns 5
+

Replace the default blacklisted environment variables with user supplied +list.

+

Default list: USER HOME DISPLAY LS_COLORS XKEYSYMDB EDITOR MAKEFLAGS MAKEF MAKEOVERRIDES

+
Add a "bad" variable "PROMPT" to the variables that will be commented out

in the megatest.sh and megatest.csh files:

+
+
+
[setup]
+blacklistvars USER HOME DISPLAY LS_COLORS XKEYSYMDB EDITOR MAKEFLAGS PROMPT
+
Run time limit
@@ -1913,11 +1922,11 @@
$MT_MEGATEST -env2file .ezsteps/${stepname}

Triggers

-

In your testconfig triggers can be specified

+

In your testconfig or megatest.config triggers can be specified

[triggers]
 
 # Call script running.sh when test goes to state=RUNNING, status=PASS
@@ -1927,11 +1936,11 @@
 RUNNING/ running.sh
 
 # Call script onpass.sh any time status goes to PASS
 PASS/ onpass.sh
-

Scripts called will have; test-id test-rundir trigger, added to the commandline.

+

Scripts called will have; test-id test-rundir trigger test-name item-path state status event-time, added to the commandline.

HINT

To start an xterm (useful for debugging), use a command line like the following:

[triggers]
@@ -2151,10 +2160,10 @@
 

Index: docs/manual/reference.txt ================================================================== --- docs/manual/reference.txt +++ docs/manual/reference.txt @@ -146,10 +146,22 @@ ------------------ [setup] reruns 5 ------------------ +Replace the default blacklisted environment variables with user supplied +list. + +Default list: USER HOME DISPLAY LS_COLORS XKEYSYMDB EDITOR MAKEFLAGS MAKEF MAKEOVERRIDES + +.Add a "bad" variable "PROMPT" to the variables that will be commented out +in the megatest.sh and megatest.csh files: +----------------- +[setup] +blacklistvars USER HOME DISPLAY LS_COLORS XKEYSYMDB EDITOR MAKEFLAGS PROMPT +----------------- + Run time limit ++++++++++++++ ----------------- [setup] @@ -470,11 +482,11 @@ ---------------------------- Triggers ~~~~~~~~ -In your testconfig triggers can be specified +In your testconfig or megatest.config triggers can be specified ----------------- [triggers] # Call script running.sh when test goes to state=RUNNING, status=PASS @@ -485,11 +497,11 @@ # Call script onpass.sh any time status goes to PASS PASS/ onpass.sh ----------------- -Scripts called will have; test-id test-rundir trigger, added to the commandline. +Scripts called will have; test-id test-rundir trigger test-name item-path state status event-time, added to the commandline. HINT To start an xterm (useful for debugging), use a command line like the following: ADDED emergency-patch-1.scm Index: emergency-patch-1.scm ================================================================== --- /dev/null +++ emergency-patch-1.scm @@ -0,0 +1,203 @@ + + +;; These are called by the server on recipt of /api calls +;; - keep it simple, only return the actual result of the call, i.e. no meta info here +;; +;; - returns #( flag result ) +;; +(define (api:execute-requests dbstruct dat) + (handle-exceptions + exn + (let ((call-chain (get-call-chain))) + (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an exception from peer, dat=" dat) + (print-call-chain (current-error-port)) + (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) + (vector #f (vector exn call-chain dat))) ;; return some stuff for debug if an exception happens + (cond + ((not (vector? dat)) ;; it is an error to not receive a vector + (vector #f (vector #f "remote must be called with a vector"))) + ((> *api-process-request-count* 20) ;; 20) + (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") + (set! *server-overloaded* #t) + (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! + (else + (let* ((cmd-in (vector-ref dat 0)) + (cmd (if (symbol? cmd-in) + cmd-in + (string->symbol cmd-in))) + (params (vector-ref dat 1)) + (start-t (current-milliseconds)) + (readonly-mode (dbr:dbstruct-read-only dbstruct)) + (readonly-command (member cmd api:read-only-queries)) + (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) + (res + (if writecmd-in-readonly-mode + (conc "attempt to run write command "cmd" on a read-only database") + (case cmd + ;;=============================================== + ;; READ/WRITE QUERIES + ;;=============================================== + + ((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl + + ;; SERVERS + ((start-server) (apply server:kind-run params)) + ((kill-server) (set! *server-run* #f)) + + ;; TESTS + + ;;((test-set-state-status-by-id) (apply mt:test-set-state-status-by-id dbstruct params)) + ;;BB - commented out above because it was calling below, eventually, incorrectly (dbstruct passed to mt:test-set-state-status-by-id, which previosly did more, but now only passes thru to db:set-state-status-and-roll-up-items. + ((test-set-state-status-by-id) + + ;; (define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment) + (db:set-state-status-and-roll-up-items + dbstruct + (list-ref params 0) ; run-id + (list-ref params 1) ; test-name + #f ; item-path + (list-ref params 2) ; state + (list-ref params 3) ; status + (list-ref params 4) ; comment + )) + + ((delete-test-records) (apply db:delete-test-records dbstruct params)) + ((delete-old-deleted-test-records) (apply db:delete-old-deleted-test-records dbstruct params)) + ((test-set-state-status) (apply db:test-set-state-status dbstruct params)) + ((test-set-top-process-pid) (apply db:test-set-top-process-pid dbstruct params)) + ((set-state-status-and-roll-up-items) (apply db:set-state-status-and-roll-up-items dbstruct params)) + ((top-test-set-per-pf-counts) (apply db:top-test-set-per-pf-counts dbstruct params)) + ((test-set-archive-block-id) (apply db:test-set-archive-block-id dbstruct params)) + + ;; RUNS + ((register-run) (apply db:register-run dbstruct params)) + ((set-tests-state-status) (apply db:set-tests-state-status dbstruct params)) + ((delete-run) (apply db:delete-run dbstruct params)) + ((lock/unlock-run) (apply db:lock/unlock-run dbstruct params)) + ((update-run-event_time) (apply db:update-run-event_time dbstruct params)) + ((update-run-stats) (apply db:update-run-stats dbstruct params)) + ((set-var) (apply db:set-var dbstruct params)) + ((del-var) (apply db:del-var dbstruct params)) + + ;; STEPS + ((teststep-set-status!) (apply db:teststep-set-status! dbstruct params)) + + ;; TEST DATA + ((test-data-rollup) (apply db:test-data-rollup dbstruct params)) + ((csv->test-data) (apply db:csv->test-data dbstruct params)) + + ;; MISC + ((sync-inmem->db) (let ((run-id (car params))) + (db:sync-touched dbstruct run-id force-sync: #t))) + ((mark-incomplete) (apply db:find-and-mark-incomplete dbstruct params)) + + ;; TESTMETA + ((testmeta-add-record) (apply db:testmeta-add-record dbstruct params)) + ((testmeta-update-field) (apply db:testmeta-update-field dbstruct params)) + ((get-tests-tags) (db:get-tests-tags dbstruct)) + + ;; TASKS + ((tasks-add) (apply tasks:add dbstruct params)) + ((tasks-set-state-given-param-key) (apply tasks:set-state-given-param-key dbstruct params)) + ((tasks-get-last) (apply tasks:get-last dbstruct params)) + + ;; NO SYNC DB + ((no-sync-set) (apply db:no-sync-set *no-sync-db* params)) + ((no-sync-get/default) (apply db:no-sync-get/default *no-sync-db* params)) + ((no-sync-del!) (apply db:no-sync-del! *no-sync-db* params)) + + ;; ARCHIVES + ;; ((archive-get-allocations) + ((archive-register-disk) (apply db:archive-register-disk dbstruct params)) + ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params)) + ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey)) + + ;;====================================================================== + ;; READ ONLY QUERIES + ;;====================================================================== + + ;; KEYS + ((get-key-val-pairs) (apply db:get-key-val-pairs dbstruct params)) + ((get-keys) (db:get-keys dbstruct)) + ((get-key-vals) (apply db:get-key-vals dbstruct params)) + ((get-target) (apply db:get-target dbstruct params)) + ((get-targets) (db:get-targets dbstruct)) + + ;; ARCHIVES + ((test-get-archive-block-info) (apply db:test-get-archive-block-info dbstruct params)) + + ;; TESTS + ((test-toplevel-num-items) (apply db:test-toplevel-num-items dbstruct params)) + ((get-test-info-by-id) (apply db:get-test-info-by-id dbstruct params)) + ((test-get-rundir-from-test-id) (apply db:test-get-rundir-from-test-id dbstruct params)) + ((get-count-tests-running-for-testname) (apply db:get-count-tests-running-for-testname dbstruct params)) + ((get-count-tests-running) (apply db:get-count-tests-running dbstruct params)) + ((get-count-tests-running-in-jobgroup) (apply db:get-count-tests-running-in-jobgroup dbstruct params)) + ;; ((delete-test-step-records) (apply db:delete-test-step-records dbstruct params)) + ((get-previous-test-run-record) (apply db:get-previous-test-run-record dbstruct params)) + ((get-matching-previous-test-run-records)(apply db:get-matching-previous-test-run-records dbstruct params)) + ((test-get-logfile-info) (apply db:test-get-logfile-info dbstruct params)) + ((test-get-records-for-index-file) (apply db:test-get-records-for-index-file dbstruct params)) + ((get-testinfo-state-status) (apply db:get-testinfo-state-status dbstruct params)) + ((test-get-top-process-pid) (apply db:test-get-top-process-pid dbstruct params)) + ((test-get-paths-matching-keynames-target-new) (apply db:test-get-paths-matching-keynames-target-new dbstruct params)) + ((get-prereqs-not-met) (apply db:get-prereqs-not-met dbstruct params)) + ((get-count-tests-running-for-run-id) (apply db:get-count-tests-running-for-run-id dbstruct params)) + ((synchash-get) (apply synchash:server-get dbstruct params)) + ((get-raw-run-stats) (apply db:get-raw-run-stats dbstruct params)) + + ;; RUNS + ((get-run-info) (apply db:get-run-info dbstruct params)) + ((get-run-status) (apply db:get-run-status dbstruct params)) + ((set-run-status) (apply db:set-run-status dbstruct params)) + ((get-tests-for-run) (apply db:get-tests-for-run dbstruct params)) + ((get-test-id) (apply db:get-test-id dbstruct params)) + ((get-tests-for-run-mindata) (apply db:get-tests-for-run-mindata dbstruct params)) + ((get-runs) (apply db:get-runs dbstruct params)) + ((get-num-runs) (apply db:get-num-runs dbstruct params)) + ((get-all-run-ids) (db:get-all-run-ids dbstruct)) + ((get-prev-run-ids) (apply db:get-prev-run-ids dbstruct params)) + ((get-run-ids-matching-target) (apply db:get-run-ids-matching-target dbstruct params)) + ((get-runs-by-patt) (apply db:get-runs-by-patt dbstruct params)) + ((get-run-name-from-id) (apply db:get-run-name-from-id dbstruct params)) + ((get-main-run-stats) (apply db:get-main-run-stats dbstruct params)) + ((get-var) (apply db:get-var dbstruct params)) + ((get-run-stats) (apply db:get-run-stats dbstruct params)) + + ;; STEPS + ((get-steps-data) (apply db:get-steps-data dbstruct params)) + ((get-steps-for-test) (apply db:get-steps-for-test dbstruct params)) + + ;; TEST DATA + ((read-test-data) (apply db:read-test-data dbstruct params)) + ((read-test-data*) (apply db:read-test-data* dbstruct params)) + + ;; MISC + ((get-latest-host-load) (apply db:get-latest-host-load dbstruct params)) + ((have-incompletes?) (apply db:have-incompletes? dbstruct params)) + ((login) (apply db:login dbstruct params)) + ((general-call) (let ((stmtname (car params)) + (run-id (cadr params)) + (realparams (cddr params))) + (db:general-call dbstruct stmtname realparams))) + ((sdb-qry) (apply sdb:qry params)) + ((ping) (current-process-id)) + ((get-changed-record-ids) (apply db:get-changed-record-ids dbstruct params)) + + ;; TESTMETA + ((testmeta-get-record) (apply db:testmeta-get-record dbstruct params)) + + ;; TASKS + ((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params)) + (else + (debug:print 0 *default-log-port* "ERROR: bad api call " cmd) + (conc "ERROR: BAD api call " cmd)))))) + + ;; save all stats + (let ((delta-t (- (current-milliseconds) + start-t))) + (hash-table-set! *db-api-call-time* cmd + (cons delta-t (hash-table-ref/default *db-api-call-time* cmd '())))) + (if writecmd-in-readonly-mode + (vector #f res) + (vector #t res))))))) ADDED emergency-patch-2.scm Index: emergency-patch-2.scm ================================================================== --- /dev/null +++ emergency-patch-2.scm @@ -0,0 +1,311 @@ +(include "common_records.scm") +(include "key_records.scm") +(include "db_records.scm") +(include "run_records.scm") +(include "test_records.scm") + +(define (common:wait-for-cpuload maxload numcpus waitdelay #!key (count 1000) (msg #f)(remote-host #f)) + (let* ((loadavg (common:get-cpu-load remote-host)) + (first (car loadavg)) + (next (cadr loadavg)) + (adjload (* maxload numcpus)) + (loadjmp (- first next))) + (cond + ((and (> first adjload) + (> count 0)) + (debug:print-info 0 *default-log-port* "waiting " waitdelay " seconds due to load " first " exceeding max of " adjload " " (if msg msg "")) + (thread-sleep! waitdelay) + (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1))) + ((and (> loadjmp numcpus) + (> count 0)) + (debug:print-info 0 *default-log-port* "waiting " waitdelay " seconds due to load jump " loadjmp " > numcpus " numcpus (if msg msg "")) + (thread-sleep! waitdelay) + (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1)))))) + +(define (common:wait-for-homehost-load maxload msg) + (let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local. + #f + (common:get-homehost))) + (hh (if hh-dat (car hh-dat) #f)) + (numcpus (common:get-num-cpus hh))) + (common:wait-for-normalized-load maxload msg: msg remote-host: hh))) + +;; wait for normalized cpu load to drop below maxload +;; +(define (common:wait-for-normalized-load maxload #!key (msg #f)(remote-host #f)) + (let ((num-cpus (common:get-num-cpus remote-host))) + (common:wait-for-cpuload maxload num-cpus 15 msg: msg remote-host: remote-host))) + +;; hed tal reg reruns reglen regfull test-record runname test-name item-path jobgroup max-concurrent-jobs run-id waitons item-path testmode test-patts required-tests test-registry registry-mutex flags keyvals run-info newtal all-tests-registry itemmaps) +(define (runs:process-expanded-tests runsdat testdat) + ;; unroll the contents of runsdat and testdat (due to ongoing refactoring). + (let* ((hed (runs:testdat-hed testdat)) + (tal (runs:testdat-tal testdat)) + (reg (runs:testdat-reg testdat)) + (reruns (runs:testdat-reruns testdat)) + (test-name (runs:testdat-test-name testdat)) + (item-path (runs:testdat-item-path testdat)) + (jobgroup (runs:testdat-jobgroup testdat)) + (waitons (runs:testdat-waitons testdat)) + (item-path (runs:testdat-item-path testdat)) + (testmode (runs:testdat-testmode testdat)) + (newtal (runs:testdat-newtal testdat)) + (itemmaps (runs:testdat-itemmaps testdat)) + (test-record (runs:testdat-test-record testdat)) + (prereqs-not-met (runs:testdat-prereqs-not-met testdat)) + + (reglen (runs:dat-reglen runsdat)) + (regfull (runs:dat-regfull runsdat)) + (runname (runs:dat-runname runsdat)) + (max-concurrent-jobs (runs:dat-max-concurrent-jobs runsdat)) + (run-id (runs:dat-run-id runsdat)) + (test-patts (runs:dat-test-patts runsdat)) + (required-tests (runs:dat-required-tests runsdat)) + (test-registry (runs:dat-test-registry runsdat)) + (registry-mutex (runs:dat-registry-mutex runsdat)) + (flags (runs:dat-flags runsdat)) + (keyvals (runs:dat-keyvals runsdat)) + (run-info (runs:dat-run-info runsdat)) + (all-tests-registry (runs:dat-all-tests-registry runsdat)) + (run-limits-info (runs:dat-can-run-more-tests runsdat)) + ;; (runs:can-run-more-tests run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running + (have-resources (car run-limits-info)) + (num-running (list-ref run-limits-info 1)) + (num-running-in-jobgroup(list-ref run-limits-info 2)) + (max-concurrent-jobs (list-ref run-limits-info 3)) + (job-group-limit (list-ref run-limits-info 4)) + ;; (prereqs-not-met (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) + ;; (prereqs-not-met (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap)) + (fails (if (list? prereqs-not-met) + (runs:calc-fails prereqs-not-met) + (begin + (debug:print-error 0 *default-log-port* "prereqs-not-met is not a list! " prereqs-not-met) + '()))) + (non-completed (filter (lambda (x) ;; remove hed from not completed list, duh, of course it is not completed! + (not (equal? x hed))) + (runs:calc-not-completed prereqs-not-met))) + (loop-list (list hed tal reg reruns)) + ;; configure the load runner + (numcpus (common:get-num-cpus #f)) + (maxload (string->number (or (configf:lookup *configdat* "jobtools" "maxload") "3.0"))) ;; use a non-number string to disable + (maxhomehostload (string->number (or (configf:lookup *configdat* "jobtools" "maxhomehostload") "1.2"))) ;; use a non-number string to disable + (waitdelay (string->number (or (configf:lookup *configdat* "jobtools" "waitdelay") "60")))) + (debug:print-info 4 *default-log-port* "have-resources: " have-resources " prereqs-not-met: (" + (string-intersperse + (map (lambda (t) + (if (vector? t) + (conc (db:test-get-state t) "/" (db:test-get-status t)) + (conc " WARNING: t is not a vector=" t ))) + prereqs-not-met) + ", ") ") fails: " fails + "\nregistered? " (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f)) + + + + (if (and (not (null? prereqs-not-met)) + (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) + (debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", "))) + + ;; Don't know at this time if the test have been launched at some time in the past + ;; i.e. is this a re-launch? + (debug:print-info 4 *default-log-port* "run-limits-info = " run-limits-info) + + (cond + + ;; Check item path against item-patts, + ;; + ((not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests)) ;; This test/itempath is not to be run + ;; else the run is stuck, temporarily or permanently + ;; but should check if it is due to lack of resources vs. prerequisites + (debug:print-info 1 *default-log-port* "Skipping " (tests:testqueue-get-testname test-record) " " item-path " as it doesn't match " test-patts) + (if (or (not (null? tal))(not (null? reg))) + (list (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + reruns) + #f)) + + ;; Register tests + ;; + ((not (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f)) + (debug:print-info 4 *default-log-port* "Pre-registering test " test-name "/" item-path " to create placeholder" ) + ;; always do firm registration now in v1.60 and greater ;; (eq? *transport-type* 'fs) ;; no point in parallel registration if use fs + (let register-loop ((numtries 15)) + (rmt:register-test run-id test-name item-path) + (if (rmt:get-test-id run-id test-name item-path) + (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'done) + (if (> numtries 0) + (begin + (thread-sleep! 0.5) + (register-loop (- numtries 1))) + (debug:print-error 0 *default-log-port* "failed to register test " (db:test-make-full-name test-name item-path))))) + (if (not (eq? (hash-table-ref/default test-registry (db:test-make-full-name test-name "") #f) 'done)) + (begin + (rmt:register-test run-id test-name "") + (if (rmt:get-test-id run-id test-name "") + (hash-table-set! test-registry (db:test-make-full-name test-name "") 'done)))) + (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) + (if (and (null? tal)(null? reg)) + (list hed tal (append reg (list hed)) reruns) + (list (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + ;; NB// Here we are building reg as we register tests + ;; if regfull we must pop the front item off reg + (if regfull + (append (cdr reg) (list hed)) + (append reg (list hed))) + reruns))) + + ;; At this point hed test registration must be completed. + ;; + ((eq? (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f) + 'start) + (debug:print-info 0 *default-log-port* "Waiting on test registration(s): " + (string-intersperse + (filter (lambda (x) + (eq? (hash-table-ref/default test-registry x #f) 'start)) + (hash-table-keys test-registry)) + ", ")) + (thread-sleep! 0.051) + (list hed tal reg reruns)) + + ;; If no resources are available just kill time and loop again + ;; + ((not have-resources) ;; simply try again after waiting a second + (if (runs:lownoise "no resources" 60) + (debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ...")) + ;; Have gone back and forth on this but db starvation is an issue. + ;; wait one second before looking again to run jobs. + (thread-sleep! 1) + ;; could have done hed tal here but doing car/cdr of newtal to rotate tests + (list (car newtal)(cdr newtal) reg reruns)) + + ;; This is the final stage, everything is in place so launch the test + ;; + ((and have-resources + (or (null? prereqs-not-met) + (and (member 'toplevel testmode) ;; 'toplevel) + (null? non-completed) + (not (member 'exclusive testmode))))) + ;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path)) + ;; we are going to reset all the counters for test retries by setting a new hash table + ;; this means they will increment only when nothing can be run + (set! *max-tries-hash* (make-hash-table)) + ;; well, first lets see if cpu load throttling is enabled. If so wait around until the + ;; average cpu load is under the threshold before continuing + (if maxload ;; only gate if maxload is specified + (common:wait-for-cpuload maxload numcpus waitdelay)) + (if maxhomehostload + (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))) + + (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry) + (runs:incremental-print-results run-id) + (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) + (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) + ;; (thread-sleep! *global-delta*) + (if (or (not (null? tal))(not (null? reg))) + (list (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + reruns) + #f)) + + ;; must be we have unmet prerequisites + ;; + (else + (debug:print 4 *default-log-port* "FAILS: " fails) + ;; If one or more of the prereqs-not-met are FAIL then we can issue + ;; a message and drop hed from the items to be processed. + ;; (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) + (if (and (not (null? prereqs-not-met)) + (runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60)) + (debug:print-info 1 *default-log-port* "waiting on tests; " (string-intersperse + (runs:mixed-list-testname-and-testrec->list-of-strings + prereqs-not-met) ", "))) + (if (or (null? fails) + (member 'toplevel testmode)) + (begin + ;; couldn't run, take a breather + (if (runs:lownoise "Waiting for more work to do..." 60) + (debug:print-info 0 *default-log-port* "Waiting for more work to do...")) + (thread-sleep! 1) + (list (car newtal)(cdr newtal) reg reruns)) + ;; the waiton is FAIL so no point in trying to run hed ever again + (if (or (not (null? reg))(not (null? tal))) + (if (vector? hed) + (begin + (debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path + " from the launch list as it has prerequistes that are FAIL") + (let ((test-id (rmt:get-test-id run-id hed ""))) + (if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "PREQ_FAIL" "Failed to run due to failed prerequisites"))) + (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) + ;; (thread-sleep! *global-delta*) + ;; This next is for the items + (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "BLOCKED" #f) + (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'removed) + (list (runs:queue-next-hed tal reg reglen regfull) + (runs:queue-next-tal tal reg reglen regfull) + (runs:queue-next-reg tal reg reglen regfull) + reruns ;; WAS: (cons hed reruns) ;; but that makes no sense? + )) + (let ((nth-try (hash-table-ref/default test-registry hed 0))) + (cond + ((member "RUNNING" (map db:test-get-state prereqs-not-met)) + (if (runs:lownoise (conc "possible RUNNING prerequistes " hed) 60) + (debug:print 0 *default-log-port* "WARNING: test " hed " has possible RUNNING prerequisites, don't give up on it yet.")) + (thread-sleep! 4) + (list (runs:queue-next-hed newtal reg reglen regfull) + (runs:queue-next-tal newtal reg reglen regfull) + (runs:queue-next-reg newtal reg reglen regfull) + reruns)) + ((or (not nth-try) + (and (number? nth-try) + (< nth-try 10))) + (hash-table-set! test-registry hed (if (number? nth-try) + (+ nth-try 1) + 0)) + (if (runs:lownoise (conc "not removing test " hed) 60) + (debug:print 1 *default-log-port* "WARNING: not removing test " hed " from queue although it may not be runnable due to FAILED prerequisites")) + ;; may not have processed correctly. Could be a race condition in your test implementation? Dropping test " hed) ;; " as it has prerequistes that are FAIL. (NOTE: hed is not a vector)") + (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) + ;; (list hed tal reg reruns) + ;; (list (car newtal)(cdr newtal) reg reruns) + ;; (hash-table-set! test-registry hed 'removed) + (list (runs:queue-next-hed newtal reg reglen regfull) + (runs:queue-next-tal newtal reg reglen regfull) + (runs:queue-next-reg newtal reg reglen regfull) + reruns)) + ((symbol? nth-try) + (if (eq? nth-try 'removed) ;; removed is removed - drop it NOW + (if (null? tal) + #f ;; yes, really + (list (car tal)(cdr tal) reg reruns)) + (begin + (if (runs:lownoise (conc "FAILED prerequisites or other issue" hed) 60) + (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequisites or other issue. Internal state " nth-try " will be overridden and we'll retry.")) + (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "KEEP_TRYING" #f) + (hash-table-set! test-registry hed 0) + (list (runs:queue-next-hed newtal reg reglen regfull) + (runs:queue-next-tal newtal reg reglen regfull) + (runs:queue-next-reg newtal reg reglen regfull) + reruns)))) + (else + (if (runs:lownoise (conc "FAILED prerequitests and we tried" hed) 60) + (debug:print 0 *default-log-port* "WARNING: test " hed " has FAILED prerequitests and we've tried at least 10 times to run it. Giving up now.")) + ;; (debug:print 0 *default-log-port* " prereqs: " prereqs-not-met) + (hash-table-set! test-registry hed 'removed) + (mt:test-set-state-status-by-testname run-id test-name item-path "NOT_STARTED" "TEN_STRIKES" #f) + ;; I'm unclear on if this roll up is needed - it may be the root cause of the "all set to FAIL" bug. + (rmt:set-state-status-and-roll-up-items run-id test-name item-path #f "FAIL" #f) ;; treat as FAIL + (list (if (null? tal)(car newtal)(car tal)) + tal + reg + reruns))))) + ;; can't drop this - maybe running? Just keep trying + (let ((runable-tests (runs:runable-tests prereqs-not-met))) + (if (null? runable-tests) + #f ;; I think we are truly done here + (list (runs:queue-next-hed newtal reg reglen regfull) + (runs:queue-next-tal newtal reg reglen regfull) + (runs:queue-next-reg newtal reg reglen regfull) + reruns))))))))) Index: env.scm ================================================================== --- env.scm +++ env.scm @@ -12,11 +12,11 @@ (declare (unit env)) (use sql-de-lite) ;; srfi-1 posix regex regex-case srfi-69 srfi-18 call-with-environment-variables) (define (env:open-db fname) - (let* ((db-exists (file-exists? fname)) + (let* ((db-exists (common:file-exists? fname)) (db (open-database fname))) (if (not db-exists) (begin (exec (sql db "CREATE TABLE envvars ( id INTEGER PRIMARY KEY, Index: ezsteps.scm ================================================================== --- ezsteps.scm +++ ezsteps.scm @@ -36,19 +36,19 @@ (test-id (db:test-get-id testdat)) (run-id (db:test-get-run_id testdat)) (test-name (db:test-get-testname testdat)) (kill-job #f)) ;; for future use (on re-factoring with launch.scm code (let loop ((count 5)) - (if (file-exists? test-run-dir) + (if (common:file-exists? test-run-dir) (push-directory test-run-dir) (if (> count 0) (begin (debug:print 0 *default-log-port* "WARNING: ezsteps attempting to run but test run directory " test-run-dir " is not there. Waiting and trying again " count " more times") (sleep 3) (loop (- count 1)))))) (debug:print-info 0 *default-log-port* "Running in directory " test-run-dir) - (if (not (file-exists? ".ezsteps"))(create-directory ".ezsteps")) + (if (not (common:file-exists? ".ezsteps"))(create-directory ".ezsteps")) ;; if ezsteps was defined then we are sure to have at least one step but check anyway (if (not (> (length ezstepslst) 0)) (message-window "ERROR: You can only re-run steps defined via ezsteps") (begin (let loop ((ezstep (car ezstepslst)) @@ -74,11 +74,11 @@ (loop (car tal)(cdr tal) stepname #f)))) (debug:print 4 *default-log-port* "ezsteps:\n stepname: " stepname " stepinfo: " stepinfo " stepparts: " stepparts " stepparms: " stepparms " stepcmd: " stepcmd) - (if (file-exists? (conc stepname ".logpro"))(set! logpro-used #t)) + (if (common:file-exists? (conc stepname ".logpro"))(set! logpro-used #t)) ;; call the command using mt_ezstep (set! script (conc "mt_ezstep " stepname " " (if prevstep prevstep "-") " " stepcmd)) (debug:print 4 *default-log-port* "script: " script) Index: file-tail.scm ================================================================== --- file-tail.scm +++ file-tail.scm @@ -2,11 +2,11 @@ (use (prefix sqlite3 sqlite3:) posix typed-records) (define (open-tail-db ) (let* ((basedir (create-directory (conc "/tmp/" (current-user-name)))) (dbpath (conc basedir "/megatest_logs.db")) - (dbexists (file-exists? dbpath)) + (dbexists (common:file-exists? dbpath)) (db (sqlite3:open-database dbpath)) (handler (sqlite3:make-busy-timeout 136000))) (sqlite3:set-busy-handler! db handler) (sqlite3:execute db "PRAGMA synchronous = 0;") (if (not dbexists) Index: filedb.scm ================================================================== --- filedb.scm +++ filedb.scm @@ -16,11 +16,11 @@ (include "fdb_records.scm") ;; (include "settings.scm") (define (filedb:open-db dbpath) (let* ((fdb (make-filedb:fdb)) - (dbexists (file-exists? dbpath)) + (dbexists (common:file-exists? dbpath)) (db (sqlite3:open-database dbpath))) (filedb:fdb-set-db! fdb db) (filedb:fdb-set-dbpath! fdb dbpath) (filedb:fdb-set-pathcache! fdb (make-hash-table)) (filedb:fdb-set-idcache! fdb (make-hash-table)) Index: genexample.scm ================================================================== --- genexample.scm +++ genexample.scm @@ -56,11 +56,11 @@ (begin (print "The path " path " does not exist or is not a directory. Attempting to create it now") (create-directory path #t))) ;; First check that the directory is empty! - (if (and (file-exists? path) + (if (and (common:file-exists? path) (not (null? (glob (conc path "/*"))))) (begin (print "WARNING: directory " path " is not empty, are you sure you want to continue?") (display "Enter y/n: ") (if (equal? "y" (read-line)) @@ -210,22 +210,22 @@ (scripts '()) (items '()) (rel-path #f)) (cond - ((file-exists? "megatest.config") (set! rel-path "./")) - ((file-exists? "../megatest.config") (set! rel-path "../")) - ((file-exists? "../../megatest.config") (set! rel-path "../../")) - ((file-exists? "../../../megatest.config")(set! rel-path "../../../"))) ;; good enough dang it. + ((common:file-exists? "megatest.config") (set! rel-path "./")) + ((common:file-exists? "../megatest.config") (set! rel-path "../")) + ((common:file-exists? "../../megatest.config") (set! rel-path "../../")) + ((common:file-exists? "../../../megatest.config")(set! rel-path "../../../"))) ;; good enough dang it. ;; Don't gather data or continue if a) megatest.config can't be found or b) testconfig already exists (if (not rel-path) (begin (print "ERROR: I could not find megatest.config, please run -create-test in the top dir of your megatest area") (exit 1))) - (if (file-exists? (conc rel-path "tests/" testname "/testconfig")) + (if (common:file-exists? (conc rel-path "tests/" testname "/testconfig")) (begin (print "WARNING: You already have a testconfig in " rel-path "tests/" testname ", do you want to clobber your files?") (display "Enter y/n: ") (if (not (equal? "y" (read-line))) (begin Index: http-transport.scm ================================================================== --- http-transport.scm +++ http-transport.scm @@ -294,12 +294,17 @@ (server-dat (if runremote (remote-conndat runremote) #f))) ;; (hash-table-ref/default *runremote* run-id #f))) (if (vector? server-dat) (let ((api-dat (http-transport:server-dat-get-api-uri server-dat))) - (close-connection! api-dat) - #t) + (handle-exceptions + exn + (begin + (print-call-chain *default-log-port*) + (debug:print-error 0 *default-log-port* " closing connection failed with error: " ((condition-property-accessor 'exn 'message) exn))) + (close-connection! api-dat) + #t)) #f))) (define (make-http-transport:server-dat)(make-vector 6)) (define (http-transport:server-dat-get-iface vec) (vector-ref vec 0)) @@ -426,11 +431,11 @@ (if (common:low-noise-print 60 "dbstats") (begin (debug:print 0 *default-log-port* "Server stats:") (db:print-current-query-stats))) (let* ((hrs-since-start (/ (- (current-seconds) server-start-time) 3600)) - (adjusted-timeout (if (> hrs-since-start 1) + (adjusted-timeout (if (> hrs-since-start 1) ;; never used! (- server-timeout (inexact->exact (round (* hrs-since-start 60)))) ;; subtract 60 seconds per hour server-timeout))) (if (common:low-noise-print 120 "server timeout") (debug:print-info 0 *default-log-port* "Adjusted server timeout: " adjusted-timeout)) (cond @@ -442,11 +447,12 @@ (debug:print-info 0 *default-log-port* "Server continuing, seconds since last db access: " (- (current-seconds) last-access)) (let ((curr-time (current-seconds))) (handle-exceptions exn (debug:print 0 *default-log-port* "ERROR: Failed to change timestamp on log file " server-log-file ". Are you out of space on that disk?") - (change-file-times server-log-file curr-time curr-time)))) + (if (not *server-overloaded*) + (change-file-times server-log-file curr-time curr-time))))) (loop 0 server-state bad-sync-count (current-milliseconds))) (else (debug:print-info 0 *default-log-port* "Server timed out. seconds since last db access: " (- (current-seconds) last-access)) (http-transport:server-shutdown port))))))) @@ -485,17 +491,16 @@ ;; all routes though here end in exit ... ;; ;; start_server? ;; (define (http-transport:launch) - ;; (if (args:get-arg "-daemonize") - ;; (begin - ;; (daemon:ize) - ;; (if *alt-log-file* ;; we should re-connect to this port, I think daemon:ize disrupts it - ;; (begin - ;; (current-error-port *alt-log-file*) - ;; (current-output-port *alt-log-file*))))) + ;; lets not even bother to start if there are already three or more server files ready to go + (let* ((num-alive (server:get-num-alive (server:get-list *toppath*)))) + (if (> num-alive 3) + (begin + (debug:print 0 *default-log-port* "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up") + (exit)))) (let* ((th2 (make-thread (lambda () (debug:print-info 0 *default-log-port* "Server run thread started") (http-transport:run (if (args:get-arg "-server") (args:get-arg "-server") Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -58,11 +58,11 @@ ;; return (conc status ": " comment) from the final section so that ;; the comment can be set in the step record in launch.scm ;; (define (launch:load-logpro-dat run-id test-id stepname) (let ((cname (conc stepname ".dat"))) - (if (file-exists? cname) + (if (common:file-exists? cname) (let* ((dat (read-config cname #f #f)) (csvr (db:logpro-dat->csv dat stepname)) (csvt (let-values (((fmt-cell fmt-record fmt-csv) (make-format ","))) (fmt-csv (map list->csv-record csvr)))) (status (configf:lookup dat "final" "exit-status")) @@ -87,11 +87,11 @@ (script "") ; "#!/bin/bash\n") ;; yep, we depend on bin/bash FIXME!!!\ (logpro-file (conc stepname ".logpro")) (html-file (conc stepname ".html")) (dat-file (conc stepname ".dat")) (tconfig-logpro (configf:lookup testconfig "logpro" stepname)) - (logpro-used (file-exists? logpro-file))) + (logpro-used (common:file-exists? logpro-file))) (if (and tconfig-logpro (not logpro-used)) ;; no logpro file found but have a defn in the testconfig (begin (with-output-to-file logpro-file @@ -106,11 +106,11 @@ " stepparms: " stepparms " stepcmd: " stepcmd) ;; ;; first source the previous environment ;; (let ((prev-env (conc ".ezsteps/" prevstep (if (string-search (regexp "csh") ;; (get-environment-variable "SHELL")) ".csh" ".sh")))) - ;; (if (and prevstep (file-exists? prev-env)) + ;; (if (and prevstep (common:file-exists? prev-env)) ;; (set! script (conc script "source " prev-env)))) ;; call the command using mt_ezstep ;; (set! script (conc "mt_ezstep " stepname " " (if prevstep prevstep "x") " " stepcmd)) @@ -125,11 +125,11 @@ (with-output-to-file "Makefile.ezsteps" (lambda () (print stepname ".log :") (print "\t" cmd) - (if (file-exists? (conc stepname ".logpro")) + (if (common:file-exists? (conc stepname ".logpro")) (print "\tlogpro " stepname ".logpro " stepname ".html < " stepname ".log")) (print) (print stepname " : " stepname ".log") (print)) #:append) @@ -169,11 +169,11 @@ (logfna (if logpro-used (conc stepname ".html") "")) (comment #f)) (if logpro-used (let ((datfile (conc stepname ".dat"))) ;; load the .dat file into the test_data table if it exists - (if (file-exists? datfile) + (if (common:file-exists? datfile) (set! comment (launch:load-logpro-dat run-id test-id stepname))) (rmt:test-set-log! run-id test-id (conc stepname ".html")))) (rmt:teststep-set-status! run-id test-id stepname "end" exinfo comment logfna)) ;; set the test final status (let* ((process-exit-status (launch:einf-exit-code exit-info)) ;; (vector-ref exit-info 2)) @@ -293,11 +293,11 @@ ;; after all that, still no testconfig? Time to abort (if (not testconfig) (begin (debug:print-error 0 *default-log-port* "Failed to resolve megatest.config, runconfigs.config and testconfig issues. Giving up now") (exit 1))) - (if (not (file-exists? ".ezsteps"))(create-directory ".ezsteps")) + (if (not (common:file-exists? ".ezsteps"))(create-directory ".ezsteps")) ;; if ezsteps was defined then we are sure to have at least one step but check anyway (if (not (> (length ezstepslst) 0)) (debug:print-error 0 *default-log-port* "ezsteps defined but ezstepslst is zero length") (let loop ((ezstep (car ezstepslst)) (tal (cdr ezstepslst)) @@ -305,20 +305,21 @@ ;; check exit-info (vector-ref exit-info 1) (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1) (let ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig)) (stepname (car ezstep))) ;; if logpro-used read in the stepname.dat file - (if (and logpro-used (file-exists? (conc stepname ".dat"))) + (if (and logpro-used (common:file-exists? (conc stepname ".dat"))) (launch:load-logpro-dat run-id test-id stepname)) (if (steprun-good? logpro-used (launch:einf-exit-code exit-info)) (if (not (null? tal)) (loop (car tal) (cdr tal) stepname)) (debug:print 4 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping"))) (debug:print 4 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep))))))) (define (launch:monitor-job run-id test-id item-path fullrunscript ezsteps test-name tconfigreg exit-info m work-area runtlim misc-flags) - (let* ((start-seconds (current-seconds)) + (let* ((update-period (string->number (or (configf:lookup *configdat* "setup" "test-stats-update-period") "30"))) + (start-seconds (current-seconds)) (calc-minutes (lambda () (inexact->exact (round (- (current-seconds) @@ -327,30 +328,38 @@ ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) - (disk-free (get-df (current-directory)))) - (let ((new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) - (delta (abs (- load cpu-load)))) - (if (> delta 0.1) ;; don't bother updating with small changes - load - #f))) - (new-disk-free (let* ((df (get-df (current-directory))) - (delta (abs (- df disk-free)))) - (if (> delta 200) ;; ignore changes under 200 Meg - df - #f)))) + (disk-free (get-df (current-directory))) + (last-sync (current-seconds))) + (let* ((over-time (> (current-seconds) (+ last-sync update-period))) + (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) + (delta (abs (- load cpu-load)))) + (if (> delta 0.1) ;; don't bother updating with small changes + load + #f))) + (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds + (get-df (current-directory)) + disk-free)) + (delta (abs (- df disk-free)))) + (if (and (> df 0) + (> (/ delta df) 0.1)) ;; (> delta 200) ;; ignore changes under 200 Meg + df + #f))) + (do-sync (or new-cpu-load new-disk-free over-time))) + (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (set! kill-job? (or (test-get-kill-request run-id test-id) ;; run-id test-name itemdat)) (and runtlim (let* ((run-seconds (- (current-seconds) start-seconds)) (time-exceeded (> run-seconds runtlim))) (if time-exceeded (begin (debug:print-info 0 *default-log-port* "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" run-seconds " seconds, limit=" runtlim) #t) #f))))) - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) + (if do-sync + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) (if kill-job? (begin (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second @@ -394,11 +403,14 @@ (exit))) (if (hash-table-ref/default misc-flags 'keep-going #f) (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta - (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free))))))) + (loop (calc-minutes) + (or new-cpu-load cpu-load) + (or new-disk-free disk-free) + (if do-sync (current-seconds) last-sync))))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) @@ -437,11 +449,11 @@ (fullrunscript (if (not runscript) #f (if (substring-index "/" runscript) runscript ;; use unadultered if contains slashes (let ((fulln (conc testpath "/" runscript))) - (if (and (file-exists? fulln) + (if (and (common:file-exists? fulln) (file-execute-access? fulln)) fulln runscript))))) ;; assume it is on the path ) ;; (rollup-status 0) @@ -502,11 +514,11 @@ (debug:print 0 *default-log-port* "Failed to create directory " logdir " expect problems, message: " ((condition-property-accessor 'exn 'message) exn)) (create-directory logdir #t))))) ;; NFS might not have propagated the directory meta data to the run host - give it time if needed (let loop ((count 0)) - (if (or (file-exists? top-path) + (if (or (common:file-exists? top-path) (> count 10)) (change-directory top-path) (begin (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " top-path " not found") (thread-sleep! 10) @@ -537,11 +549,15 @@ ;; Do not run the test if it is REMOVING, RUNNING, KILLREQ or REMOTEHOSTSTART, ;; Mark the test as REMOTEHOSTSTART *IMMEDIATELY* ;; (let* ((test-info (rmt:get-test-info-by-id run-id test-id)) - (test-host (db:test-get-host test-info)) + (test-host (if test-info + (db:test-get-host test-info) + (begin + (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.") + (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "KILLREQ" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") @@ -594,11 +610,11 @@ (list "default" target))) ;;(bb-check-path msg: "launch:execute post block 1") ;; NFS might not have propagated the directory meta data to the run host - give it time if needed (let loop ((count 0)) - (if (or (file-exists? work-area) + (if (or (common:file-exists? work-area) (> count 10)) (change-directory work-area) (begin (debug:print 0 *default-log-port* "INFO: Not starting job yet - directory " work-area " not found") (thread-sleep! 10) @@ -653,11 +669,14 @@ ;;(bb-check-path msg: "launch:execute post block 41") (runs:set-megatest-env-vars run-id inkeys: keys inkeyvals: keyvals) ;;(bb-check-path msg: "launch:execute post block 42") (set-item-env-vars itemdat) ;;(bb-check-path msg: "launch:execute post block 43") - (save-environment-as-files "megatest") + (let ((blacklist (configf:lookup *configdat* "setup" "blacklistvars"))) + (if blacklist + (save-environment-as-files "megatest" ignorevars: (string-split blacklist)) + (save-environment-as-files "megatest"))) ;;(bb-check-path msg: "launch:execute post block 44") ;; open-run-close not needed for test-set-meta-info ;; (tests:set-full-meta-info #f test-id run-id 0 work-area) ;; (tests:set-full-meta-info test-id run-id 0 work-area) (tests:set-full-meta-info #f test-id run-id 0 work-area 10) @@ -665,11 +684,11 @@ ;; (thread-sleep! 0.3) ;; NFS slowness has caused grief here (if (args:get-arg "-xterm") (set! fullrunscript "xterm") (if (and fullrunscript - (file-exists? fullrunscript) + (common:file-exists? fullrunscript) (not (file-execute-access? fullrunscript))) (system (conc "chmod ug+x " fullrunscript)))) ;; We are about to actually kick off the test ;; so this is a good place to remove the records for @@ -741,10 +760,12 @@ work-area ":\n====\n exit code " (launch:einf-exit-code exit-info) "\n" "====\n") (if (not (launch:einf-exit-status exit-info)) (exit 4)))) ))) +;; DO NOT USE - caching of configs is handled in launch:setup now. +;; (define (launch:cache-config) ;; if we have a linktree and -runtests and -target and the directory exists dump the config ;; to megatest-(current-seconds).cfg and symlink it to megatest.cfg (if (and *configdat* (or (args:get-arg "-run") @@ -756,22 +777,22 @@ (args:get-arg ":runname") (getenv "MT_RUNNAME"))) (fulldir (conc linktree "/" target "/" runname))) - (if (and linktree (file-exists? linktree)) ;; can't proceed without linktree + (if (and linktree (common:file-exists? linktree)) ;; can't proceed without linktree (begin (debug:print-info 0 *default-log-port* "Have -run with target=" target ", runname=" runname ", fulldir=" fulldir ", testpatt=" (or (args:get-arg "-testpatt") "%")) - (if (not (file-exists? fulldir)) + (if (not (common:file-exists? fulldir)) (create-directory fulldir #t)) ;; need to protect with exception handler (if (and target runname - (file-exists? fulldir)) + (common:file-exists? fulldir)) (let ((tmpfile (conc fulldir "/.megatest.cfg." (current-seconds))) (targfile (conc fulldir "/.megatest.cfg-" megatest-version "-" megatest-fossil-hash)) (rconfig (conc fulldir "/.runconfig." megatest-version "-" megatest-fossil-hash))) - (if (file-exists? rconfig) ;; only cache megatest.config AFTER runconfigs has been cached + (if (common:file-exists? rconfig) ;; only cache megatest.config AFTER runconfigs has been cached (begin (debug:print-info 0 *default-log-port* "Caching megatest.config in " tmpfile) (if (not (common:in-running-test?)) (configf:write-alist *configdat* tmpfile)) (system (conc "ln -sf " tmpfile " " targfile)))) @@ -842,28 +863,41 @@ (toppath (or *toppath* areapath (getenv "MT_RUN_AREA_HOME"))) ;; preserve toppath (target (common:args-get-target)) (sections (if target (list "default" target) #f)) ;; for runconfigs (mtconfig (or (args:get-arg "-config") "megatest.config")) ;; allow overriding megatest.config (cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig)) - (mtcachef (car cachefiles)) ;; (and cachedir (conc cachedir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) - (rccachef (cdr cachefiles)) ;; (and cachedir (conc cachedir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) - ) ;; (cancreate (and cachedir (common:file-exists? cachedir)(file-write-access? cachedir) (not (common:in-running-test?))))) + ;; checking for null cachefiles should not be necessary, I was seeing error car of '(), might be a chicken bug or a red herring ... + (mtcachef (if (null? cachefiles) + #f + (car cachefiles))) ;; (and cachedir (conc cachedir "/" ".megatest.cfg-" megatest-version "-" megatest-fossil-hash))) + (rccachef (if (null? cachefiles) + #f + (cdr cachefiles)))) ;; (and cachedir (conc cachedir "/" ".runconfigs.cfg-" megatest-version "-" megatest-fossil-hash))) + ;; (cancreate (and cachedir (common:file-exists? cachedir)(file-write-access? cachedir) (not (common:in-running-test?))))) (set! *toppath* toppath) ;; This is needed when we are running as a test using CMDINFO as a datasource ;;(BB> "launch:setup-body -- cachefiles="cachefiles) (cond ;; if mtcachef exists just read it, however we need to assume toppath is available in $MT_RUN_AREA_HOME - ((and (not force-reread) mtcachef (common:file-exists? mtcachef) (get-environment-variable "MT_RUN_AREA_HOME") use-cache) + ((and (not force-reread) + mtcachef rccachef + use-cache + (get-environment-variable "MT_RUN_AREA_HOME") + (common:file-exists? mtcachef) + (common:file-exists? rccachef)) ;;(BB> "launch:setup-body -- cond branch 1 - use-cache") (set! *configdat* (configf:read-alist mtcachef)) ;;(BB> "launch:setup-body -- 1 set! *configdat*="*configdat*) (set! *runconfigdat* (configf:read-alist rccachef)) (set! *configinfo* (list *configdat* (get-environment-variable "MT_RUN_AREA_HOME"))) (set! *configstatus* 'fulldata) (set! *toppath* (get-environment-variable "MT_RUN_AREA_HOME")) *toppath*) + ;; there are no existing cached configs, do full reads of the configs and cache them ;; we have all the info needed to fully process runconfigs and megatest.config - ((and (not force-reread) mtcachef) ;; BB- why are we doing this without asking if caching is desired? + ((and ;; (not force-reread) ;; force-reread is irrelevant in the AND, could however OR it? + mtcachef + rccachef) ;; BB- why are we doing this without asking if caching is desired? ;;(BB> "launch:setup-body -- cond branch 2") (let* ((first-pass (find-and-read-config ;; NB// sets MT_RUN_AREA_HOME as side effect mtconfig environ-patt: "env-override" given-toppath: toppath @@ -912,19 +946,19 @@ sections: sections))) (cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig)) (mtcachef (car cachefiles)) (rccachef (cdr cachefiles))) (if rccachef (configf:write-alist runconfigdat rccachef)) - (set! *runconfigdat* runconfigdat) (if mtcachef (configf:write-alist *configdat* mtcachef)) + (set! *runconfigdat* runconfigdat) (if (and rccachef mtcachef) (set! *configstatus* 'fulldata)))) ;; no configs found? should not happen but let's try to recover gracefully, return an empty hash-table - (begin (set! *configdat* (make-hash-table)) - ;;(BB> "launch:setup-body -- 3 set! *configdat*="*configdat*) - ) + (set! *configdat* (make-hash-table)) ))) + ;; else read what you can and set the flag accordingly + ;; here we don't have either mtconfig or rccachef (else ;;(BB> "launch:setup-body -- cond branch 3 - else") (let* ((cfgdat (find-and-read-config (or (args:get-arg "-config") "megatest.config") environ-patt: "env-override" @@ -941,10 +975,12 @@ (set! *toppath* toppath) (set! *configstatus* 'partial)) (begin (debug:print-error 0 *default-log-port* "No " mtconfig " file found. Giving up.") (exit 2)))))) + ;; COND ends here. + ;; additional house keeping (let* ((linktree (common:get-linktree))) (if linktree (begin (if (not (common:file-exists? linktree)) @@ -960,11 +996,11 @@ exn (begin (debug:print-error 0 *default-log-port* "Something went wrong when trying to create link to linktree at " *toppath*) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))) (let ((tlink (conc *toppath* "/lt"))) - (if (not (file-exists? tlink)) + (if (not (common:file-exists? tlink)) (create-symbolic-link linktree tlink))))) (begin (debug:print-error 0 *default-log-port* "linktree not defined in [setup] section of megatest.config") ))) (if (and *toppath* @@ -972,20 +1008,19 @@ (begin (setenv "MT_RUN_AREA_HOME" *toppath*) (setenv "MT_TESTSUITENAME" (common:get-testsuite-name))) (begin (debug:print-error 0 *default-log-port* "failed to find the top path to your Megatest area.") - ;;(exit 1) (set! *toppath* #f) ;; force it to be false so we return #f - #f - )) + #f)) + ;; one more attempt to cache the configs for future reading (let* ((cachefiles (launch:get-cache-file-paths areapath toppath target mtconfig)) (mtcachef (car cachefiles)) (rccachef (cdr cachefiles))) - (if (and rccachef *runconfigdat*) (configf:write-alist *runconfigdat* rccachef)) - (if (and mtcachef *configdat*) (configf:write-alist *configdat* mtcachef)) + (if (and rccachef *runconfigdat* (not (common:file-exists? rccachef))) (configf:write-alist *runconfigdat* rccachef)) + (if (and mtcachef *configdat* (not (common:file-exists? mtcachef))) (configf:write-alist *configdat* mtcachef)) (if (and rccachef mtcachef *runconfigdat* *configdat*) (set! *configstatus* 'fulldata))) ;; if have -append-config then read and append here (let ((cfname (args:get-arg "-append-config"))) @@ -1099,11 +1134,11 @@ (begin (debug:print-error 0 *default-log-port* " Failed to remove symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit 1)) (delete-file lnkpath))) - (if (not (or (file-exists? lnkpath) + (if (not (or (common:file-exists? lnkpath) (symbolic-link? lnkpath))) (handle-exceptions exn (begin (debug:print-error 0 *default-log-port* " Failed to create symlink " lnkpath ((condition-property-accessor 'exn 'message) exn) ", exiting") @@ -1124,11 +1159,11 @@ (db:test-get-rundir testinfo) ;; ) ;; ) #f))) (hash-table-set! *toptest-paths* testname curr-test-path) ;; NB// Was this for the test or for the parent in an iterated test? (rmt:general-call 'test-set-rundir-shortdir run-id lnkpath - (if (file-exists? lnkpath) + (if (common:file-exists? lnkpath) ;; (resolve-pathname lnkpath) (common:nice-path lnkpath) lnkpath) testname "" run-id) ;; (rmt:general-call 'test-set-rundir run-id lnkpath testname "") ;; toptest-path) @@ -1163,11 +1198,11 @@ exn (begin (debug:print-error 0 *default-log-port* " Failed to re-create link " lnktarget ((condition-property-accessor 'exn 'message) exn) ", exiting") (exit)) (if (symbolic-link? lnktarget) (delete-file lnktarget)) - (if (not (file-exists? lnktarget)) (create-symbolic-link test-path lnktarget))))) + (if (not (common:file-exists? lnktarget)) (create-symbolic-link test-path lnktarget))))) (if (not (directory? test-path)) (create-directory test-path #t)) ;; this is a hack, I don't know why out of the blue this path does not exist sometimes (if (and test-src-path (directory? test-path)) @@ -1206,11 +1241,12 @@ (contour #f)) ;; NOT READY FOR THIS (args:get-arg "-contour"))) (let loop ((delta (- (current-seconds) *last-launch*)) (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 1))) (if (> launch-delay delta) (begin - (debug:print-info 0 *default-log-port* "Delaying launch of " test-name " for " (- launch-delay delta) " seconds") + (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay. + (debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds")) (thread-sleep! (- launch-delay delta)) (loop (- (current-seconds) *last-launch*) launch-delay)))) (change-directory *toppath*) (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", *maybe* - the longer they are set the longer each launch takes (must be non-overlapping with the vars) (append @@ -1329,11 +1365,11 @@ (list 'mt-bindir-path mt-bindir-path)))))))) ;; clean out step records from previous run if they exist ;; (rmt:delete-test-step-records run-id test-id) ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway - (if (file-exists? work-area) + (if (common:file-exists? work-area) (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir (cond ;; ((and launcher hosts) ;; must be using ssh hostname ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms)))) Index: lock-queue.scm ================================================================== --- lock-queue.scm +++ lock-queue.scm @@ -32,11 +32,11 @@ (let ((fname (lock-queue:db-dat-get-path dbdat))) (system (conc "rm -f " fname "*")))) (define (lock-queue:open-db fname #!key (count 10)) (let* ((actualfname (conc fname ".lockdb")) - (dbexists (file-exists? actualfname)) + (dbexists (common:file-exists? actualfname)) (db (sqlite3:open-database actualfname)) (handler (make-busy-timeout 136000))) (if dbexists (vector db actualfname) (begin @@ -163,12 +163,12 @@ ;; If we've tried ten times and failed there is a serious problem ;; try to remove the lock db and allow it to be recreated (handle-exceptions exn #f - (if (file-exists? journal)(delete-file journal)) - (if (file-exists? fname) (delete-file fname)) + (if (common:file-exists? journal)(delete-file journal)) + (if (common:file-exists? fname) (delete-file fname)) #f)))) (sqlite3:execute (lock-queue:db-dat-get-db dbdat) "DELETE FROM runlocks WHERE test_id=?;" test-id) (sqlite3:finalize! (lock-queue:db-dat-get-db dbdat))))) (define (lock-queue:steal-lock dbdat test-id #!key (count 10)) Index: megatest-version.scm ================================================================== --- megatest-version.scm +++ megatest-version.scm @@ -1,7 +1,7 @@ ;; Always use two or four digit decimal ;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00.. (declare (unit megatest-version)) -(define megatest-version 1.6501) +(define megatest-version 1.6502) Index: megatest.config ================================================================== --- megatest.config +++ megatest.config @@ -1,11 +1,11 @@ -# [fields] -# a text -# b text -# c text -# control over usercode location not implemented, for now must be .mtutil.scm +## commented out due to a bug in v1.6501 in mtutil +## [fields] +## a text +## b text +## c text usercode .mtutil.scm areafilter area-to-run targtrans generic-target-translator runtrans generic-runname-translator @@ -17,17 +17,29 @@ # someqa path=../megatestqa/someqa; targtrans=somefunc; areafilter=area-to-run fullrun path=tests/fullrun; # targtrans is name of scheme proc stored in .mtutil.scm, which lives in PWD where mtutil is run # the target translator can return: a/target OR (list/of targets/to apply/run) OR #f i.e. run nothing # ext-tests path=ext-tests; targtrans=prefix-contour; -ext-tests path=ext-tests +ext path=ext-tests [contours] # mode-patt/tag-expr -quick areas=ext-tests; selector=/QUICKPATT -# quick areas=fullrun,ext-tests; selector=QUICKPATT/quick +quick areas=ext; selector=/QUICKPATT +quick2 areafn=check-area; selector=/QUICKPATT +# quick areas=fullrun,ext-tests; selector=QUICKPATT/quick # full areas=fullrun,ext-tests; selector=MAXPATT/ # short areas=fullrun,ext-tests; selector=MAXPATT/ # all areas=fullrun,ext-tests # snazy selector=QUICKPATT/ [nopurpose] + +[access] +ext matt:admin mattw:owner + +[accesstypes] +admin run rerun resume remove set-ss +owner run rerun resume remove +jerk set-ss + +[setup] +maxload 1.2 Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -52,11 +52,11 @@ (include "db_records.scm") (include "run_records.scm") (include "megatest-fossil-hash.scm") (let ((debugcontrolf (conc (get-environment-variable "HOME") "/.megatestrc"))) - (if (file-exists? debugcontrolf) + (if (common:file-exists? debugcontrolf) (load debugcontrolf))) ;; Disabled help items ;; -rollup : (currently disabled) fill run (set by :runname) with latest test(s) ;; from prior runs with same keys @@ -97,10 +97,11 @@ -runname : required, name for this particular test run -state : Applies to runs, tests or steps depending on context -status : Applies to runs, tests or steps depending on context --modepatt key : load testpatt from in runconfigs instead of default TESTPATT if -testpatt and -tagexpr are not specified -tagexpr tag1,tag2%,.. : select tests with tags matching expression + Test helpers (for use inside tests) -step stepname -test-status : set the state and status of a test (use :state and :status) -setlog logfname : set the path/filename to the final log relative to the test @@ -138,10 +139,11 @@ -section sectionName -var varName : for config and runconfig lookup value for sectionName varName -since N : get list of runs changed since time N (Unix seconds) -fields fieldspec : fields to include in json dump; runs:id,runame+tests:testname+steps -sort fieldname : in -list-runs sort tests by this field + -testdata-csv [categorypatt/]varpatt : dump testdata for given category Misc -start-dir path : switch to this directory before running megatest -contour cname : add a level of hierarcy to the linktree and run paths -rebuild-db : bring the database schema up to date @@ -226,10 +228,11 @@ ":state" "-state" ":status" "-status" "-list-runs" + "-testdata-csv" "-testpatt" "--modepatt" "-tagexpr" "-itempatt" "-setlog" @@ -370,11 +373,11 @@ (debug:print-error 0 *default-log-port* "Unrecognised arguments: " (string-intersperse (if (list? remargs) remargs (argv)) " "))) ;; before doing anything else change to the start-dir if provided ;; (if (args:get-arg "-start-dir") - (if (file-exists? (args:get-arg "-start-dir")) + (if (common:file-exists? (args:get-arg "-start-dir")) (let ((fullpath (common:real-path (args:get-arg "-start-dir")))) (setenv "PWD" fullpath) (change-directory fullpath)) (begin (debug:print-error 0 *default-log-port* "non-existant start dir " (args:get-arg "-start-dir") " specified, exiting.") @@ -387,16 +390,25 @@ ;; The watchdog is to keep an eye on things like db sync etc. ;; ;; TODO: for multiple areas, we will have multiple watchdogs; and multiple threads to manage -(define *watchdog* (make-thread common:watchdog "Watchdog thread")) +(define *watchdog* (make-thread + (lambda () + (handle-exceptions + exn + (begin + (print-call-chain) + (print " message: " ((condition-property-accessor 'exn 'message) exn))) + (common:watchdog))) + "Watchdog thread")) ;;(if (not (args:get-arg "-server")) ;; (thread-start! *watchdog*)) ;; if starting a server; wait till we get to running state before kicking off watchdog (let* ((no-watchdog-args '("-list-runs" + "-testdata-csv" "-list-servers" "-server" "-list-disks" "-list-targets" "-show-runconfig" @@ -455,11 +467,11 @@ (let* ((htmlviewercmd (or (configf:lookup *configdat* "setup" "htmlviewercmd") (common:which '("firefox" "arora")))) (install-home (common:get-install-area)) (manual-html (conc install-home "/share/docs/megatest_manual.html"))) (if (and install-home - (file-exists? manual-html)) + (common:file-exists? manual-html)) (system (conc "(" htmlviewercmd " " manual-html " ) &")) (system (conc "(" htmlviewercmd " http://www.kiatoa.com/cgi-bin/fossils/megatest/doc/tip/docs/manual/megatest_manual.html ) &"))) (exit))) (if (args:get-arg "-version") @@ -550,11 +562,15 @@ ;; handle a clean-cache request as early as possible ;; (if (args:get-arg "-clean-cache") (let ((toppath (launch:setup))) (set! *didsomething* #t) ;; suppress the help output. - (runs:clean-cache (getenv "MT_TARGET")(args:get-arg "-runname") toppath))) + (runs:clean-cache (or (getenv "MT_TARGET") + (args:get-arg "-target") + (args:get-arg "-remtarg")) + (args:get-arg "-runname") + toppath))) (if (args:get-arg "-env2file") (begin (save-environment-as-files (args:get-arg "-env2file")) (set! *didsomething* #t))) @@ -706,11 +722,11 @@ (else (loop row (+ col 1) (append curr-row (list val)) result))))))))) (hash-table-keys results)))) ((sqlite3) (let* ((db-file (or out-file (pathname-file input-db))) - (db-exists (file-exists? db-file)) + (db-exists (common:file-exists? db-file)) (db (sqlite3:open-database db-file))) (if (not db-exists)(sqlite3:execute db "CREATE TABLE data (sheet,section,var,val);")) (configf:map-all-hier-alist data (lambda (sheetname sectionname varname val) @@ -854,11 +870,11 @@ (let* ((rundir (if (and (getenv "MT_LINKTREE")(getenv "MT_TARGET")(getenv "MT_RUNNAME")) (conc (getenv "MT_LINKTREE") "/" (getenv "MT_TARGET") "/" (getenv "MT_RUNNAME")) #f)) (cfgf (if rundir (conc rundir "/.runconfig." megatest-version "-" megatest-fossil-hash) #f))) (if (and cfgf - (file-exists? cfgf) + (common:file-exists? cfgf) (file-write-access? cfgf) (common:use-cache?)) (configf:read-alist cfgf) (let* ((keys (rmt:get-keys)) (target (common:args-get-target)) @@ -878,11 +894,12 @@ (begin (if (not (common:in-running-test?)) (configf:write-alist data cfgf)) ;; force re-read of megatest.config - this resolves circular references between megatest.config (launch:setup force-reread: #t) - (launch:cache-config))) ;; we can safely cache megatest.config since we have a valid runconfig + ;; (launch:cache-config) ;; there are two independent config cache locations, turning this one off for now. MRW. + )) ;; we can safely cache megatest.config since we have a valid runconfig data)))) (if (args:get-arg "-show-runconfig") (let ((tl (launch:setup))) (push-directory *toppath*) @@ -1039,10 +1056,114 @@ (if indx (if (>= indx (vector-length datavec)) #f ;; index too high, should raise an error I suppose (vector-ref datavec indx)) #f))) + + + + + +(when (args:get-arg "-testdata-csv") + (if (launch:setup) + (let* ((keys (rmt:get-keys)) ;; (db:get-keys dbstruct)) + (runpatt (or (args:get-arg "-runname") "%")) + (testpatt (common:args-get-testpatt #f)) + (datapatt (args:get-arg "-testdata-csv")) + (match-data (string-match "^([^/]+)/(.*)" (args:get-arg "-testdata-csv"))) + (categorypatt (if match-data (list-ref match-data 1) "%")) + (setvarpatt (if match-data + (list-ref match-data 2) + (args:get-arg "-testdata-csv"))) + (runsdat (rmt:get-runs-by-patt keys (or runpatt "%") + (common:args-get-target) #f #f '("id" "runname" "state" "status" "owner" "event_time" "comment") 0)) + (header (db:get-header runsdat)) + (access-mode (db:get-access-mode)) + (testpatt (common:args-get-testpatt #f)) + (fields-spec (if (args:get-arg "-fields") + (extract-fields-constraints (args:get-arg "-fields")) + (list (cons "runs" (append keys (list "id" "runname" "state" "status" "owner" "event_time" "comment" "fail_count" "pass_count"))) + (cons "tests" db:test-record-fields) ;; "id" "testname" "test_path") + (list "steps" "id" "stepname")))) + (tests-spec (let ((t (alist-ref "tests" fields-spec equal?))) + (if (and t (null? t)) ;; all fields + db:test-record-fields + t))) + (adj-tests-spec (delete-duplicates (if tests-spec (cons "id" tests-spec) db:test-record-fields))) + (test-field-index (make-hash-table)) + (runs (db:get-rows runsdat)) + ) + (if (and tests-spec (not (null? tests-spec))) ;; do some validation and processing of the test-spec + (let ((invalid-tests-spec (filter (lambda (x)(not (member x db:test-record-fields))) tests-spec))) + (if (null? invalid-tests-spec) + ;; generate the lookup map test-field-name => index-number + (let loop ((hed (car adj-tests-spec)) + (tal (cdr adj-tests-spec)) + (idx 0)) + (hash-table-set! test-field-index hed idx) + (if (not (null? tal))(loop (car tal)(cdr tal)(+ idx 1)))) + (begin + (debug:print-error 0 *default-log-port* "Invalid test fields specified: " (string-intersperse invalid-tests-spec ", ")) + (exit))))) + (let* ((table-header (string-split "target,run,test,itempath,category,var,value,comment" ",")) + (table-rows + (apply append (map + (lambda (run) + (let* ((target (string-intersperse (map (lambda (x) + (db:get-value-by-header run header x)) + keys) "/")) + (statuses (string-split (or (args:get-arg "-status") "") ",")) + (run-id (db:get-value-by-header run header "id")) + (runname (db:get-value-by-header run header "runname")) + (states (string-split (or (args:get-arg "-state") "") ",")) + (tests (if tests-spec + (db:dispatch-query access-mode rmt:get-tests-for-run db:get-tests-for-run run-id testpatt states statuses #f #f #f 'testname 'asc ;; (db:get-tests-for-run dbstruct run-id testpatt '() '() #f #f #f 'testname 'asc + ;; use qryvals if test-spec provided + (if tests-spec + (string-intersperse adj-tests-spec ",") + ;; db:test-record-fields + #f) + #f + 'normal) + '()))) + (apply append + (map + (lambda (test) + (let* ( + (test-id (if (member "id" tests-spec)(get-value-by-fieldname test test-field-index "id" ) #f)) ;; (db:test-get-id test)) + (testname (if (member "testname" tests-spec)(get-value-by-fieldname test test-field-index "testname" ) #f)) ;; (db:test-get-testname test)) + (itempath (if (member "item_path" tests-spec)(get-value-by-fieldname test test-field-index "item_path" ) #f)) ;; (db:test-get-item-path test)) + (fullname (conc testname + (if (equal? itempath "") + "" + (conc "/" itempath )))) + (testdat-raw (map vector->list (rmt:read-test-data* run-id test-id categorypatt setvarpatt))) + (testdat (filter + (lambda (x) + (not (equal? "logpro" + (list-ref x 10)))) + testdat-raw))) + (map + (lambda (item) + (receive (id test_id category + variable value expected + tol units comment status type) + (apply values item) + (list target runname testname itempath category variable value comment))) + testdat))) + tests)))) + runs)))) + (print (string-join table-header ",")) + (for-each (lambda(table-row) + (print (string-join (map ->string table-row) ","))) + + + table-rows)))) + (set! *didsomething* #t) + (set! *time-to-exit* #t)) + + ;; NOTE: list-runs and list-db-targets operate on local db!!! ;; ;; IDEA: megatest list -runname blah% ... ;; @@ -1577,11 +1698,11 @@ (let* ((keys (rmt:get-keys)) ;; db:test-get-paths must not be run remote (paths (tests:test-get-paths-matching keys target (args:get-arg "-test-files")))) (set! *didsomething* #t) (for-each (lambda (path) - (if (file-exists? path) + (if (common:file-exists? path) (print path))) paths))) ;; else do a general-run-call (general-run-call "-test-files" @@ -1867,11 +1988,13 @@ (if (not (launch:setup)) (begin (debug:print 0 *default-log-port* "Failed to setup, exiting") (exit 1))) ;; keep this one local - (open-run-close patch-db #f) + ;; (open-run-close patch-db #f) + (let ((dbstruct (db:setup #f areapath: *toppath*))) + (common:cleanup-db dbstruct full: #t)) (set! *didsomething* #t))) (if (args:get-arg "-cleanup-db") (begin (if (not (launch:setup)) Index: mt.scm ================================================================== --- mt.scm +++ mt.scm @@ -127,47 +127,90 @@ (cons testn res))))))))) ;;====================================================================== ;; T R I G G E R S ;;====================================================================== + +(define (mt:run-trigger cmd test-id test-rundir trigger logname test-name item-path event-time actual-state actual-status) + ;; Putting the commandline into ( )'s means no control over the shell. + ;; stdout and stderr will be caught in the NBFAKE or mt_launch.log files + ;; or equivalent. No need to do this. Just run it? + (let* ((fullcmd (conc "nbfake " + cmd " " + test-id " " + test-rundir " " + trigger " " + test-name " " + item-path " " ;; has / prepended to deal with toplevel tests + actual-state " " + actual-status " " + event-time + )) + (prev-nbfake-log (get-environment-variable "NBFAKE_LOG"))) + (setenv "NBFAKE_LOG" (conc (cond + ((and (directory-exists? test-rundir) + (file-write-access? test-rundir)) + test-rundir) + ((and (directory-exists? *toppath*) + (file-write-access? *toppath*)) + *toppath*) + (else (conc "/tmp/" (current-user-name)))) + "/" logname)) + (debug:print-info 0 *default-log-port* "TRIGGERED on " trigger ", running command " fullcmd " output at " (get-environment-variable "NBFAKE_LOG")) + ;; (call-with-environment-variables + ;; `(("NBFAKE_LOG" . ,(conc test-rundir "/" logname))) + ;; (lambda () + (process-run fullcmd) + (if prev-nbfake-log + (setenv "NBFAKE_LOG" prev-nbfake-log) + (unsetenv "NBFAKE_LOG")) + )) ;; )) (define (mt:process-triggers dbstruct run-id test-id newstate newstatus) - (let* ((test-dat (db:get-test-info-by-id dbstruct run-id test-id))) - (if test-dat - (let* ((test-rundir ;; (rmt:sdb-qry 'getstr ;; (filedb:get-path *fdb* - (db:test-get-rundir test-dat)) ;; ) ;; ) - (test-name (db:test-get-testname test-dat)) - (tconfig #f) - (state (if newstate newstate (db:test-get-state test-dat))) - (status (if newstatus newstatus (db:test-get-status test-dat)))) - (if (and test-name - test-rundir ;; #f means no dir set yet - (file-exists? test-rundir) - (directory? test-rundir)) - (call-with-environment-variables - (list (cons "MT_TEST_NAME" test-name) - (cons "MT_TEST_RUN_DIR" test-rundir) - (cons "MT_ITEMPATH" (db:test-get-item-path test-dat))) - (lambda () - (push-directory test-rundir) - (set! tconfig (mt:lazy-read-test-config test-name)) - (for-each (lambda (trigger) - (let ((cmd (configf:lookup tconfig "triggers" trigger)) - (logf (conc test-rundir "/last-trigger.log"))) - (if cmd - ;; Putting the commandline into ( )'s means no control over the shell. - ;; stdout and stderr will be caught in the NBFAKE or mt_launch.log files - ;; or equivalent. No need to do this. Just run it? - (let ((fullcmd (conc cmd " " test-id " " test-rundir " " trigger "&"))) - (debug:print-info 0 *default-log-port* "TRIGGERED on " trigger ", running command " fullcmd) - (process-run fullcmd))))) - (list - (conc state "/" status) - (conc state "/") - (conc "/" status))) - (pop-directory)) - )))))) + (if test-id + (let* ((test-dat (db:get-test-info-by-id dbstruct run-id test-id))) + (if test-dat + (let* ((test-rundir (db:test-get-rundir test-dat)) ;; ) ;; ) + (test-name (db:test-get-testname test-dat)) + (item-path (db:test-get-item-path test-dat)) + (duration (db:test-get-run_duration test-dat)) + (comment (db:test-get-comment test-dat)) + (event-time (db:test-get-event_time test-dat)) + (tconfig #f) + (state (if newstate newstate (db:test-get-state test-dat))) + (status (if newstatus newstatus (db:test-get-status test-dat)))) + ;; (mutex-lock! *triggers-mutex*) + (if (and test-name + test-rundir) ;; #f means no dir set yet + ;; (common:file-exists? test-rundir) + ;; (directory? test-rundir)) + (call-with-environment-variables + (list (cons "MT_TEST_NAME" (or test-name "no such test")) + (cons "MT_TEST_RUN_DIR" (or test-rundir "no test directory yet")) + (cons "MT_ITEMPATH" (or item-path ""))) + (lambda () + (if (directory-exists? test-rundir) + (push-directory test-rundir) + (push-directory *toppath*)) + (set! tconfig (mt:lazy-read-test-config test-name)) + (for-each (lambda (trigger) + (let* ((munged-trigger (string-translate trigger "/ " "--")) + (logname (conc "last-trigger-" munged-trigger ".log"))) + ;; first any triggers from the testconfig + (let ((cmd (configf:lookup tconfig "triggers" trigger))) + (if cmd (mt:run-trigger cmd test-id test-rundir trigger (conc "tconfig-" logname) test-name item-path event-time state status))) + ;; next any triggers from megatest.config + (let ((cmd (configf:lookup *configdat* "triggers" trigger))) + (if cmd (mt:run-trigger cmd test-id test-rundir trigger (conc "mtconfig-" logname) test-name item-path event-time state status))))) + (list + (conc state "/" status) + (conc state "/") + (conc "/" status))) + (pop-directory)) + )) + ;; (mutex-unlock! *triggers-mutex*) + ))))) ;;====================================================================== ;; S T A T E A N D S T A T U S F O R T E S T S ;;====================================================================== @@ -206,11 +249,11 @@ (let ((test-dirs (tests:get-tests-search-path *configdat*))) (let loop ((hed (car test-dirs)) (tal (cdr test-dirs))) ;; Setting MT_LINKTREE here is almost certainly unnecessary. (let ((tconfig-file (conc hed "/" test-name "/testconfig"))) - (if (and (file-exists? tconfig-file) + (if (and (common:file-exists? tconfig-file) (file-read-access? tconfig-file)) (let ((link-tree-path (common:get-linktree)) ;; (configf:lookup *configdat* "setup" "linktree")) (old-link-tree (get-environment-variable "MT_LINKTREE"))) (if link-tree-path (setenv "MT_LINKTREE" link-tree-path)) (let ((newtcfg (read-config tconfig-file #f #f))) ;; NOTE: Does NOT run [system ...] Index: mtut.scm ================================================================== --- mtut.scm +++ mtut.scm @@ -13,11 +13,12 @@ ;; fake out readline usage of toplevel-command (define (toplevel-command . a) #f) (use srfi-1 posix srfi-69 readline ;; regex regex-case srfi-69 apropos json http-client directory-utils rpc typed-records;; (srfi 18) extras) srfi-18 extras format pkts regex regex-case - (prefix dbi dbi:)) ;; zmq extras) + (prefix dbi dbi:) + (prefix nanomsg nmsg:)) (declare (uses common)) (declare (uses megatest-version)) (declare (uses margs)) (declare (uses configf)) @@ -48,15 +49,15 @@ (let* ((xlatr-key (or xlatr-key-in (conf-get/default mtconf aval-alist 'targtrans))) (proc (hash-table-ref/default *target-mappers* xlatr-key #f))) (if proc (begin - (print "Using target mapper: " area-xlatr) + (print "Using target mapper: " xlatr-key) (handle-exceptions exn (begin - (print "FAILED TO RUN TARGET MAPPER FOR " area ", called " area-xlatr) + (print "FAILED TO RUN TARGET MAPPER FOR " area ", called " xlatr-key) (print " function is: " (hash-table-ref/default *target-mappers* xlatr-key #f ) ) (print " message: " ((condition-property-accessor 'exn 'message) exn)) runkey) (proc runkey area contour))) (begin @@ -81,14 +82,14 @@ ;; ii. Check that we are in a legal megatest area? ;; iii. Have some form of authentication or record of the md5sum or similar of the file? ;; iv. Use compiled version in preference to .scm version. Thus there is a manual "blessing" ;; required to use .mtutil.scm. ;; -(if (file-exists? "megatest.config") - (if (file-exists? ".mtutil.so") +(if (common:file-exists? "megatest.config") + (if (common:file-exists? ".mtutil.so") (load ".mtutil.so") - (if (file-exists? ".mtutil.scm") + (if (common:file-exists? ".mtutil.scm") (load ".mtutil.scm")))) ;; Disabled help items ;; -rollup : (currently disabled) fill run (set by :runname) with latest test(s) ;; from prior runs with same keys @@ -157,17 +158,20 @@ Version " megatest-version ", built from " megatest-fossil-hash )) ;; args and pkt key specs ;; (define *arg-keys* + ;; used keys + ;; a - action '( ("-area" . G) ;; maps to group ("-contour" . c) ("-append-config" . d) ("-state" . e) ("-item-patt" . i) ("-sync-to" . k) + ("-new" . l) ;; l (see below) is new-ss ("-run-name" . n) ("-mode-patt" . o) ("-test-patt" . p) ;; idea, enhance margs ("-test-patt" "-testpatt") => yields one value in "-test-patt" ("-status" . s) ("-target" . t) @@ -198,11 +202,12 @@ ;; alist to map actions to old megatest commands (define *action-keys* '((run . "-run") (sync . "") (archive . "-archive") - (set-ss . "-set-state-status"))) + (set-ss . "-set-state-status") + (remove . "-remove-runs"))) ;; Card types: ;; ;; A action ;; U username (Unix) @@ -220,10 +225,11 @@ (U . user ) ;; username (Z . shar1sum ) ;; Extras (a . runkey ) ;; needed for matching up pkts with target derived from runkey + ;; (l . new-ss ) ;; new state/status )) ;; inlst is an alternative input ;; (define (lookup-param-by-key key #!key (inlst #f)) @@ -247,11 +253,12 @@ (or (alist-ref (string->symbol param) '((-tag-expr . "-tagexpr") (-mode-patt . "--modepatt") (-run-name . "-runname") (-test-patt . "-testpatt") - (-msg . "-m"))) + (-msg . "-m") + (-new . "-set-state-status"))) param)) (define (val->alist val) (let ((val-list (string-split-fields ";\\s*" val #:infix))) (if val-list @@ -278,11 +285,11 @@ (print "ERROR: failed to create directory " dest-dir " message: " ((condition-property-accessor 'exn 'message) exn)) (create-directory dest-dir #t)) (handle-exceptions exn (print "ERROR: failed to clone or sync 1ossil " url " message: " ((condition-property-accessor 'exn 'message) exn)) - (if (file-exists? targ-file) + (if (common:file-exists? targ-file) (system (conc "fossil pull --once " url " -R " targ-file)) (system (conc "fossil clone " url " " targ-file)) )))) (define (fossil:last-change-node-and-time fossils-dir fossil-name branch) @@ -372,10 +379,12 @@ (begin (print help) (exit 1))) ;;====================================================================== + + ;; Runs ;;====================================================================== ;; make a runname ;; @@ -386,25 +395,34 @@ ;; collect, translate, collate and assemble a pkt from the command-line ;; ;; sched => force the run start time to be recorded as sched Unix ;; epoch. This aligns times properly for triggers in some cases. ;; -(define (command-line->pkt action args-alist sched-in #!key (extra-dat '())) +;; extra-dat format is ( 'x xval 'y yval .... ) +;; +(define (command-line->pkt action args-alist sched-in #!key (extra-dat '())(area-path #f)(new-ss #f)) (let* ((sched (cond ((vector? sched-in)(local-time->seconds sched-in)) ;; we recieved a time ((number? sched-in) sched-in) (else (current-seconds)))) (args-data (if args-alist (if (hash-table? args-alist) ;; seriously? (hash-table->alist args-alist) args-alist) (hash-table->alist args:arg-hash))) ;; if no args-alist then we assume this is a call driven directly by commandline - (alldat (apply append (list 'T "cmd" - 'A action - 'U (current-user-name) - 'D sched) - extra-dat + (alldat (apply append + (list 'A action + 'U (current-user-name) + 'D sched) + (if area-path + (list 'S area-path) ;; the area-path is mapped to the start-dir + '()) + (if (list? extra-dat) + extra-dat + (begin + (debug:print 0 *default-log-port* "ERROR: command-line->pkt received bad extra-dat " extra-dat) + '())) (map (lambda (x) (let* ((param (car x)) (value (cdr x)) (pmeta (assoc param *arg-keys*)) ;; translate the card key to a megatest switch or parameter (smeta (assoc param *switch-keys*)) ;; first lookup the key in arg-keys or switch-keys @@ -527,17 +545,22 @@ (areas-procname (alist-ref 'areafn val-alist))) (if areas-procname ;; areas-procname take precedence areas-procname (string-split (or areas-string "") ",")))) -(define (area-allowed? area areas runkey contour) +;; area - the current area under consideration +;; areas - the list of allowed areas from the contour spec -OR- +;; if it is a string then it is the function to use to +;; lookup in *area-checkers* +;; +(define (area-allowed? area areas runkey contour mode-patt) (cond ((not areas) #t) ;; no spec ((string? areas) ;; - (let ((check-fn (hash-table-ref/default *area-checkers* areas #f))) + (let ((check-fn (hash-table-ref/default *area-checkers* (string->symbol areas) #f))) (if check-fn - (check-fn area runkey contour) + (check-fn area runkey contour mode-patt) #f))) ((list? areas)(member area areas)) (else #f))) ;; shouldn't get here ;; (use trace)(trace create-run-pkt) @@ -818,19 +841,19 @@ (mode-tag (and selector (string-split-fields "/" selector #:infix))) (mode-patt (and mode-tag (if (eq? (length mode-tag) 2)(cadr mode-tag) #f))) (tag-expr (and mode-tag (if (null? mode-tag) #f (car mode-tag))))) (print "contour: " contour " areas=" areas " cval=" cval) (for-each - (lambda (runkeydatset) + (lambda (runkeydatset) ;; (print "runkeydatset: ")(pp runkeydatset) (let ((runkey (car runkeydatset)) (runkeydats (cadr runkeydatset))) (for-each (lambda (runkeydat) (for-each (lambda (area) - (if (area-allowed? area areas runkey contour) ;; is this area to be handled (from areas=a,b,c OR using areafn=abcfn and *area-checks* ...) + (if (area-allowed? area areas runkey contour mode-patt) ;; is this area to be handled (from areas=a,b,c OR using areafn=abcfn and *area-checks* ...) (let* ((aval (or (configf:lookup mtconf "areas" area) "")) (aval-alist (val->alist aval)) (runname (alist-ref 'runname runkeydat)) (runtrans (alist-ref 'runtrans runkeydat)) @@ -864,11 +887,14 @@ ;; (print "res=" res) res)))) (hash-table-keys torun))))))) (define (pkt->cmdline pkta) - (let ((action (or (lookup-action-by-key (alist-ref 'A pkta)) "noaction"))) + (let* ((action (or (lookup-action-by-key (alist-ref 'A pkta)) "noaction")) + (action-param (case (string->symbol action) + ((-set-state-status) (conc (alist-ref 'l pkta) " ")) + (else "")))) (fold (lambda (a res) (let* ((key (car a)) ;; get the key name (val (cdr a)) (par (or (lookup-param-by-key key) ;; need to check also if it is a switch (lookup-param-by-key key inlst: *switch-keys*)))) @@ -879,11 +905,11 @@ res (begin (print "ERROR: Unknown key in packet \"" key "\" with value \"" val "\"") res))))) (conc "megatest " (if (not (member action '("sync"))) - (conc action " ") + (conc action " " action-param) "")) pkta))) ;; (use trace)(trace pkt->cmdline) @@ -906,11 +932,15 @@ #f (create-directory "logs") #t) #t) "logs" - "/tmp"))) + "/tmp")) + (cpuload (alist-ref 'adj-proc-load (common:get-normalized-cpu-load #f))) + (maxload (string->number (or (configf:lookup mtconf "setup" "maxload") + (configf:lookup mtconf "jobtools" "maxload") ;; respect value used by Megatest calls + "1.1")))) (common:with-queue-db mtconf (lambda (pktsdirs pktsdir pdb) (let* ((rgconfdat (find-and-read-config (conc toppath "/runconfigs.config"))) (rgconf (car rgconfdat)) @@ -923,51 +953,115 @@ (lambda (pktdat) (let* ((pkta (alist-ref 'apkt pktdat)) (action (alist-ref 'A pkta)) (cmdline (pkt->cmdline pkta)) (uuid (alist-ref 'Z pkta)) + (user (alist-ref 'U pkta)) + (area (alist-ref 'G pkta)) (logf (conc logdir "/" uuid "-run.log")) (fullcmd (conc "NBFAKE_LOG=" logf " nbfake " cmdline))) - (print "RUNNING: " fullcmd) - (system fullcmd) - (mark-processed pdb (list (alist-ref 'id pktdat))) - (let-values (((ack-uuid ack-pkt) - (add-z-card - (construct-sdat 'P uuid - 'T (case (string->symbol action) - ((run) "runstart") - ((sync) "syncstart") ;; example of translating run -> runstart - (else action)) - 'c (alist-ref 'o pkta) ;; THIS IS WRONG! SHOULD BE 'c - 't (alist-ref 't pkta))))) - (write-pkt pktsdir ack-uuid ack-pkt)))) + (if (check-access user mtconf action area) + (if (and (> cpuload maxload) + (member action '("run" "archive"))) ;; do not run archive or run if load is over the specified limit + (print "WARNING: cpuload too high, skipping processing of " uuid) + (begin + (print "RUNNING: " fullcmd) + (system fullcmd) + (mark-processed pdb (list (alist-ref 'id pktdat))) + (let-values (((ack-uuid ack-pkt) + (add-z-card + (construct-sdat 'P uuid + 'T (case (string->symbol action) + ((run) "runstart") + ((sync) "syncstart") ;; example of translating run -> runstart + (else action)) + 'c (alist-ref 'o pkta) ;; THIS IS WRONG! SHOULD BE 'c + 't (alist-ref 't pkta))))) + (write-pkt pktsdir ack-uuid ack-pkt)))) + (begin ;; access denied! Mark as such + (mark-processed pdb (list (alist-ref 'id pktdat))) + (let-values (((ack-uuid ack-pkt) + (add-z-card + (construct-sdat 'P uuid + 'T "access-denied" + 'c (alist-ref 'o pkta) ;; THIS IS WRONG! SHOULD BE 'c + 't (alist-ref 't pkta))))) + (write-pkt pktsdir ack-uuid ack-pkt)))))) pkts)))))) - + +(define (check-access user mtconf action area) + ;; NOTE: Need control over defaults. E.g. default might be no access + (let* ((access-ctrl (hash-table-exists? mtconf "access")) ;; if there is an access section the default is to REQUIRE enablement/access + (access-list (map (lambda (x) + (string-split x ":")) + (string-split (or (configf:lookup mtconf "access" area) ;; userid:rightstype userid2:rightstype2 ... + (if access-ctrl + "*:none" ;; nobody has access by default + "*:all"))))) + (access-types-dat (configf:get-section mtconf "accesstypes"))) + (debug:print 0 *default-log-port* "Checking access in " access-list " with access-ctrl " access-ctrl " for area " area) + (if access-ctrl + (let* ((user-access (or (assoc user access-list) + (assoc "*" access-list))) + (access-type (cadr user-access)) + (access-types (let ((res (alist-ref access-type access-types-dat equal?))) + (if res (car res) res))) + (allowed-actions (string-split (or access-types "")))) + (print "Got " allowed-actions " for user " user " where access-types=" access-types " access-type=" access-type) + (cond + ((and access-types (member action allowed-actions)) + ;; (print "Access granted for " user " for " action) + #t) + (else + ;; (print "Access denied for " user " for " action) + #f)))))) + (define (get-pkts-dir mtconf) (let ((pktsdirs (configf:lookup mtconf "setup" "pktsdirs")) (pktsdir (if pktsdirs (car (string-split pktsdirs " ")) #f))) pktsdir)) (let ((debugcontrolf (conc (get-environment-variable "HOME") "/.mtutilrc"))) - (if (file-exists? debugcontrolf) + (if (common:file-exists? debugcontrolf) (load debugcontrolf))) (if *action* (case (string->symbol *action*) ((run remove rerun set-ss archive kill list) (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir"))) (mtconf (car mtconfdat)) + (area (args:get-arg "-area")) ;; look up the area to dispatch to from [areas] section + (areasec (if area (configf:lookup mtconf "areas" area) #f)) + (areadat (if areasec (val->alist areasec) #f)) + (area-path (if areadat (alist-ref 'path areadat) #f)) (pktsdirs (configf:lookup mtconf "setup" "pktsdirs")) (pktsdir (if pktsdirs (car (string-split pktsdirs " ")) #f)) - (adjargs (hash-table-copy args:arg-hash))) + (adjargs (hash-table-copy args:arg-hash)) + (new-ss (args:get-arg "-new"))) + ;; check a few things + (cond + ((and area (not area-path)) + (print "ERROR: the specified area was not found in the [areas] table. Area name=" area) + (exit 1)) + ((not area) + (print "ERROR: no area specified. Use -area ") + (exit 1)) + (else + (let ((user (current-user-name))) + (if (check-access user mtconf *action* area);; check rights + (print "Access granted for " *action* " action by " user) + (begin + (print "Access denied for " *action* " action by " user) + (exit 1)))))) + ;; (for-each ;; (lambda (key) ;; (if (not (member key *legal-params*)) ;; (hash-table-delete! adjargs key))) ;; we need to delete any params intended for mtutil ;; (hash-table-keys adjargs)) (let-values (((uuid pkt) - (command-line->pkt *action* adjargs #f))) + (command-line->pkt *action* adjargs #f area-path: area-path new-ss: new-ss))) (write-pkt pktsdir uuid pkt)))) ((dispatch import rungen process) (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir"))) (mtconf (car mtconfdat)) (toppath (configf:lookup mtconf "scratchdat" "toppath"))) @@ -996,28 +1090,37 @@ (print "No section \"" (car remargs) "\" found"))) (print "ERROR: list requires section parameter; areas, setup or contours"))) ((gendot) (let* ((mtconfdat (simple-setup (args:get-arg "-start-dir"))) (mtconf (car mtconfdat))) - (with-queue-db + (common:with-queue-db mtconf (lambda (pktsdirs pktsdir conn) - (make-report "out.dot" conn '()))))) + ;; pktspec display-fields + (make-report "out.dot" conn + '((cmd . ((parent . P) + (user . M) + (target . t))) + (runstart . ((parent . P) + (target . t))) + (runtype . ((parent . P)))) ;; pktspec + '(P U t) ;; + ))))) ;; no ptypes listed (ptypes are strings of pkt types to read from db ((db) (if (null? remargs) (print "ERROR: missing sub command for db command") (let ((subcmd (car remargs))) (case (string->symbol subcmd) ((pgschema) (let* ((install-home (common:get-install-area)) (schema-file (conc install-home "/share/db/mt-pg.sql"))) - (if (file-exists? schema-file) + (if (common:file-exists? schema-file) (system (conc "/bin/cat " schema-file))))) ((sqlite3schema) (let* ((install-home (common:get-install-area)) (schema-file (conc install-home "/share/db/mt-sqlite3.sql"))) - (if (file-exists? schema-file) + (if (common:file-exists? schema-file) (system (conc "/bin/cat " schema-file))))) ((junk) (rmt:get-keys)))))))) ;; If HTTP_HOST is defined then we must be in the cgi environment Index: newdashboard.scm ================================================================== --- newdashboard.scm +++ newdashboard.scm @@ -84,11 +84,11 @@ (print help) (exit))) ;; ease debugging by loading ~/.dashboardrc (let ((debugcontrolf (conc (get-environment-variable "HOME") "/.dashboardrc"))) - (if (file-exists? debugcontrolf) + (if (common:file-exists? debugcontrolf) (load debugcontrolf))) (debug:setup) (define *tim* (iup:timer)) @@ -375,11 +375,11 @@ #f)) (define (test-panel window-id) (let* ((curr-row-num 0) (viewlog (lambda (x) - (if (file-exists? logfile) + (if (common:file-exists? logfile) ;(system (conc "firefox " logfile "&")) (iup:send-url logfile) (message-window (conc "File " logfile " not found"))))) (xterm (lambda (x) (if (directory-exists? rundir) Index: ods.scm ================================================================== --- ods.scm +++ ods.scm @@ -197,11 +197,11 @@ ;; '( (sheet1 (r1c1 r1c2 r1c3 ...) ;; (r2c1 r2c3 r2c3 ...) ) ;; (sheet2 ( ... ) ;; ( ... ) ) ) (define (ods:list->ods path fname data) - (if (not (file-exists? path)) + (if (not (common:file-exists? path)) (print "ERROR: path to create ods data must pre-exist") (begin (with-output-to-file (conc path "/content.xml") (lambda () (ods:construct-dir path) Index: portlogger.scm ================================================================== --- portlogger.scm +++ portlogger.scm @@ -19,11 +19,11 @@ ;; lsof -i (define (portlogger:open-db fname) (let* ((avail (tasks:wait-on-journal fname 5 remove: #t)) ;; wait up to about 10 seconds for the journal to go away - (exists (file-exists? fname)) + (exists (common:file-exists? fname)) (db (if avail (sqlite3:open-database fname) (begin (system (conc "rm -f " fname)) (sqlite3:open-database fname)))) @@ -57,11 +57,11 @@ (begin ;; (release-dot-lock fname) (debug:print-error 0 *default-log-port* "portlogger:open-run-close failed. " proc " " params) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 5 *default-log-port* "exn=" (condition->list exn)) - (if (file-exists? fname)(delete-file fname)) ;; brutally get rid of it + (if (common:file-exists? fname)(delete-file fname)) ;; brutally get rid of it (print-call-chain (current-error-port))) (let* (;; (lock (obtain-dot-lock fname 2 9 10)) (db (portlogger:open-db fname)) (res (apply proc db params))) (sqlite3:finalize! db) Index: process.scm ================================================================== --- process.scm +++ process.scm @@ -142,11 +142,11 @@ (define (process:alive? pid) (handle-exceptions exn ;; possibly pid is a process not a child, look in /proc to see if it is running still - (file-exists? (conc "/proc/" pid)) + (common:file-exists? (conc "/proc/" pid)) (let-values (((rpid exit-type exit-signal)(process-wait pid #t))) (and (number? rpid) (equal? rpid pid))))) (define (process:alive-on-host? host pid) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -46,13 +46,19 @@ ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected + ;;DOT digraph megatest_state_status { + ;;DOT ranksep=0; + ;;DOT // rankdir=LR; + ;;DOT node [shape="box"]; + ;;DOT "rmt:send-receive" -> MUTEXLOCK; + ;;DOT { edge [style=invis];"case 1" -> "case 2" -> "case 3" -> "case 4" -> "case 5" -> "case 6" -> "case 7" -> "case 8" -> "case 9" -> "case 10" -> "case 11"; } ;; do all the prep locked under the rmt-mutex (mutex-lock! *rmt-mutex*) - + ;; 1. check if server is started IFF cmd is a write OR if we are not on the homehost, store in runremote ;; 2. check the age of the connections. refresh the connection if it is older than timeout-20 seconds. ;; 3. do the query, if on homehost use local access ;; (let* ((start-time (current-seconds)) ;; snapshot time so all use cases get same value @@ -69,114 +75,160 @@ (remote-ro-mode-set! runremote ro-mode) (remote-ro-mode-checked-set! runremote #t) ro-mode) ro-mode))))) - ;; ensure we have a record for our connection for given area - (if (not runremote) ;; can remove this one. should never get here. - (begin - (set! *runremote* (make-remote)) - (set! runremote *runremote*))) ;; new runremote will come from this on next iteration - ;; ensure we have a homehost record - (if (not (pair? (remote-hh-dat runremote))) ;; not on homehost - (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little - (remote-hh-dat-set! runremote (common:get-homehost))) - - ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) + ;; DOT INIT_RUNREMOTE; // leaving off - doesn't really add to the clarity + ;; DOT MUTEXLOCK -> INIT_RUNREMOTE [label="no remote?"]; + ;; DOT INIT_RUNREMOTE -> MUTEXLOCK; + ;; ensure we have a record for our connection for given area + (if (not runremote) ;; can remove this one. should never get here. + (begin + (set! *runremote* (make-remote)) + (set! runremote *runremote*))) ;; new runremote will come from this on next iteration + + ;; DOT SET_HOMEHOST; // leaving off - doesn't really add to the clarity + ;; DOT MUTEXLOCK -> SET_HOMEHOST [label="no homehost?"]; + ;; DOT SET_HOMEHOST -> MUTEXLOCK; + ;; ensure we have a homehost record + (if (not (pair? (remote-hh-dat runremote))) ;; not on homehost + (thread-sleep! 0.1) ;; since we shouldn't get here, delay a little + (remote-hh-dat-set! runremote (common:get-homehost))) + + ;;(print "BB> readonly-mode is "readonly-mode" dbfile is "dbfile) (cond + ;;DOT EXIT; + ;;DOT MUTEXLOCK -> EXIT [label="> 15 attempts"]; {rank=same "case 1" "EXIT" } ;; give up if more than 15 attempts ((> attemptnum 15) (debug:print 0 *default-log-port* "ERROR: 15 tries to start/connect to server. Giving up.") (exit 1)) - ;; readonly mode, read request- handle it - case 20 + ;;DOT CASE2 [label="local\nreadonly\nquery"]; + ;;DOT MUTEXLOCK -> CASE2; {rank=same "case 2" CASE2} + ;;DOT CASE2 -> "rmt:open-qry-close-locally"; + ;; readonly mode, read request- handle it - case 2 ((and readonly-mode (member cmd api:read-only-queries)) (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 20") + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 2") (rmt:open-qry-close-locally cmd 0 params) ) + ;;DOT CASE3 [label="write in\nread-only mode"]; + ;;DOT MUTEXLOCK -> CASE3 [label="readonly\nmode?"]; {rank=same "case 3" CASE3} + ;;DOT CASE3 -> "#f"; ;; readonly mode, write request. Do nothing, return #f (readonly-mode (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 21") + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 3") (debug:print 0 *default-log-port* "WARNING: write transaction requested on a readonly area. cmd="cmd" params="params) - #f - ) - - ;; reset the connection if it has been unused too long - ((and runremote - (remote-conndat runremote) - (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 10)))) ;; Subtract or add the random value? Seems like it should be substract but Neither fixes the "WARNING: failure in with-input-from-request to #.\n message: Server closed connection before sending response" - (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) - (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") - (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. - (mutex-unlock! *rmt-mutex*) - (rmt:send-receive cmd rid params attemptnum: attemptnum)) + #f) + + ;; This block was for pre-emptively resetting the connection if there had been no communication for some time. + ;; I don't think it adds any value. If the server is not there, just fail and start a new connection. + ;; also, the expire-time calculation might not be correct. We want, time-since-last-server-access > (server:get-timeout) + ;; + ;; ;;DOT CASE4 [label="reset\nconnection"]; + ;; ;;DOT MUTEXLOCK -> CASE4 [label="have connection,\nlast_access > expire_time"]; {rank=same "case 4" CASE4} + ;; ;;DOT CASE4 -> "rmt:send-receive"; + ;; ;; reset the connection if it has been unused too long + ;; ((and runremote + ;; (remote-conndat runremote) + ;; (let ((expire-time (+ (- start-time (remote-server-timeout runremote))(random 10)))) ;; Subtract or add the random value? Seems like it should be substract but Neither fixes the "WARNING: failure in with-input-from-request to #.\n message: Server closed connection before sending response" + ;; (< (http-transport:server-dat-get-last-access (remote-conndat runremote)) expire-time))) + ;; (debug:print-info 0 *default-log-port* "Connection to " (remote-server-url runremote) " expired due to no accesses, forcing new connection.") + ;; (http-transport:close-connections area-dat: runremote) + ;; (remote-conndat-set! runremote #f) ;; invalidate the connection, thus forcing a new connection. + ;; (mutex-unlock! *rmt-mutex*) + ;; (rmt:send-receive cmd rid params attemptnum: attemptnum)) + + ;;DOT CASE5 [label="local\nread"]; + ;;DOT MUTEXLOCK -> CASE5 [label="server not required,\non homehost,\nread-only query"]; {rank=same "case 5" CASE5}; + ;;DOT CASE5 -> "rmt:open-qry-close-locally"; ;; on homehost and this is a read - ((and (not (remote-force-server runremote)) ;; honor forced use of server - (cdr (remote-hh-dat runremote)) ;; on homehost - (member cmd api:read-only-queries)) ;; this is a read + ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required + (cdr (remote-hh-dat runremote)) ;; on homehost + (member cmd api:read-only-queries)) ;; this is a read (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 3") + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") (rmt:open-qry-close-locally cmd 0 params)) + ;;DOT CASE6 [label="init\nremote"]; + ;;DOT MUTEXLOCK -> CASE6 [label="on homehost,\nwrite query,\nhave server,\ncan't reach it"]; {rank=same "case 6" CASE6}; + ;;DOT CASE6 -> "rmt:send-receive"; ;; on homehost and this is a write, we already have a server, but server has died ((and (cdr (remote-hh-dat runremote)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url runremote) ;; have a server (not (server:ping (remote-server-url runremote)))) ;; server has died. NOTE: this is not a cheap call! Need better approach. (set! *runremote* (make-remote)) + (remote-force-server-set! runremote (common:force-server?)) (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6") (rmt:send-receive cmd rid params attemptnum: attemptnum)) + ;;DOT CASE7 [label="homehost\nwrite"]; + ;;DOT MUTEXLOCK -> CASE7 [label="server not required,\non homehost,\na write,\nhave a server"]; {rank=same "case 7" CASE7}; + ;;DOT CASE7 -> "rmt:open-qry-close-locally"; ;; on homehost and this is a write, we already have a server - ((and (not (remote-force-server runremote)) ;; honor forced use of server + ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required (cdr (remote-hh-dat runremote)) ;; on homehost (not (member cmd api:read-only-queries)) ;; this is a write (remote-server-url runremote)) ;; have a server (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4") + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 4.1") (rmt:open-qry-close-locally cmd 0 params)) + ;;DOT CASE8 [label="force\nserver"]; + ;;DOT MUTEXLOCK -> CASE8 [label="server not required,\nhave homehost info,\nno connection yet,\nnot a read-only query"]; {rank=same "case 8" CASE8}; + ;;DOT CASE8 -> "rmt:open-qry-close-locally"; ;; on homehost, no server contact made and this is a write, passively start a server - ((and (not (remote-force-server runremote)) ;; honor forced use of server - (cdr (remote-hh-dat runremote)) ;; new - (not (remote-server-url runremote)) - (not (member cmd api:read-only-queries))) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5") + ((and (not (remote-force-server runremote)) ;; honor forced use of server, i.e. server NOT required + (cdr (remote-hh-dat runremote)) ;; have homehost + (not (remote-server-url runremote)) ;; no connection yet + (not (member cmd api:read-only-queries))) ;; not a read-only query + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8") (let ((server-url (server:check-if-running *toppath*))) ;; (server:read-dotserver->url *toppath*))) ;; (server:check-if-running *toppath*))) ;; Do NOT want to run server:check-if-running - very expensive to do for every write call (if server-url (remote-server-url-set! runremote server-url) ;; the string can be consumed by the client setup if needed (if (common:force-server?) (server:start-and-wait *toppath*) (server:kind-run *toppath*)))) (remote-force-server-set! runremote (common:force-server?)) (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 5.1") + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 8.1") (rmt:open-qry-close-locally cmd 0 params)) + ;;DOT CASE9 [label="force server\nnot on homehost"]; + ;;DOT MUTEXLOCK -> CASE9 [label="no connection\nand either require server\nor not on homehost"]; {rank=same "case 9" CASE9}; + ;;DOT CASE9 -> "start\nserver" -> "rmt:send-receive"; ((or (and (remote-force-server runremote) ;; we are forcing a server and don't yet have a connection to one (not (remote-conndat runremote))) (and (not (cdr (remote-hh-dat runremote))) ;; not on a homehost (not (remote-conndat runremote)))) ;; and no connection - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 6 hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9, hh-dat: " (remote-hh-dat runremote) " conndat: " (remote-conndat runremote)) (mutex-unlock! *rmt-mutex*) (if (not (server:check-if-running *toppath*)) ;; who knows, maybe one has started up? (server:start-and-wait *toppath*)) - (remote-force-server-set! runremote (common:force-server?)) (remote-conndat-set! runremote (rmt:get-connection-info *toppath*)) ;; calls client:setup which calls client:setup-http (rmt:send-receive cmd rid params attemptnum: attemptnum)) ;; TODO: add back-off timeout as + + ;;DOT CASE10 [label="on homehost"]; + ;;DOT MUTEXLOCK -> CASE10 [label="server not required,\non homehost"]; {rank=same "case 10" CASE10}; + ;;DOT CASE10 -> "rmt:open-qry-close-locally"; ;; all set up if get this far, dispatch the query ((and (not (remote-force-server runremote)) (cdr (remote-hh-dat runremote))) ;; we are on homehost (mutex-unlock! *rmt-mutex*) - (debug:print-info 12 *default-log-port* "rmt:send-receive, case 7") + (debug:print-info 12 *default-log-port* "rmt:send-receive, case 10") (rmt:open-qry-close-locally cmd (if rid rid 0) params)) + ;;DOT CASE11 [label="send_receive"]; + ;;DOT MUTEXLOCK -> CASE11 [label="else"]; {rank=same "case 11" CASE11}; + ;;DOT CASE11 -> "rmt:send-receive" [label="call failed"]; + ;;DOT CASE11 -> "RESULT" [label="call succeeded"]; ;; not on homehost, do server query (else (mutex-unlock! *rmt-mutex*) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9") (mutex-lock! *rmt-mutex*) @@ -200,24 +252,28 @@ (eq? (vector-length res) 2) (eq? (vector-ref res 1) 'overloaded)) ;; since we are looking at the data to carry the error we'll use a fairly obtuse combo to minimise the chances of some sort of collision. (let ((wait-delay (+ attemptnum (* attemptnum 10)))) (debug:print 0 *default-log-port* "WARNING: server is overloaded. Delaying " wait-delay " seconds and trying call again.") (mutex-lock! *rmt-mutex*) + (http-transport:close-connections area-dat: runremote) (set! *runremote* #f) ;; force starting over (mutex-unlock! *rmt-mutex*) (thread-sleep! wait-delay) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))) res) ;; All good, return res (begin (debug:print 0 *default-log-port* "WARNING: communication failed. Trying again, try num: " attemptnum) (remote-conndat-set! runremote #f) + (http-transport:close-connections area-dat: runremote) (remote-server-url-set! runremote #f) (debug:print-info 12 *default-log-port* "rmt:send-receive, case 9.1") - (if (not (server:check-if-running *toppath*)) - (server:start-and-wait *toppath*)) + ;; (if (not (server:check-if-running *toppath*)) + ;; (server:start-and-wait *toppath*)) (rmt:send-receive cmd rid params attemptnum: (+ attemptnum 1))))))))) + ;;DOT } + ;; (define (rmt:update-db-stats run-id rawcmd params duration) ;; (mutex-lock! *db-stats-mutex*) ;; (handle-exceptions ;; exn ;; (begin @@ -472,16 +528,16 @@ (define (rmt:set-tests-state-status run-id testnames currstate currstatus newstate newstatus) (rmt:send-receive 'set-tests-state-status run-id (list run-id testnames currstate currstatus newstate newstatus))) (define (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode) - (if (number? run-id) - (rmt:send-receive 'get-tests-for-run run-id (list run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode)) - (begin - (debug:print-error 0 *default-log-port* "rmt:get-tests-for-run called with bad run-id=" run-id) - (print-call-chain (current-error-port)) - '()))) + ;; (if (number? run-id) + (rmt:send-receive 'get-tests-for-run run-id (list run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode))) + ;; (begin + ;; (debug:print-error 0 *default-log-port* "rmt:get-tests-for-run called with bad run-id=" run-id) + ;; (print-call-chain (current-error-port)) + ;; '()))) ;; get stuff via synchash (define (rmt:synchash-get run-id proc synckey keynum params) (rmt:send-receive 'synchash-get run-id (list run-id proc synckey keynum params))) @@ -753,10 +809,13 @@ ;; T E S T D A T A ;;====================================================================== (define (rmt:read-test-data run-id test-id categorypatt #!key (work-area #f)) (rmt:send-receive 'read-test-data run-id (list run-id test-id categorypatt))) +(define (rmt:read-test-data* run-id test-id categorypatt varpatt #!key (work-area #f)) + (rmt:send-receive 'read-test-data* run-id (list run-id test-id categorypatt varpatt))) + (define (rmt:testmeta-add-record testname) (rmt:send-receive 'testmeta-add-record #f (list testname))) (define (rmt:testmeta-get-record testname) @@ -784,10 +843,23 @@ (define (rmt:tasks-set-state-given-param-key param-key new-state) (rmt:send-receive 'tasks-set-state-given-param-key #f (list param-key new-state))) (define (rmt:tasks-get-last target runname) (rmt:send-receive 'tasks-get-last #f (list target runname))) + +;;====================================================================== +;; N O S Y N C D B +;;====================================================================== + +(define (rmt:no-sync-set var val) + (rmt:send-receive 'no-sync-set #f `(,var ,val))) + +(define (rmt:no-sync-get/default var default) + (rmt:send-receive 'no-sync-get/default #f `(,var ,default))) + +(define (rmt:no-sync-del! var) + (rmt:send-receive 'no-sync-del! #f `(,var))) ;;====================================================================== ;; A R C H I V E S ;;====================================================================== Index: runconfig.scm ================================================================== --- runconfig.scm +++ runconfig.scm @@ -73,11 +73,11 @@ (let ((runconfigf (conc *toppath* "/runconfigs.config")) (targ (or (common:args-get-target) targ-from-db (get-environment-variable "MT_TARGET")))) (pop-directory) - (if (file-exists? runconfigf) + (if (common:file-exists? runconfigf) (setup-env-defaults runconfigf run-id #t keyvals environ-patt: (conc "(default" (if targ (conc "|" targ ")") ")"))) Index: runconfigs.config ================================================================== --- runconfigs.config +++ runconfigs.config @@ -7,12 +7,12 @@ # # example of a cron entry to run sync using db spec pgdb, with pgdb setting in file local.config # [a/b/c] -all:scheduled:sync cron= 0/5 * * * *;dbdest=pgdb;appendconf=/mfs/matt/.sysmaint/local.config -# quick:scheduled:sync cron= 0/5 * * * *;dbdest=pgdb;appendconf=/mfs/matt/.sysmaint/local.config +all:scheduled:sync cron= 0/5 * * * *;dbdest=pgdb;appendconf=/nfs/phoebe/disk1/home/mfs/matt/.sysmaint/local.config +# quick:scheduled:sync cron= 0/5 * * * *;dbdest=pgdb;appendconf=/nfs/phoebe/disk1/home/mfs/matt/.sysmaint/local.config # fast:scheduled:sync-prepend cron= 0/1 * * * *;dbdest=pgdb;appendconf=/mfs/matt/.sysmaint/local.config [scriptinc ./gentargets.sh #{getenv USER}] # [v1.23/45/67] Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -330,11 +330,11 @@ ;; force the starting of a server (debug:print 0 *default-log-port* "waiting on server...") (server:start-and-wait *toppath*) (runs:set-megatest-env-vars run-id inkeys: keys inrunname: runname) ;; these may be needed by the launching process - (set! runconf (if (file-exists? runconfigf) + (set! runconf (if (common:file-exists? runconfigf) (setup-env-defaults runconfigf run-id *already-seen-runconfig-info* keyvals target) (begin (debug:print 0 *default-log-port* "WARNING: You do not have a run config file: " runconfigf) #f))) @@ -513,11 +513,17 @@ (let ((reglen (configf:lookup *configdat* "setup" "runqueue"))) (if (> (length (hash-table-keys test-records)) 0) (let* ((keep-going #t) (run-queue-retries 5) (th1 (make-thread (lambda () - (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry)) + (handle-exceptions + exn + (begin + (print-call-chain) + (print " message: " ((condition-property-accessor 'exn 'message) exn))) + (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests + (any->number reglen) all-tests-registry))) ;; (handle-exceptions ;; exn ;; (begin ;; (print-call-chain (current-error-port)) ;; (debug:print-error 0 *default-log-port* "failure in runs:run-tests-queue thread, error: " ((condition-property-accessor 'exn 'message) exn)) @@ -834,12 +840,13 @@ (not (equal? x hed))) (runs:calc-not-completed prereqs-not-met))) (loop-list (list hed tal reg reruns)) ;; configure the load runner (numcpus (common:get-num-cpus #f)) - (maxload (string->number (or (configf:lookup *configdat* "jobtools" "maxload") "3"))) - (waitdelay (string->number (or (configf:lookup *configdat* "jobtools" "waitdelay") "60")))) + (maxload (string->number (or (configf:lookup *configdat* "jobtools" "maxload") "3.0"))) ;; use a non-number string to disable + (maxhomehostload (string->number (or (configf:lookup *configdat* "jobtools" "maxhomehostload") "1.2"))) ;; use a non-number string to disable + (waitdelay (string->number (or (configf:lookup *configdat* "jobtools" "waitdelay") "60")))) (debug:print-info 4 *default-log-port* "have-resources: " have-resources " prereqs-not-met: (" (string-intersperse (map (lambda (t) (if (vector? t) (conc (db:test-get-state t) "/" (db:test-get-status t)) @@ -939,12 +946,15 @@ ;; we are going to reset all the counters for test retries by setting a new hash table ;; this means they will increment only when nothing can be run (set! *max-tries-hash* (make-hash-table)) ;; well, first lets see if cpu load throttling is enabled. If so wait around until the ;; average cpu load is under the threshold before continuing - (if (configf:lookup *configdat* "jobtools" "maxload") ;; only gate if maxload is specified - (common:wait-for-cpuload maxload numcpus waitdelay)) + (if maxload ;; only gate if maxload is specified + (common:wait-for-cpuload maxload numcpus waitdelay)) + (if maxhomehostload + (common:wait-for-homehost-load maxhomehostload (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))) + (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry) (runs:incremental-print-results run-id) (hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running) (runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?) ;; (thread-sleep! *global-delta*) @@ -1262,11 +1272,14 @@ ))) (runs:dat-regfull-set! runsdat regfull) ;; every 15 minutes verify the server is there for this run (if (and (common:low-noise-print 240 "try start server" run-id) - (not (server:check-if-running *toppath*))) + (not (or (and *runremote* + (remote-server-url *runremote*) + (server:ping (remote-server-url *runremote*))) + (server:check-if-running *toppath*)))) (server:kind-run *toppath*)) (if (> num-running 0) (set! last-time-some-running (current-seconds))) @@ -1576,11 +1589,11 @@ (thread-sleep! 1) (loop))))) (if (not testdat) ;; should NOT happen (debug:print-error 0 *default-log-port* "failed to get test record for test-id " test-id)) (set! test-id (db:test-get-id testdat)) - (if (file-exists? test-path) + (if (common:file-exists? test-path) (change-directory test-path) (begin (debug:print-error 0 *default-log-port* "test run path not created before attempting to run the test. Perhaps you are running -remove-runs at the same time?") (change-directory *toppath*))) (case (if force ;; (args:get-arg "-force") @@ -1906,20 +1919,21 @@ ((archive) (if (and run-dir (not toplevel-with-children)) (let ((ddir (conc run-dir "/"))) (case (string->symbol (args:get-arg "-archive")) ((save save-remove keep-html) - (if (file-exists? ddir) + (if (common:file-exists? ddir) (debug:print-info 0 *default-log-port* "Estimating disk space usage for " test-fulln ": " (common:get-disk-space-used ddir))))))) (if (not (null? tal)) (loop (car tal)(cdr tal)))) ))) ) (if worker-thread (thread-join! worker-thread)))))) ;; remove the run if zero tests remain (if (eq? action 'remove-runs) - (let ((remtests (mt:get-tests-for-run (db:get-value-by-header run header "id") #f '("DELETED") '("n/a") not-in: #t))) + (let* ((run-id (db:get-value-by-header run header "id")) ;; NB// masks run-id from above? + (remtests (mt:get-tests-for-run run-id #f '("DELETED") '("n/a") not-in: #t))) (if (null? remtests) ;; no more tests remaining (let* ((dparts (string-split lasttpath "/")) (runpath (conc "/" (string-intersperse (take dparts (- (length dparts) 1)) "/")))) @@ -1939,11 +1953,11 @@ ) #t) (define (runs:remove-test-directory test mode) ;; remove-data-only) (let* ((run-dir (db:test-get-rundir test)) ;; run dir is from the link tree - (real-dir (if (file-exists? run-dir) + (real-dir (if (common:file-exists? run-dir) ;; (resolve-pathname run-dir) (common:nice-path run-dir) #f))) (case mode ((remove-data-only)(mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "CLEANING" "LOCKED" #f)) @@ -1950,14 +1964,14 @@ ((remove-all) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "REMOVING" "LOCKED" #f)) ((archive-remove) (mt:test-set-state-status-by-id (db:test-get-run_id test)(db:test-get-id test) "ARCHIVE_REMOVING" #f #f))) (debug:print-info 1 *default-log-port* "Attempting to remove " (if real-dir (conc " dir " real-dir " and ") "") " link " run-dir) (if (and real-dir (> (string-length real-dir) 5) - (file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc. + (common:file-exists? real-dir)) ;; bad heuristic but should prevent /tmp /home etc. (begin ;; let* ((realpath (resolve-pathname run-dir))) (debug:print-info 1 *default-log-port* "Recursively removing " real-dir) - (if (file-exists? real-dir) + (if (common:file-exists? real-dir) (runs:safe-delete-test-dir real-dir) (debug:print 0 *default-log-port* "WARNING: test dir " real-dir " appears to not exist or is not readable"))) (if real-dir (debug:print 0 *default-log-port* "WARNING: directory " real-dir " does not exist") (debug:print 0 *default-log-port* "WARNING: no real directory corrosponding to link " run-dir ", nothing done"))) @@ -2006,11 +2020,12 @@ (let (;; (db #f) (keys #f)) (if (launch:setup) (begin (full-runconfigs-read) ;; cache the run config - (launch:cache-config)) ;; do not cache here - need to be sure runconfigs is processed + ;; (launch:cache-config) ;; there are two independent config cache locations, turning this one off for now. MRW. + ) ;; do not cache here - need to be sure runconfigs is processed (begin (debug:print 0 *default-log-port* "Failed to setup, exiting") (exit 1))) @@ -2167,11 +2182,11 @@ (define (runs:clean-cache target runname toppath) (if target (if runname (let* ((linktree (common:get-linktree)) ;; (if toppath (configf:lookup *configdat* "setup" "linktree"))) (runtop (conc linktree "/" target "/" runname)) - (files (if (file-exists? runtop) + (files (if (common:file-exists? runtop) (append (glob (conc runtop "/.megatest*")) (glob (conc runtop "/.runconfig*"))) '()))) (if (null? files) (debug:print-info 0 *default-log-port* "No cached megatest or runconfigs files found. None removed.") Index: sdb.scm ================================================================== --- sdb.scm +++ sdb.scm @@ -22,11 +22,11 @@ (declare (unit sdb)) ;; (define (sdb:open fname) (let* ((dbpath (pathname-directory fname)) - (dbexists (let ((fe (file-exists? fname))) + (dbexists (let ((fe (common:file-exists? fname))) (if fe fe (begin (create-directory dbpath #t) #f)))) Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -210,11 +210,25 @@ (if (null? tal) (if (and limit (> (length new-res) limit)) new-res ;; (take new-res limit) <= need intelligent sorting before this will work new-res) - (loop (car tal)(cdr tal) new-res))))))))) + (loop (car tal)(cdr tal) new-res))))))))) + +(define (server:get-num-alive srvlst) + (let ((num-alive 0)) + (for-each + (lambda (server) + (match-let (((mod-time host port start-time pid) + server)) + (let* ((uptime (- (current-seconds) mod-time)) + (runtime (if start-time + (- mod-time start-time) + 0))) + (if (< uptime 5)(set! num-alive (+ num-alive 1)))))) + srvlst) + num-alive)) ;; given a list of servers get a list of valid servers, i.e. at least ;; 10 seconds old, has started and is less than 1 hour old and is ;; active (i.e. mod-time < 10 seconds ;; @@ -452,5 +466,79 @@ (string-intersperse (map number->string (u8vector->list (if res res (hostname->ip hostname)))) "."))) +;; moving this here as it needs access to db and cannot be in common. +;; +(define (server:writable-watchdog dbstruct) + (thread-sleep! 0.05) ;; delay for startup + (let ((legacy-sync (common:run-sync?)) + (debug-mode (debug:debug-mode 1)) + (last-time (current-seconds)) + (no-sync-db (db:open-no-sync-db)) + (this-wd-num (begin (mutex-lock! *wdnum*mutex) (let ((x *wdnum*)) (set! *wdnum* (add1 *wdnum*)) (mutex-unlock! *wdnum*mutex) x)))) + (set! *no-sync-db* no-sync-db) ;; make the no sync db available to api calls + (debug:print-info 2 *default-log-port* "Periodic sync thread started.") + (debug:print-info 3 *default-log-port* "watchdog starting. legacy-sync is " legacy-sync" pid="(current-process-id)" this-wd-num="this-wd-num) + (if (and legacy-sync (not *time-to-exit*)) + (let* (;;(dbstruct (db:setup)) + (mtdb (dbr:dbstruct-mtdb dbstruct)) + (mtpath (db:dbdat-get-path mtdb))) + (debug:print-info 0 *default-log-port* "Server running, periodic sync started.") + (let loop () + ;; sync for filesystem local db writes + ;; + (mutex-lock! *db-multi-sync-mutex*) + (let* ((need-sync (>= *db-last-access* *db-last-sync*)) ;; no sync since last write + (sync-in-progress *db-sync-in-progress*) + (should-sync (and (not *time-to-exit*) + (> (- (current-seconds) *db-last-sync*) 5))) ;; sync every five seconds minimum + (start-time (current-seconds)) + (mt-mod-time (file-modification-time mtpath)) + (recently-synced (< (- start-time mt-mod-time) 4)) + (will-sync (and (or need-sync should-sync) + (not sync-in-progress) + (not recently-synced)))) + (debug:print-info 13 *default-log-port* "WD writable-watchdog top of loop. need-sync="need-sync" sync-in-progress="sync-in-progress" should-sync="should-sync" start-time="start-time" mt-mod-time="mt-mod-time" recently-synced="recently-synced" will-sync="will-sync) + ;; (if recently-synced (debug:print-info 0 *default-log-port* "Skipping sync due to recently-synced flag=" recently-synced)) + ;; (debug:print-info 0 *default-log-port* "need-sync: " need-sync " sync-in-progress: " sync-in-progress " should-sync: " should-sync " will-sync: " will-sync) + (if will-sync (set! *db-sync-in-progress* #t)) + (mutex-unlock! *db-multi-sync-mutex*) + (if will-sync + (let ((res (db:sync-to-megatest.db dbstruct no-sync-db: no-sync-db))) ;; did we sync any data? If so need to set the db touched flag to keep the server alive + (if (> res 0) ;; some records were transferred, keep the db alive + (begin + (mutex-lock! *heartbeat-mutex*) + (set! *db-last-access* (current-seconds)) + (mutex-unlock! *heartbeat-mutex*) + (debug:print-info 0 *default-log-port* "sync called, " res " records transferred.")) + (debug:print-info 2 *default-log-port* "sync called but zero records transferred")))) + (if will-sync + (begin + (mutex-lock! *db-multi-sync-mutex*) + (set! *db-sync-in-progress* #f) + (set! *db-last-sync* start-time) + (mutex-unlock! *db-multi-sync-mutex*))) + (if (and debug-mode + (> (- start-time last-time) 60)) + (begin + (set! last-time start-time) + (debug:print-info 4 *default-log-port* "timestamp -> " (seconds->time-string (current-seconds)) ", time since start -> " (seconds->hr-min-sec (- (current-seconds) *time-zero*)))))) + + ;; keep going unless time to exit + ;; + (if (not *time-to-exit*) + (let delay-loop ((count 0)) + ;;(debug:print-info 13 *default-log-port* "delay-loop top; count="count" pid="(current-process-id)" this-wd-num="this-wd-num" *time-to-exit*="*time-to-exit*) + + (if (and (not *time-to-exit*) + (< count 6)) ;; was 11, changing to 4. + (begin + (thread-sleep! 1) + (delay-loop (+ count 1)))) + (if (not *time-to-exit*) (loop)))) + ;; time to exit, close the no-sync db here + (db:no-sync-close-db no-sync-db) + (if (common:low-noise-print 30) + (debug:print-info 0 *default-log-port* "Exiting watchdog timer, *time-to-exit* = " *time-to-exit*" pid="(current-process-id)" this-wd-num="this-wd-num))))))) + Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -39,21 +39,21 @@ (print-call-chain (current-error-port)) (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 5 *default-log-port* " exn=" (condition->list exn)) (debug:print 0 *default-log-port* "tasks:wait-on-journal failed. Continuing on, you can ignore this call-chain") #t) ;; if stuff goes wrong just allow it to move on - (let loop ((journal-exists (file-exists? fullpath)) + (let loop ((journal-exists (common:file-exists? fullpath)) (count n)) ;; wait ten times ... (if journal-exists (begin (if (and waiting-msg (eq? (modulo n 30) 0)) (debug:print 0 *default-log-port* waiting-msg)) (if (> count 0) (begin (thread-sleep! 1) - (loop (file-exists? fullpath) + (loop (common:file-exists? fullpath) (- count 1))) (begin (debug:print 0 *default-log-port* "ERROR: removing the journal file " fullpath ", this is not good. Look for disk full, write access and other issues.") (if remove (system (conc "rm -rf " fullpath))) #f))) @@ -97,11 +97,11 @@ (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 5 *default-log-port* " exn=" (condition->list exn)))) (let* ((dbpath (db:dbfile-path )) ;; (tasks:get-task-db-path)) (dbfile (conc dbpath "/monitor.db")) (avail (tasks:wait-on-journal dbpath 10)) ;; wait up to about 10 seconds for the journal to go away - (exists (file-exists? dbpath)) + (exists (common:file-exists? dbpath)) (write-access (file-write-access? dbpath)) (mdb (cond ;; what the hek is *toppath* doing here? ((and (string? *toppath*)(file-write-access? *toppath*)) (sqlite3:open-database dbfile)) ((file-read-access? dbpath) (sqlite3:open-database dbfile)) @@ -194,12 +194,12 @@ (system (conc "nbfake kill "kill-switch" "pid)) (when logfile (thread-sleep! 0.5) - (if (file-exists? gzfile) (delete-file gzfile)) - (system (conc "gzip "logfile)) + (if (common:file-exists? gzfile) (delete-file gzfile)) + (system (conc "gzip " logfile)) (unsetenv "TARGETHOST_LOGF") (unsetenv "TARGETHOST")))) ADDED tcmt.scm Index: tcmt.scm ================================================================== --- /dev/null +++ tcmt.scm @@ -0,0 +1,172 @@ +;; Copyright 2006-2017, Matthew Welland. +;; +;; This program is made available under the GNU GPL version 2.0 or +;; greater. See the accompanying file COPYING for details. +;; +;; This program is distributed WITHOUT ANY WARRANTY; without even the +;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +;; PURPOSE. +;; +;;====================================================================== +;; +;; Wrapper to enable running Megatest flows under teamcity +;; +;; 1. Run the megatest process and pass it all the needed parameters +;; 2. Every five seconds check for state/status changes and print the info +;; + +(use srfi-1 posix srfi-69 srfi-18 regex) + +(declare (uses margs)) +(declare (uses rmt)) +(declare (uses common)) +(declare (uses megatest-version)) + +(include "megatest-fossil-hash.scm") +(include "db_records.scm") + +(define origargs (cdr (argv))) +(define remargs (args:get-args + (argv) + `( "-target" + "-reqtarg" + "-runname" + ) + `("-tc-repl" + ) + args:arg-hash + 0)) + +;; ##teamcity[testStarted name='suite.testName'] +;; ##teamcity[testStdOut name='suite.testName' out='text'] +;; ##teamcity[testStdErr name='suite.testName' out='error text'] +;; ##teamcity[testFailed name='suite.testName' message='failure message' details='message and stack trace'] +;; ##teamcity[testFinished name='suite.testName' duration='50'] +;; + +(define (print-changes-since data run-ids last-update tsname target runname) + (let ((now (current-seconds))) + (handle-exceptions + exn + (begin (print-call-chain) (print "Error message: " ((condition-property-accessor 'exn 'message) exn))) + (for-each + (lambda (run-id) + (let* ((tests (rmt:get-tests-for-run run-id "%" '() '() #f #f #f #f #f #f last-update #f))) + ;; (print "DEBUG: got tests=" tests) + (for-each + (lambda (testdat) + (let* ((testn (db:test-get-fullname testdat)) + (testname (db:test-get-testname testdat)) + (itempath (db:test-get-item-path testdat)) + (tctname (if (string=? itempath "") testname (conc testname "." (string-translate itempath "/" ".")))) + (state (db:test-get-state testdat)) + (status (db:test-get-status testdat)) + (duration (or (any->number (db:test-get-run_duration testdat)) 0)) + (comment (db:test-get-comment testdat)) + (logfile (db:test-get-final_logf testdat)) + (prevstat (hash-table-ref/default data testn #f)) + (newstat (if (equal? state "RUNNING") + "RUNNING" + (if (equal? state "COMPLETED") + status + "UNK"))) + (cmtstr (if comment + (conc " message='" comment "' ") + " ")) + (details (if (string-match ".*html$" logfile) + (conc " details='" *toppath* "/lt/" target "/" runname "/" testname (if (equal? itempath "") "/" (conc "/" itempath "/")) logfile "' ") + ""))) + + ;; (print "DEBUG: testn=" testn " state=" state " status=" status " prevstat=" prevstat " newstat=" newstat) + (if (or (not prevstat) + (not (equal? prevstat newstat))) + (begin + (case (string->symbol newstat) + ((UNK) ) ;; do nothing + ((RUNNING) (print "##teamcity[testStarted name='" tctname "']")) + ((PASS SKIP) (print "##teamcity[testFinished name='" tctname "' duration='" (* 1e3 duration) "'" cmtstr details " ]")) + (else + (print "##teamcity[testFailed name='" tctname "' " cmtstr details " ]"))) + (flush-output) + (hash-table-set! data testn newstat))))) + tests))) + run-ids)) + now)) + +(define (monitor pid) + (let ((run-ids #f) + (testdat (make-hash-table)) + (keys #f) + (last-update 0) + (target (or (args:get-arg "-target") + (args:get-arg "-reqtarg"))) + (runname (args:get-arg "-runname")) + (tsname #f)) + (if (and target runname) + (begin + (launch:setup) + (set! keys (rmt:get-keys)))) + (set! tsname (common:get-testsuite-name)) + (print "TCMT: for testsuite=" tsname " found runname=" runname ", target=" target ", keys=" keys " and successfully ran launch:setup.") + (let loop () + (handle-exceptions + exn + ;; (print "Process done.") + (begin (print-call-chain) (print "Error message: " ((condition-property-accessor 'exn 'message) exn))) + (let-values (((pidres exittype exitstatus) + (process-wait pid #t))) + (if (and keys + (or (not run-ids) + (null? run-ids))) + (let* ((runs (rmt:get-runs-by-patt keys + runname + target + #f ;; offset + #f ;; limit + #f ;; fields + 0 ;; last-update + )) + (header (db:get-header runs)) + (rows (db:get-rows runs)) + (run-ids-in (map (lambda (row) + (db:get-value-by-header row header "id")) + rows))) + (set! run-ids run-ids-in))) + ;; (print "TCMT: pidres=" pidres " exittype=" exittype " exitstatus=" exitstatus " run-ids=" run-ids) + (if keys + (set! last-update (print-changes-since testdat run-ids last-update tsname target runname))) + (if (eq? pidres 0) + (begin + (thread-sleep! 3) + (loop)) + (begin + ;; (print "TCMT: pidres=" pidres " exittype=" exittype " exitstatus=" exitstatus " run-ids=" run-ids) + (print "TCMT: All done.") + ))))))) + +;; (if (not (eq? pidres 0)) ;; (not exitstatus)) +;; (begin +;; (thread-sleep! 3) +;; (loop)) +;; (print "Process: megatest " (string-intersperse origargs " ") " is done."))))) +(define (main) + (let* ((mt-done #f) + (pid #f) + (th1 (make-thread (lambda () + (print "Running megatest " (string-intersperse origargs " ")) + (set! pid (process-run "megatest" origargs))) + "Megatest job")) + (th2 (make-thread (lambda () + (monitor pid)) + "Monitor job"))) + (thread-start! th1) + (thread-sleep! 1) ;; give the process time to get going + (thread-start! th2) + (thread-join! th2))) + +(if (args:get-arg "-tc-repl") + (repl) + (main)) + +;; (process-wait) + Index: tdb.scm ================================================================== --- tdb.scm +++ tdb.scm @@ -50,11 +50,11 @@ (debug:print-info 11 *default-log-port* "open-test-db " work-area) (if (and work-area (directory? work-area) (file-read-access? work-area)) (let* ((dbpath (conc work-area "/testdat.db")) - (dbexists (file-exists? dbpath)) + (dbexists (common:file-exists? dbpath)) (work-area-writeable (file-write-access? work-area)) (db (handle-exceptions ;; open the db if area writeable or db pre-existing. open in-mem otherwise. if exception, open in-mem exn (begin (print-call-chain (current-error-port)) Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -61,16 +61,16 @@ (define (tests:get-valid-tests test-registry tests-paths) (if (null? tests-paths) test-registry (let loop ((hed (car tests-paths)) (tal (cdr tests-paths))) - (if (file-exists? hed) + (if (common:file-exists? hed) (for-each (lambda (test-path) (let* ((tname (last (string-split test-path "/"))) (tconfig (conc test-path "/testconfig"))) (if (and (not (hash-table-ref/default test-registry tname #f)) - (file-exists? tconfig)) + (common:file-exists? tconfig)) (hash-table-set! test-registry tname test-path)))) (glob (conc hed "/*")))) (if (null? tal) test-registry (loop (car tal)(cdr tal)))))) @@ -331,11 +331,11 @@ (db:test-get-rundir prev-testdat)) ;; ) (waivers (if testconfig (configf:section-vars testconfig "waivers") '())) (waiver-rx (regexp "^(\\S+)\\s+(.*)$")) (diff-rule "diff %file1% %file2%") (logpro-rule "diff %file1% %file2% | logpro %waivername%.logpro %waivername%.html")) - (if (not (file-exists? test-rundir)) + (if (not (common:file-exists? test-rundir)) (begin (debug:print-error 0 *default-log-port* "test run directory is gone, cannot propagate waiver") #f) (begin (push-directory test-rundir) @@ -348,11 +348,11 @@ (wparts (if waiver (string-match waiver-rx waiver) #f)) (waiver-rule (if wparts (cadr wparts) #f)) (waiver-glob (if wparts (caddr wparts) #f)) (logpro-file (if waiver (let ((fname (conc hed ".logpro"))) - (if (file-exists? fname) + (if (common:file-exists? fname) fname (begin (debug:print 0 *default-log-port* "INFO: No logpro file " fname " falling back to diff") #f))) #f)) @@ -446,19 +446,19 @@ ;; (sqlite3:execute db "UPDATE tests SET first_warn=? WHERE run_id=? AND testname=? AND item_path=?;" val run-id test-name item-path))) (let ((category (hash-table-ref/default otherdat ":category" "")) (variable (hash-table-ref/default otherdat ":variable" "")) (value (hash-table-ref/default otherdat ":value" #f)) - (expected (hash-table-ref/default otherdat ":expected" #f)) - (tol (hash-table-ref/default otherdat ":tol" #f)) + (expected (hash-table-ref/default otherdat ":expected" "n/a")) + (tol (hash-table-ref/default otherdat ":tol" "n/a")) (units (hash-table-ref/default otherdat ":units" "")) (type (hash-table-ref/default otherdat ":type" "")) (dcomment (hash-table-ref/default otherdat ":comment" ""))) (debug:print 4 *default-log-port* "category: " category ", variable: " variable ", value: " value ", expected: " expected ", tol: " tol ", units: " units) - (if (and value expected tol) ;; all three required + (if (and value) ;; require only value; BB was- all three required (let ((dat (conc category "," variable "," value "," expected "," tol "," @@ -465,11 +465,13 @@ units "," dcomment ",," ;; extra comma for status type ))) ;; This was run remote, don't think that makes sense. Perhaps not, but that is the easiest path for the moment. (rmt:csv->test-data run-id test-id - dat)))) + dat) + (thread-sleep! 10) ;; add 10 second delay before quit incase rmt needs time to start a server. + ))) ;; need to update the top test record if PASS or FAIL and this is a subtest ;;;;;; (if (not (equal? item-path "")) ;;;;;; (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status #f) ;;;;;) @@ -865,11 +867,11 @@ '() (lambda (x p) (let* ((targ-path (string-intersperse p "/")) (full-path (conc linktree "/" targ-path)) (run-name (car (reverse p)))) - (if (and (file-exists? full-path) + (if (and (common:file-exists? full-path) (directory? full-path) (file-write-access? full-path)) (s:a run-name 'href (conc targ-path "/run-summary.html")) (begin (debug:print 0 *default-log-port* "INFO: Can't create " targ-path "/run-summary.html") @@ -904,11 +906,11 @@ path-parts)) test-dats)) (tests-htree (common:list->htree tests-tree-dat)) (html-dir (conc linktree "/" (string-intersperse run-dir "/"))) (html-path (conc html-dir "/run-summary.html")) - (oup (if (and (file-exists? html-dir) + (oup (if (and (common:file-exists? html-dir) (directory? html-dir) (file-write-access? html-dir)) (open-output-file html-path) #f))) ;; (print "run-dir: " run-dir ", tests-tree-dat: " tests-tree-dat) @@ -932,21 +934,21 @@ (item-path ;; (if (> (length p) 2) ;; test-name + run-name (string-intersperse p "/")) (full-targ (conc html-dir "/" targ-path)) (std-file (conc full-targ "/test-summary.html")) (alt-file (conc full-targ "/megatest-rollup-" test-name ".html")) - (html-file (if (file-exists? alt-file) + (html-file (if (common:file-exists? alt-file) alt-file std-file)) (run-name (car (reverse p)))) - (if (and (not (file-exists? full-targ)) + (if (and (not (common:file-exists? full-targ)) (directory? full-targ) (file-write-access? full-targ)) (tests:summarize-test run-id (rmt:get-test-id run-id test-name item-path))) - (if (file-exists? full-targ) + (if (common:file-exists? full-targ) (s:a run-name 'href html-file) (begin (debug:print 0 *default-log-port* "ERROR: can't access " full-targ) (conc "No summary for " run-name))))) )))))) @@ -1155,11 +1157,11 @@ ;; Gather data from test/task specifications ;;====================================================================== ;; (define (tests:get-valid-tests testsdir test-patts) ;; #!key (test-names '())) ;; (let ((tests (glob (conc testsdir "/tests/*")))) ;; " (string-translate patt "%" "*"))))) -;; (set! tests (filter (lambda (test)(file-exists? (conc test "/testconfig"))) tests)) +;; (set! tests (filter (lambda (test)(common:file-exists? (conc test "/testconfig"))) tests)) ;; (delete-duplicates ;; (filter (lambda (testname) ;; (tests:match test-patts testname #f)) ;; (map (lambda (testp) ;; (last (string-split testp "/"))) @@ -1190,11 +1192,11 @@ (let* ((use-cache (common:use-cache?)) (cache-path (tests:get-test-path-from-environment)) (cache-file (and cache-path (conc cache-path "/.testconfig"))) (cache-exists (and cache-file (not force-create) ;; if force-create then pretend there is no cache to read - (file-exists? cache-file))) + (common:file-exists? cache-file))) (cached-dat (if (and (not force-create) cache-exists use-cache) (handle-exceptions exn @@ -1214,11 +1216,11 @@ (let* ((treg (or test-registry (tests:get-all))) (test-path (or (hash-table-ref/default treg test-name #f) (conc *toppath* "/tests/" test-name))) (test-configf (conc test-path "/testconfig")) - (testexists (and (file-exists? test-configf)(file-read-access? test-configf))) + (testexists (and (common:file-exists? test-configf)(file-read-access? test-configf))) (tcfg (if testexists (read-config test-configf #f system-allowed environ-patt: (if system-allowed "pre-launch-env-vars" #f)) @@ -1383,11 +1385,11 @@ ;; (define (tests:lazy-dot testrecords outtype sizex sizey) (let ((dfile (conc "/tmp/." (current-user-name) "-" (server:mk-signature) ".dot")) (fname (conc "/tmp/." (current-user-name) "-" (server:mk-signature) ".dotdat"))) (tests:write-dot-file testrecords dfile sizex sizey) - (if (file-exists? fname) + (if (common:file-exists? fname) (let ((res (with-input-from-file fname (lambda () (read-lines))))) (system (conc "env -i PATH=$PATH dot -T " outtype " < " dfile " > " fname "&")) res) Index: tests/Makefile ================================================================== --- tests/Makefile +++ tests/Makefile @@ -173,11 +173,11 @@ cd mintest;$(MEGATEST) -server - -debug $(DEBUG) > server.log 2> server.log & sleep 3 cd mintest;$(DASHBOARD) -rows 18 & cleanprep : ../*.scm Makefile */*.config build - mkdir -p fullrun/tmp/mt_runs fullrun/tmp/mt_links /tmp/$(USER)/adisk1 + mkdir -p fullrun/tmp/mt_runs fullrun/tmp/mt_links /tmp/$(USER)/adisk1 fullrun/logs rm -f */logging.db touch cleanprep fullprep : cleanprep cd fullrun;$(MEGATEST) -remove-runs :runname $(RUNNAME)% -target %/%/% -testpatt %/% Index: tests/fdktestqa/testqa/megatest.config ================================================================== --- tests/fdktestqa/testqa/megatest.config +++ tests/fdktestqa/testqa/megatest.config @@ -1,12 +1,14 @@ [setup] testcopycmd cp --remove-destination -rlv TEST_SRC_PATH/. TEST_TARG_PATH/. >> TEST_TARG_PATH/mt_launch.log 2>> TEST_TARG_PATH/mt_launch.log # launchwait no -launch-delay 0 +launch-delay 0.1 [server] -runtime 180 +# runtime 180 +# timeout is in hours, this is how long the server will stay alive when not being used. +timeout 0.1 # All these are overridden in ../fdk.config # [jobtools] # launcher nbfake # launcher bsub -q priority -o $MT_TEST_RUN_DIR/openlava.log Index: tests/fdktestqa/testqa/tests/bigrun2/testconfig ================================================================== --- tests/fdktestqa/testqa/tests/bigrun2/testconfig +++ tests/fdktestqa/testqa/tests/bigrun2/testconfig @@ -9,11 +9,11 @@ mode itemwait itemmap .*/ # Iteration for your tests are controlled by the items section [items] -NUMBER #{scheme (string-intersperse (map (lambda (x)(conc "blah/" x)) \ +NUMBER #{scheme (string-intersperse (map (lambda (x)(conc (if (getenv "USEBLAH") "blah/" "") x)) \ (map number->string (sort (let loop ((a 0)(res '())) \ (if (<= a (or (any->number (get-environment-variable "NUMTESTS")) 2500)) \ (loop (+ a 1)(cons a res)) res)) <))) " ")} Index: tests/tests.scm ================================================================== --- tests/tests.scm +++ tests/tests.scm @@ -17,10 +17,26 @@ (import srfi-18) ;; (require-extension zmq) ;; (import zmq) (define test-work-dir (current-directory)) + +;; given list of lists +;; ( ( msg expected param1 param2 ...) +;; ( ... ) ) +;; apply test to all +;; +(define (test-batch proc pname inlst #!key (post-proc #f)) + (for-each + (lambda (spec) + (let ((msg (conc pname " " (car spec))) + (result (cadr spec)) + (params (cddr spec))) + (if post-proc + (test msg result (post-proc (apply proc params))) + (test msg result (apply proc params))))) + inlst)) ;; read in all the _record files (let ((files (glob "*_records.scm"))) (for-each (lambda (file) Index: tests/unittests/all-rmt.scm ================================================================== --- tests/unittests/all-rmt.scm +++ tests/unittests/all-rmt.scm @@ -28,13 +28,25 @@ ;; DEF (rmt:kill-server run-id) ;; DEF (rmt:start-server run-id) (test #f '(#t "successful login")(rmt:login #f)) ;; DEF (rmt:login-no-auto-client-setup connection-info) (test #f #t (pair? (rmt:get-latest-host-load (get-host-name)))) + +;; get-latest-host-load does a lookup in the db, it won't return a useful value unless +;; a test ran recently on host +(test-batch rmt:get-latest-host-load + "rmt:get-latest-host-load" + (list (list "localhost" #t (get-host-name)) + (list "not-a-host" #t "not-a-host" )) + post-proc: pair?) + (test #f #t (list? (rmt:get-changed-record-ids 0))) + (test #f #f (begin (runs:update-all-test_meta #f) #f)) + (test #f '("test1" "test2")(sort (alist-ref "tagtwo" (hash-table->alist (rmt:get-tests-tags)) equal?) string<=)) + (test #f '() (rmt:get-key-val-pairs 0)) (test #f '("SYSTEM" "RELEASE") (rmt:get-keys)) (test #f '("SYSTEM" "RELEASE") (rmt:get-keys-write)) ;; dummy query to force server start (test #f '() (rmt:get-key-vals 1)) (test #f (vector '("SYSTEM" "RELEASE") '()) (rmt:get-targets)) @@ -82,11 +94,27 @@ (test #f '()(rmt:get-prev-run-ids 1)) (test #f #t (begin (rmt:lock/unlock-run 1 #t #f "mikey") #t)) (test #f "JUSTFINE" (rmt:get-run-status 1)) (test #f #t (begin (rmt:set-run-status 1 "NOTFINE" msg: "A message") #t)) (test #f #t (begin (rmt:update-run-event_time 1) #t)) + ;; (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit fields last-runs-update) ;; fields of #f uses default +;; +(let ((keys (rmt:get-keys)) + (rnp "%") ;; run name patt + (tpt "%/%")) ;; target patt + (test-batch rmt:get-runs-by-patt + "rmt:get-runs-by-patt" + (list (list "t=0" #t keys rnp tpt #f #f #f 0) + (list "t=current" #f keys rnp tpt #f #f #f (+ 100 (current-seconds))) ;; should be no records from the future + ) + post-proc: (lambda (res) + ;; (print "rmt:get-runs-by-patt returned: " res) + (and (vector? res) + (let ((rows (vector-ref res 1))) + (> (length rows) 0)))))) + ;; (rmt:find-and-mark-incomplete run-id ovr-deadtime) ;; (rmt:get-main-run-stats run-id) ;; (rmt:get-var varname) ;; (rmt:set-var varname value) ;; (rmt:find-and-mark-incomplete-all-runs #!key (ovr-deadtime #f)) Index: tests/unittests/tests.scm ================================================================== --- tests/unittests/tests.scm +++ tests/unittests/tests.scm @@ -12,67 +12,70 @@ ;; (non-completed (runs:calc-not-completed prereqs-not-met)) ;; (runnables (runs:calc-runnable prereqs-not-met))) ;; ;; ;; - (define user (current-user-name)) (define runname "mytestrun") (define keys (rmt:get-keys)) (define runinfo #f) (define keyvals '(("SYSTEM" "abc")("RELEASE" "def"))) (define header (list "SYSTEM" "RELEASE" "id" "runname" "state" "status" "owner" "event_time")) +(define contour #f) (define run-id 1) - +(define new-comment #f) ;; Create a run -(test #f 1 (rmt:register-run keyvals runname "new" "n/a" user)) +(test #f 1 (rmt:register-run keyvals runname "new" "n/a" user contour)) (test #f #t (rmt:general-call 'register-test run-id run-id "test-one" "")) (test #f #t (rmt:general-call 'register-test run-id run-id "test-two" "")) (test #f #t (rmt:general-call 'register-test run-id run-id "test-three" "")) (test #f #t (rmt:general-call 'register-test run-id run-id "test-four" "")) -(rmt:test-set-state-status-by-id run-id (rmt:get-test-id run-id "test-one" "") "COMPLETED" "FAIL" "") -(rmt:test-set-state-status-by-id run-id (rmt:get-test-id run-id "test-two" "") "COMPLETED" "PASS" "") -(rmt:test-set-state-status-by-id run-id (rmt:get-test-id run-id "test-three" "") "RUNNING" "n/a" "") -(rmt:test-set-state-status-by-id run-id (rmt:get-test-id run-id "test-four" "") "COMPLETED" "WARN" "") + +;; (define (rmt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment) +(rmt:test-set-state-status-by-id run-id (rmt:get-test-id run-id "test-one" "") "COMPLETED" "FAIL" new-comment) +(rmt:test-set-state-status-by-id run-id (rmt:get-test-id run-id "test-two" "") "COMPLETED" "PASS" new-comment) +(rmt:test-set-state-status-by-id run-id (rmt:get-test-id run-id "test-three" "") "RUNNING" "n/a" new-comment) +(rmt:test-set-state-status-by-id run-id (rmt:get-test-id run-id "test-four" "") "COMPLETED" "WARN" new-comment) -(print "MODE=not in") -(test #f '() +(test "MODE=not in" + '() (filter (lambda (y) (equal? y "FAIL")) ;; any FAIL in the output list? (map (lambda (x)(vector-ref x 4)) (rmt:get-tests-for-run run-id "%/%" '() '("FAIL") #f #f #t 'event_time "DESC" 'shortlist 0 'dashboard)))) -(print "MODE=in") -(test #f '("FAIL") +(test "MODE=in" + '("FAIL") (map (lambda (x)(vector-ref x 4)) (rmt:get-tests-for-run run-id "%/%" '() '("FAIL") #f #f #f 'event_time "DESC" 'shortlist 0 'dashboard))) (set! *verbosity* 1) -(print "MODE=in, state in RUNNING") ;; (set! *verbosity* 8) -(test #f '("RUNNING") +(test "MODE=in, state in RUNNING" '("RUNNING") (map (lambda (x)(vector-ref x 3)) (rmt:get-tests-for-run run-id "%/%" '("RUNNING") '() #f #f #f 'event_time "DESC" 'shortlist 0 'dashboard))) (set! *verbosity* 1) -(print "MODE=in, state in RUNNING and status IN WARN") ;; (set! *verbosity* 8) -(test #f '(("RUNNING" . "n/a") ("COMPLETED" . "WARN")) - (map - (lambda (x) - (cons (vector-ref x 3)(vector-ref x 4))) - (rmt:get-tests-for-run run-id "%/%" '("RUNNING") '("WARN") #f #f #f 'event_time "DESC" 'shortlist 0 'dashboard))) +;;(define (rmt:get-tests-for-run run-id testpatt states statuses offset limit not-in sort-by sort-order qryvals last-update mode) +(test + "MODE=in, state in RUNNING and status IN WARN" + '(("COMPLETED" . "WARN") ("RUNNING" . "n/a") ) + (map + (lambda (x) + (cons (vector-ref x 3)(vector-ref x 4))) + (rmt:get-tests-for-run run-id "%/%" '("RUNNING") '("WARN") #f #f #f 'event_time "DESC" 'shortlist 0 'dashboard))) (set! *verbosity* 1) -(print "MODE=not in, state in RUNNING and status IN WARN") (set! *verbosity* 8) -(test #f '(("DELETED" . "n/a") ("COMPLETED" . "PASS") ("COMPLETED" . "FAIL")) +(test "MODE=not in, state in RUNNING and status IN WARN" + '(("COMPLETED" . "PASS") ("COMPLETED" . "FAIL")) (map (lambda (x) (cons (vector-ref x 3)(vector-ref x 4))) (rmt:get-tests-for-run run-id "%/%" '("RUNNING") '("WARN") #f #f #t 'event_time "DESC" 'shortlist 0 'dashboard))) (set! *verbosity* 1) Index: utils/mk_wrapper ================================================================== --- utils/mk_wrapper +++ utils/mk_wrapper @@ -51,10 +51,14 @@ fi fi EOF fi + +cat >> $target << EOF +if [[ \$(ulimit -a | grep 'open files' | awk '{print \$4}') -gt 10000 ]];then ulimit -n 10000;fi +EOF # echo "#!/bin/bash" > $target # echo "exec $prefix/bin/.\$(lsb_release -sr)/$cmd \"\$@\"" >> $target echo "lsbr=\$(lsb_release -sr)" >> $target