Index: Makefile ================================================================== --- Makefile +++ Makefile @@ -28,11 +28,11 @@ ezsteps.scm lock-queue.scm sdb.scm rmt.scm api.scm \ subrun.scm portlogger.scm archive.scm env.scm \ diff-report.scm cgisetup/models/pgdb.scm # module source files -MSRCFILES = dbmod.scm +MSRCFILES = dbmod.scm adjutant.scm mutils.scm mttop.scm # ftail.scm rmtmod.scm commonmod.scm removed # MSRCFILES = ducttape-lib.scm pkts.scm stml2.scm cookie.scm mutils.scm \ # mtargs.scm commonmod.scm dbmod.scm adjutant.scm ulex.scm \ # rmtmod.scm apimod.scm @@ -93,11 +93,12 @@ csc $(CSCOPTS) $(OFILES) dashboard.scm $(GOFILES) $(MOFILES) $(MOIMPFILES) -o dboard mtut: $(OFILES) $(MOFILES) megatest-fossil-hash.scm mtut.scm megatest-version.scm csc $(CSCOPTS) $(OFILES) $(MOFILES) mtut.scm -o mtut -# include makefile.inc +include makefile.inc +include chicken.makefile TCMTOBJS = \ api.o \ archive.o \ cgisetup/models/pgdb.o \ Index: NOTES ================================================================== --- NOTES +++ NOTES @@ -12,10 +12,15 @@ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Megatest. If not, see . + +(server:writable-watchdog-bruteforce dbstruct) + +(server:writable-watchdog-deltasync dbstruct) + ===================================================================== NOTES from looking at branch v1.62-rpc ===================================================================== Index: TODO ================================================================== --- TODO +++ TODO @@ -16,10 +16,17 @@ # along with Megatest. If not, see . TODO ==== +WW38 +. Add test_rundat to no-sync ==> correction, put in /.meta/test-run.dat +. Add STATE/STATUS transitions to .meta/test-run.dat or similar +. Swizzle update-test-rundat to operate on no-sync +. Swizzle update-run-duration, -uname-host and cpuload-diskfree to no-sync +. On state/status change update tests table with duration + WW15 . fill newview matrix with data, filter pipeline gui elements . improve [script], especially indent handling WW16 Index: adjutant.scm ================================================================== --- adjutant.scm +++ adjutant.scm @@ -22,12 +22,23 @@ (module adjutant * (import scheme chicken data-structures extras files) (import (prefix sqlite3 sqlite3:) posix typed-records srfi-18 srfi-69 - md5 message-digest + md5 message-digest matchable regex srfi-1) -(define (adjutant-run) - (print "Running the adjutant!")) +(define (adjutant-run host-type rmt:no-sync-take-job) + (print "Running the adjutant!") + (let loop ((wait-count 0)) + (if (< wait-count 10) ;; 6 x 10 seconds = one minute + (let* ((dat (rmt:no-sync-take-job host-type))) + (match dat + ((id ht vars exekey cmdline state event-time last-update) + (system cmdline) + (loop 0)) + (else + (thread-sleep! 10) + (loop (+ wait-count 1))))) + (print "I'm bored. Exiting.")))) ) Index: api.scm ================================================================== --- api.scm +++ api.scm @@ -159,21 +159,21 @@ ((> *api-process-request-count* 20) ;; 20) (debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.") (set! *server-overloaded* #t) (vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor! (else - (let* ((cmd-in (vector-ref dat 0)) + (let* ((cmd-in (common:safe-vector-ref dat 0 'nocmd)) (cmd (if (symbol? cmd-in) cmd-in (string->symbol cmd-in))) - (params (vector-ref dat 1)) + (params (common:safe-vector-ref dat 1 '())) (start-t (current-milliseconds)) (readonly-mode (dbr:dbstruct-read-only dbstruct)) (readonly-command (member cmd api:read-only-queries)) (writecmd-in-readonly-mode (and readonly-mode (not readonly-command))) #;(foo (begin - (common:telemetry-log (conc "api-in:"(->string cmd)) + #;(common:telemetry-log (conc "api-in:"(->string cmd)) payload: `((params . ,params))) #t)) (res (if writecmd-in-readonly-mode @@ -180,10 +180,12 @@ (conc "attempt to run write command "cmd" on a read-only database") (case cmd ;;=============================================== ;; READ/WRITE QUERIES ;;=============================================== + + ((nocmd) '(#f "All broken!")) ((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl ;; SERVERS ((start-server) (apply server:kind-run params)) @@ -256,17 +258,20 @@ ;; NO SYNC DB ((no-sync-set) (apply db:no-sync-set *no-sync-db* params)) ((no-sync-get/default) (apply db:no-sync-get/default *no-sync-db* params)) ((no-sync-del!) (apply db:no-sync-del! *no-sync-db* params)) ((no-sync-get-lock) (apply db:no-sync-get-lock *no-sync-db* params)) + ((no-sync-add-job) (apply db:no-sync-add-job *no-sync-db* params)) + ((no-sync-take-job) (apply db:no-sync-take-job *no-sync-db* params)) + ((no-sync-job-records-clean) (apply db:no-sync-job-records-clean *no-sync-db* params)) ;; ARCHIVES ;; ((archive-get-allocations) ((archive-register-disk) (apply db:archive-register-disk dbstruct params)) ((archive-register-block-name)(apply db:archive-register-block-name dbstruct params)) ;; ((archive-allocate-testsuite/area-to-block)(apply db:archive-allocate-testsuite/area-to-block dbstruct block-id testsuite-name areakey)) - + ;;====================================================================== ;; READ ONLY QUERIES ;;====================================================================== ;; KEYS @@ -386,10 +391,12 @@ (debug:print 4 *default-log-port* "server-id:" *server-id*) (let* ((cmd ($ 'cmd)) (paramsj ($ 'params)) (key ($ 'key)) (params (db:string->obj paramsj transport: 'http))) ;; incoming data from the POST (or is it a GET?) + (success (common:safe-vector-ref resdat 0 #f)) + (res (common:safe-vector-ref resdat 1 #f))) ;; (vector flag payload), get the payload, ignore the flag (why?) (debug:print 4 *default-log-port* "cmd:" cmd " with params " params "key " key) (if (equal? key *server-id*) (begin (set! *api-process-request-count* (+ *api-process-request-count* 1)) (let* ((resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result ) @@ -410,7 +417,9 @@ ;; res ;; (list "ERROR, not string, list, number or boolean" 1 cmd params res))))) (db:obj->string res transport: 'http))) (begin (debug:print 0 *default-log-port* "Server refused to process request. Sever id mismatch. recived " key " expected: " *server-id* ".\nOther arguments recived: cmd=" cmd " params = " params) - (db:obj->string (conc "Server refused to process request server-id mismatch: " key ", " *server-id*) transport: 'http))))) + (db:obj->string (conc "Server refused to process request server-id mismatch: " key ", " *server-id*) transport: 'http)))) + + Index: cgisetup/models/pgdb.scm ================================================================== --- cgisetup/models/pgdb.scm +++ cgisetup/models/pgdb.scm @@ -177,15 +177,15 @@ state status owner event-time comment fail-count pass-count last_update publish-time run-id area-id )) ;; given all needed info create run record ;; (define (pgdb:insert-run dbh ttype-id target run-name state status owner event-time comment fail-count pass-count area-id last-update publish-time) - (dbi:exec + (dbi:exec dbh "INSERT INTO runs (ttype_id,target,run_name,state,status,owner,event_time,comment,fail_count,pass_count,area_id,last_update,publish_time) VALUES (?,?,?,?,?,?,?,?,?,?,?,?, ?);" - ttype-id target run-name state status owner event-time comment fail-count pass-count area-id last-update publish-time)) + ttype-id target run-name state status owner event-time comment fail-count pass-count area-id last-update publish-time)) ;;====================================================================== ;; T E S T - S T E P S ;;====================================================================== Index: chicken.makefile ================================================================== --- chicken.makefile +++ chicken.makefile @@ -23,11 +23,11 @@ # CHICKEN_BIN_DIR=$(shell dirname $(shell which csi)) # if have csi on path use that, else use default # CSIPATH=$(shell which csi) # CKPATH=$(shell dirname $(shell dirname $(CSIPATH))) -sCHICKEN_PREFIX=$(or $(CKPATH),$(PREFIX)/bin/.$(ARCHSTR)) +CHICKEN_PREFIX=$(or $(CKPATH),$(PREFIX)/bin/.$(ARCHSTR)) whatever : @echo "CHICKEN_PREFIX=$(CHICKEN_PREFIX)" tgz-$(USER)/postgresql-9.6.4.tar.gz : @@ -66,10 +66,11 @@ cd tgz-$(USER)/nanomsg-1.0.0/build-$(USER); make; make install $(CHICKEN_PREFIX)/chicken-4.13.0/LICENSE : tgz-$(USER)/chicken-4.13.0.tar.gz mkdir -p build-$(USER)/eggs-installed cd build-$(USER);tar xf ../tgz-$(USER)/chicken-4.13.0.tar.gz + if [[ -e $(CHICKEN_PREFIX)/chicken-4.13.0/LICENSE ]];then touch $(CHICKEN_PREFIX)/chicken-4.13.0/LICENSE;fi tgz-$(USER)/opensrc.fossil : cd tgz-$(USER); fossil clone http://www.kiatoa.com/fossils/opensrc opensrc.fossil mkdir tgz-$(USER)/opensrc cd tgz-$(USER)/opensrc; fossil open --nested ../opensrc.fossil; fossil up; fossil uv sync @@ -101,11 +102,11 @@ cd tgz-$(USER)/ffcall; make CC="gcc -fPIC"; make install $(CHICKEN_PREFIX)/bin/sqlite3 : build-$(USER)/sqlite-autoconf-3090200/configure cd build-$(USER)/sqlite-autoconf-3090200; ./configure --prefix=$(CHICKEN_PREFIX); make; make install -$(CHICKEN_PREFIX)/bin/csi : $(CHICKEN_PREFIX)/bin/sqlite3 $(CHICKEN_PREFIX)/lib/libiupweb.so $(CHICKEN_PREFIX)/chicken-4.13.0/LICENSE +$(CHICKEN_PREFIX)/bin/csi : $(SQLITE3_DEP) $(CHICKEN_PREFIX)/lib/libiupweb.so $(CHICKEN_PREFIX)/chicken-4.13.0/LICENSE cd build-$(USER)/chicken-4.13.0;make PLATFORM=linux PREFIX=$(CHICKEN_PREFIX) cd build-$(USER)/chicken-4.13.0;make PLATFORM=linux PREFIX=$(CHICKEN_PREFIX) install ALL_CKBIN=chicken chicken-bind chicken-bug chicken-dump \ chicken-install chicken-profile chicken-sqlite3 chicken-status \ @@ -112,35 +113,56 @@ chicken-uninstall csc csi feathers nanocat sqlite3 vacuumdb logpro \ refdb CKBIN_WRAPPERS=$(addprefix $(PREFIX)/bin/,$(ALL_CKBIN)) -$(PREFIX)/bin/% : $(CHICKEN_PREFIX)/bin/% $(CHICKEN_PREFIX)/bin/csi $(EGGSTARG2) +$(PREFIX)/bin/% : $(CHICKEN_PREFIX)/bin/% $(CHICKEN_PREFIX)/bin/csi utils/mk_wrapper_tool $(PREFIX) $* $(PREFIX)/bin/$* chmod a+x $(PREFIX)/bin/$* $(PREFIX)/bin : mkdir -p $(PREFIX)/bin $(CHICKEN_PREFIX)/bin -chicken : $(PREFIX)/bin $(CHICKEN_PREFIX)/bin/csi binwrappers +# For the future - binwrappers +chicken : $(PREFIX)/bin $(CHICKEN_PREFIX)/bin/csi postgresql.done nanomsg.done iup.done canvas-draw.done sqlite3.done sql-de-lite.done dbi.done $(EGGSTARG2) @echo "Fake target to build prefix chicken" binwrappers : $(CKBIN_WRAPPERS) -postgresql.done : $(CHICKEN_PREFIX)/bin/pg_config +# make the dep a dummy if not requiring our own build of postgres +ifeq ($(BUILD_POSTGRES),yes) +PG_DEP=$(CHICKEN_PREFIX)/bin/pg_config +else +PG_DEP=$(CHICKEN_PREFIX)/bin/csi +endif + +postgresql.done : $(PG_DEP) CSC_OPTIONS="-I$(CHICKEN_PREFIX)/include -L$(CHICKEN_PREFIX)/lib -L$(CHICKEN_PREFIX)/lib64" $(CHICKEN_PREFIX)/bin/chicken-install postgresql > postgresql.done -nanomsg.done : $(CHICKEN_PREFIX)/lib/libnanomsg.so +ifeq ($(BUILD_NANOMSG),yes) +NMSG_DEP=$(CHICKEN_PREFIX)/lib/libnanomsg.so +else +NMSG_DEP=$(CHICKEN_PREFIX)/bin/csi +endif + +nanomsg.done : $(NMSG_DEP) CSC_OPTIONS="-I$(CHICKEN_PREFIX)/include -L$(CHICKEN_PREFIX)/lib -L$(CHICKEN_PREFIX)/lib64" $(CHICKEN_PREFIX)/bin/chicken-install nanomsg > nanomsg.done iup.done : $(CHICKEN_PREFIX)/lib/libcallback.a CSC_OPTIONS="-I$(CHICKEN_PREFIX)/include -L$(CHICKEN_PREFIX)/lib" $(CHICKEN_PREFIX)/bin/chicken-install -D no-library-checks -feature disable-iup-web -feature disable-iup-pplot -feature disable-iup-matrixex iup > iup.done canvas-draw.done : CSC_OPTIONS="-I$(CHICKEN_PREFIX)/include -L$(CHICKEN_PREFIX)/lib" $(CHICKEN_PREFIX)/bin/chicken-install -D no-library-checks canvas-draw > canvas-draw.done -sqlite3.done : +# make the dep a dummy if not requiring our own build of postgres +ifeq ($(BUILD_SQLITE3),yes) +SQLITE3_DEP=$(CHICKEN_PREFIX)/bin/sqlite3 +else +SQLITE3_DEP=$(CHICKEN_PREFIX)/bin/csi +endif + +sqlite3.done : $(SQLITE3_DEP) CSC_OPTIONS="-I$(CHICKEN_PREFIX)/include -L$(CHICKEN_PREFIX)/lib" $(CHICKEN_PREFIX)/bin/chicken-install sqlite3 > sqlite3.done sql-de-lite.done : CSC_OPTIONS="-I$(CHICKEN_PREFIX)/include -L$(CHICKEN_PREFIX)/lib" $(CHICKEN_PREFIX)/bin/chicken-install sql-de-lite > sql-de-lite.done Index: common.scm ================================================================== --- common.scm +++ common.scm @@ -498,10 +498,20 @@ (copy hrsfile daysfile)) #t) #f)) ;;====================================================================== +(define (common:safe-vector-ref vec indx default) + (if (vector? vec) + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "remote data issue: exn=" exn) + default) + (vector-ref vec indx)) + default)) + ;; Rotate logs, logic: ;; if > 500k and older than 1 week: ;; remove previous compressed log and compress this log ;; WARNING: This proc operates assuming that it is in the directory above the ;; logs directory you wish to log-rotate. @@ -1045,11 +1055,11 @@ ((dbr:dbstruct-read-only dbstruct) (debug:print-info 13 *default-log-port* "loading read-only watchdog") (common:readonly-watchdog dbstruct)) (else (debug:print-info 13 *default-log-port* "loading writable-watchdog.") - (let* ((syncer (or (configf:lookup *configdat* "server" "sync-method") "brute-force-sync"))) + (let* ((syncer (or (configf:lookup *configdat* "server" "sync-method") "delta-sync"))) ;; "brute-force-sync"))) (cond ((equal? syncer "brute-force-sync") (server:writable-watchdog-bruteforce dbstruct)) ((equal? syncer "delta-sync") (server:writable-watchdog-deltasync dbstruct)) @@ -2675,10 +2685,12 @@ ("mode-patt" . "-modepatt") ("run-name" . "-runname") ("contour" . "-contour") ("target" . "-target") ("test-patt" . "-testpatt") + ("rerun" . "-rerun") + ("setvars" . "-setvars") ("msg" . "-m") ("log" . "-log") ("start-dir" . "-start-dir") ("new" . "-set-state-status")))) (if (eq? flavor 'switch-symbol) @@ -2754,10 +2766,39 @@ (hash-table-for-each vars (lambda (var val) (setenv var val))) vars)) + +(define (common:propogate-mt-vars-to-subrun proc propogate-vars) + (let ((vars (make-hash-table)) + (var-patt "^MT_.*")) + (for-each + (lambda (vardat) ;; each env var + ;(for-each + ;(lambda (var-patt) + (if (string-match var-patt (car vardat)) + (let ((var (car vardat)) + (val (cdr vardat))) + (hash-table-set! vars var val) + (if (member var propogate-vars) + (begin + (print var " " (string-substitute "MT_" "PARENT_" var)) + (setenv (string-substitute "MT_" "PARENT_" var) val))) + (unsetenv var)))) +; var-patts)) + (get-environment-variables)) + (cond + ((string? proc)(system proc)) + (proc (proc))) + (hash-table-for-each + vars + (lambda (var val) + (if (member var propogate-vars) + (unsetenv (string-substitute "MT_" "PARENT_" var))) + (setenv var val))) + vars)) (define (common:run-a-command cmd #!key (with-vars #f) (with-orig-env #f)) (let* ((pre-cmd (dtests:get-pre-command)) (post-cmd (dtests:get-post-command)) (fullcmd (if (or pre-cmd post-cmd) @@ -3223,31 +3264,39 @@ ;; arm cubie01 cubie02 ;; x86_64 zeus xena myth01 ;; allhosts #{g hosts arm} #{g hosts x86_64} ;; ;; [host-types] +;; C/M/A lets megatest know this launcher provides C cores, M bytes memory for architecture A +;; 2/2G/arm smart -cores 2 -memory 2G -arch arm ;; general #MTLOWESTLOAD #{g hosts allhosts} ;; arm #MTLOWESTLOAD #{g hosts arm} ;; nbgeneral nbjob run JOBCOMMAND -log $MT_LINKTREE/$MT_TARGET/$MT_RUNNAME.$MT_TESTNAME-$MT_ITEM_PATH.lgo +;; +;; NOTE: host-rules is ONLY used for MTLOWESTLOAD ;; ;; [host-rules] ;; # maxnload => max normalized load ;; # maxnjobs => max jobs per cpu ;; # maxjobrate => max jobs per second ;; general maxnload=1.1; maxnjobs=1.2; maxjobrate=0.1 ;; ;; [launchers] ;; envsetup general -;; xor/%/n 4C16G +;; xor/%/n 2/2G/arm ;; % nbgeneral ;; ;; [jobtools] ;; # if defined and not "no" flexi-launcher will bypass "launcher" unless no match. ;; flexi-launcher yes ;; launcher nbfake +;; mode adjutant|normal (default is normal) +;; +;; +;; mode is 'normal (i.e. directly use launcher) or 'adjutant (i.e. use adjutant) ;; -(define (common:get-launcher configdat testname itempath) +(define (common:get-launcher configdat testname itempath mode) (let ((fallback-launcher (configf:lookup configdat "jobtools" "launcher"))) (if (and (configf:lookup configdat "jobtools" "flexi-launcher") ;; overrides launcher (not (equal? (configf:lookup configdat "jobtools" "flexi-launcher") "no"))) (let* ((launchers (hash-table-ref/default configdat "launchers" '()))) (if (null? launchers) @@ -3254,32 +3303,36 @@ fallback-launcher (let loop ((hed (car launchers)) (tal (cdr launchers))) (let ((patt (car hed)) (host-type (cadr hed))) - (if (tests:match patt testname itempath) + (if (tests:match patt testname itempath) ;; have a launcher match for this test (begin (debug:print-info 2 *default-log-port* "Have flexi-launcher match for " testname "/" itempath " = " host-type) - (let ((launcher (configf:lookup configdat "host-types" host-type))) + (let ((launcher (configf:lookup configdat "host-types" host-type))) ;; find the actual launcher from the host-types table + ;; if we are in adjutant mode then we want to return both host-type and launcher (if launcher (let* ((launcher-parts (string-split launcher)) (launcher-exe (car launcher-parts))) (if (equal? launcher-exe "#MTLOWESTLOAD") ;; this is our special case, we will find the lowest load and craft a nbfake commandline (let host-loop ((targ-host (common:get-least-loaded-host (cdr launcher-parts) host-type configdat)) (count 100)) (if targ-host (conc "remrun " targ-host) (if (> count 0) + (begin (debug:print 0 *default-log-port* "INFO: Waiting for a host for host-type " host-type) (thread-sleep! (- 101 count)) (host-loop (common:get-least-loaded-host (cdr launcher-parts) host-type configdat) (- count 1))) (begin (debug:print 0 *default-log-port* "FATAL: Failed to find a host from #MTLOWESTLOAD for host-type " host-type) (exit))))) - launcher)) + (case mode + ((adjutant) (list host-type launcher)) + (else launcher)))) (begin (debug:print-info 0 *default-log-port* "WARNING: no launcher found for host-type " host-type) (if (null? tal) fallback-launcher (loop (car tal)(cdr tal))))))) Index: configf.scm ================================================================== --- configf.scm +++ configf.scm @@ -787,11 +787,11 @@ ;; if (define (configf:read-alist fname) (handle-exceptions exn (begin - (debug:print 0 *default-log-port* "read of alist " fname " failed. exn=" exn) + (debug:print-info 0 *default-log-port* "unable to read alist " fname ". exn=" exn) #f) (configf:alist->config (with-input-from-file fname read)))) (define (configf:write-alist cdat fname) Index: configure ================================================================== --- configure +++ configure @@ -71,11 +71,11 @@ ARCHSTR=$(/usr/bin/sw_vers -productVersion) else ARCHSTR=$(lsb_release -sr) fi -echo "CHICKEN_PREFIX=$PREFIX/.$ARCHSTR" >> makefile.inc +echo "CKPATH=$PREFIX/.$ARCHSTR" >> makefile.inc CHICKEN_PREFIX=$PREFIX/bin/.$ARCHSTR if [[ ! $(type csi) ]];then echo "Chicken build needed." echo "BUILD_CHICKEN=yes" >> makefile.inc Index: dashboard.scm ================================================================== --- dashboard.scm +++ dashboard.scm @@ -1950,27 +1950,42 @@ ;; (set! colnum (+ colnum 1)) )))) run-ids))) (define (dashboard:tests-ht->tests-dat tests-ht) - (reverse - (sort - (hash-table-values tests-ht) - (lambda (a b) - (let ((a-test-name (db:test-get-testname a)) - (a-item-path (db:test-get-item-path a)) - (b-test-name (db:test-get-testname b)) - (b-item-path (db:test-get-item-path b)) - (a-event-time (db:test-get-event_time a)) - (b-event-time (db:test-get-event_time b))) - (if (not (equal? a-test-name b-test-name)) - (> a-event-time b-event-time) - (cond - ((< 0 (string-compare3 a-test-name b-test-name)) #t) - ((> 0 (string-compare3 a-test-name b-test-name)) #f) - ((< 0 (string-compare3 a-item-path b-item-path)) #t) - (else #f)))))))) + (let ((oldest-item (make-hash-table))) ;; + ;; populate the oldest-item table + (for-each + (lambda (tdat) + (let ((tname (db:test-get-testname tdat)) + (etime (db:test-get-event_time tdat))) + (if (hash-table-exists? oldest-item tname) + (if (< (hash-table-ref oldest-item tname) etime) + (hash-table-set! oldest-item tname etime)) + (hash-table-set! oldest-item tname etime)))) + (hash-table-values tests-ht)) + (reverse + (sort + (hash-table-values tests-ht) + (lambda (a b) + (let ((a-test-name (db:test-get-testname a)) + (a-item-path (db:test-get-item-path a)) + (b-test-name (db:test-get-testname b)) + (b-item-path (db:test-get-item-path b)) + (a-event-time (db:test-get-event_time a)) + (b-event-time (db:test-get-event_time b))) + (if (equal? a-test-name b-test-name) + (> a-event-time b-event-time) + (> (hash-table-ref oldest-item a-test-name) + (hash-table-ref oldest-item b-test-name))))))))) +;; (if (not (equal? a-test-name b-test-name)) +;; (> a-event-time b-event-time) +;; (cond +;; ((< 0 (string-compare3 a-test-name b-test-name)) #t) +;; ((> 0 (string-compare3 a-test-name b-test-name)) #f) +;; ((< 0 (string-compare3 a-item-path b-item-path)) #t) +;; (else #f))))))))) (define (dashboard:run-id->tests-mindat run-id tabdat runs-hash) (let* ((run (hash-table-ref/default runs-hash run-id #f)) (key-vals (rmt:get-key-vals run-id)) @@ -2065,10 +2080,12 @@ (iup:attribute-set! run-matrix "NUMCOL" max-col )) (let ((effective-max-row (if (< max-row max-visible) max-visible max-row))) (if (> effective-max-row (string->number (iup:attribute run-matrix "NUMLIN"))) (iup:attribute-set! run-matrix "NUMLIN" effective-max-row ))) + + (iup:attribute-set! run-matrix "WIDTHDEF" 16) ;; Row labels (for-each (lambda (ind) (let* ((name (car ind)) (num (cadr ind)) @@ -2104,20 +2121,20 @@ (iup:attribute-set! run-matrix key (cadr value)) (iup:attribute-set! run-matrix (conc "BGCOLOR" key) (car value)))))) matrix-content) ;; Col labels - do after setting Cell contents so they are accounted for in the size calc. - + (for-each (lambda (ind) (let* ((name (car ind)) (num (cadr ind)) (key (conc "0:" num))) (if (not (equal? (iup:attribute run-matrix key) name)) (begin (set! changed #t) - (iup:attribute-set! run-matrix key name) - (if (<= num max-col) + (iup:attribute-set! run-matrix key name) ;; (list->string (intersperse (string->list name) #\newline))) ;; name) + #;(if (<= num max-col) (iup:attribute-set! run-matrix "FITTOTEXT" (conc "C" num))))))) col-indices) (if (and (eq? pass-num 0) changed) (loop 1 #t)) ;; force second pass due to column labels changing Index: db.scm ================================================================== --- db.scm +++ db.scm @@ -1781,10 +1781,33 @@ #f ) (with-input-from-file infile read-lines) ))) +;; check duration against test-run.dat file if it exists and update the value in +;; the db if necessary +;; +(define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration) + (let* ((datf (conc run-dir ".mt_data/test-run.dat")) + (modt (if (and (file-exists? datf) + (file-read-access? datf)) + (file-modification-time datf) + #f)) ;; (+ event-time run-duration)))) + (alt-run-duration (if modt + (- modt event-time) + #f))) + (if (and alt-run-duration + (> alt-run-duration run-duration)) + (begin + (debug:print 0 *default-log-port* "Test " test-id " run duration mismatch. Setting to " alt-run-duration) + (db:with-db + dbstruct #f #f + (lambda (db) + (sqlite3:execute db "UPDATE tests SET run_duration=? WHERE id=?;" alt-run-duration test-id) + #t))) + #f))) ;; #f = we did NOT adjust the time + ;; select end_time-now from ;; (select testname,item_path,event_time+run_duration as ;; end_time,strftime('%s','now') as now from tests where state in ;; ('RUNNING','REMOTEHOSTSTART','LAUNCHED')); @@ -1833,37 +1856,39 @@ ;; (db:test-get-run_duration testdat))) ;; 600) ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) - (if (and (equal? uname "n/a") - (equal? item-path "")) ;; this is a toplevel test - ;; what to do with toplevel? call rollup? - (begin - (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) - (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) - (begin - (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) - (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" - test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds) - " event-time="event-time" run-duration="run-duration)))) + (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (begin + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) + (begin + (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) + (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" + test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds) + " event-time="event-time" run-duration="run-duration))))) stmth1 run-id running-deadtime) ;; default time 720 seconds - + (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) - (if (and (equal? uname "n/a") - (equal? item-path "")) ;; this is a toplevel test - ;; what to do with toplevel? call rollup? - (begin - (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) - (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) - (begin - (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id - " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time - " run-duration="run-duration) - (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))) + (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)) + (if (and (equal? uname "n/a") + (equal? item-path "")) ;; this is a toplevel test + ;; what to do with toplevel? call rollup? + (begin + (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) + (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) + (begin + (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id + " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time + " run-duration="run-duration) + (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))) stmth2 run-id remotehoststart-deadtime) ;; default time 230 seconds ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config ;; @@ -2147,14 +2172,66 @@ (db-exists (common:file-exists? dbname)) (db (sqlite3:open-database dbname))) (sqlite3:set-busy-handler! db (sqlite3:make-busy-timeout 136000)) (if (not db-exists) (begin - (sqlite3:execute db "PRAGMA synchronous = 0;") (sqlite3:execute db "CREATE TABLE IF NOT EXISTS no_sync_metadat (var TEXT,val TEXT, CONSTRAINT no_sync_metadat_constraint UNIQUE (var));") (sqlite3:execute db "PRAGMA journal_mode=WAL;"))) + ;; MOVE THIS TABLE CREATION TO THE (begin above in about six months (it is Sep 2020 right now). + (sqlite3:execute db "CREATE TABLE IF NOT EXISTS jobs_queue (id INTEGER PRIMARY KEY, host_type TEXT, cores INTEGER, memory TEXT, vars TEXT, exekey TEXT, cmdline TEXT, state TEXT, event_time INTEGER, last_update INTEGER);") + ;; not sure I'll use this next one. I prefer if tests simply append to a file: + ;; last-update-seconds cpuload tmpspace rundirspace + (sqlite3:execute db "CREATE TABLE IF NOT EXISTS test_extra_data (id INTEGER PRIMARY KEY, run_id INTEGER, test_id INTEGER, last_seen_running INTEGER);") + (sqlite3:execute db "PRAGMA synchronous = 0;") db)) + +(define (db:no-sync-add-job db-in host-type vars-list exekey cmdline) + (sqlite3:execute (db:no-sync-db db-in) "INSERT INTO jobs_queue (host_type,vars,exekey,cmdline,state,event_time,last_update) VALUES (?,?,?,?,?,?,?);" + host-type + (with-output-to-string + (lambda () + (write vars-list))) + exekey cmdline "waiting" (current-seconds)(current-seconds))) + +;; find next job (waiting longest) that matches host-type - future, we'll find jobs that fit if no exact match +(define (db:no-sync-take-job db-in host-type) + (let* ((db (db:no-sync-db db-in)) + (stmt1 "SELECT id,host_type,vars,exekey,cmdline,state,event_time,last_update FROM jobs_queue WHERE host_type=? AND state != 'taken' ORDER BY event_time ASC;") + (stmt1h (sqlite3:prepare db stmt1)) + (stmt2 "UPDATE jobs_queue SET state='taken',last_update=? WHERE id=?;") + (stmt2h (sqlite3:prepare db stmt2)) + (res (sqlite3:with-transaction + db + (lambda () + (let* ((matching-jobs (sqlite3:fold-row + (lambda (res . row) ;; id host-type vars exekey state event-time last-update) + (cons row res)) + '() + stmt1h + host-type))) + (if (null? matching-jobs) + #f + (let ((choosen-one (let loop ((tal matching-jobs) + (res #f)) ;; put bestest one in here + (if (null? tal) + res + (let ((curr (car tal)) + (rem (cdr tal))) + curr) ;; here we will compare with res, if better candidate the loop with curr else loop with res + )))) + (if choosen-one ;; we need to mark it as taken + (sqlite3:execute stmt2h (current-seconds) (car choosen-one))) + choosen-one))))))) + (sqlite3:finalize! stmt1h) ;; it'd be nice to cache these and finalize on exit. + (sqlite3:finalize! stmt2h) + res)) + +;; clean out old jobs in queue, i.e. taken and event_time > 24 hrs ago +;; +(define (db:no-sync-job-records-clean db) + (sqlite3:execute (db:no-sync-db db) "DELETE FROM jobs_queue WHERE state='taken' AND event_time < ?;" (- (current-seconds)(* 24 3600)))) + ;; if we are not a server create a db handle. this is not finalized ;; so watch for problems. I'm still not clear if it is needed to manually ;; finalize sqlite3 dbs with the sqlite3 egg. ;; @@ -2172,12 +2249,13 @@ (sqlite3:execute (db:no-sync-db db) "INSERT OR REPLACE INTO no_sync_metadat (var,val) VALUES (?,?);" var val)) (define (db:no-sync-del! db var) (sqlite3:execute (db:no-sync-db db) "DELETE FROM no_sync_metadat WHERE var=?;" var)) -(define (db:no-sync-get/default db var default) - (let ((res default)) +(define (db:no-sync-get/default db-in var default) + (let ((db (db:no-sync-db db-in)) + (res default)) (sqlite3:for-each-row (lambda (val) (set! res val)) (db:no-sync-db db) "SELECT val FROM no_sync_metadat WHERE var=?;" @@ -3220,21 +3298,19 @@ test-id)))))) (mt:process-triggers dbstruct run-id test-id newstate newstatus)) ;; NEW BEHAVIOR: Count tests running in all runs! ;; -(define (db:get-count-tests-running dbstruct run-id) ;; fastmode) - (let* ((qry ;; (if fastmode - ;; "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '') LIMIT 1;" - "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '');")) ;; ) - (db:with-db - dbstruct - run-id - #f - (lambda (db) - (let* ((stmth (db:get-cache-stmth dbstruct db qry))) - (sqlite3:first-result stmth)))))) +(define (db:get-count-tests-running dbstruct run-id) + (let* ((qry "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '');")) + (db:with-db + dbstruct + run-id + #f + (lambda (db) + (let* ((stmth (db:get-cache-stmth dbstruct db qry))) + (sqlite3:first-result stmth)))))) ;; NEW BEHAVIOR: Count tests running in only one run! ;; (define (db:get-count-tests-actually-running dbstruct run-id) (db:with-db @@ -3250,21 +3326,23 @@ run-id)))) ;; NOT IN (SELECT id FROM runs WHERE state='deleted');") ;; NEW BEHAVIOR: Look only at single run with run-id ;; ;; (define (db:get-running-stats dbstruct run-id) -(define (db:get-count-tests-running-for-run-id dbstruct run-id) ;; fastmode) - (let* ((qry ;; (if fastmode - ;; "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? LIMIT 1;" - "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;")) ;; ) +(define (db:get-count-tests-running-for-run-id dbstruct run-id) + (let* ((qry "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;")) (db:with-db dbstruct run-id #f (lambda (db) (let* ((stmth (db:get-cache-stmth dbstruct db qry))) - (sqlite3:first-result stmth run-id)))))) + (sqlite3:fold-row + (lambda (res val) val) + 0 stmth run-id)))))) + +;; (sqlite3:first-result stmth run-id)))))) ;; For a given testname how many items are running? Used to determine ;; probability for regenerating html ;; (define (db:get-count-tests-running-for-testname dbstruct run-id testname) @@ -3273,23 +3351,23 @@ run-id #f (lambda (db) (let* ((stmt "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? AND NOT (uname = 'n/a' AND item_path = '') AND testname=?;") (stmth (db:get-cache-stmth dbstruct db stmt))) - (sqlite3:first-result - stmth run-id testname))))) + (sqlite3:fold-row + (lambda (res val) val) 0 stmth run-id testname))))) (define (db:get-not-completed-cnt dbstruct run-id) -(db:with-db + (db:with-db dbstruct run-id #f (lambda (db) - ;(print "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=" run-id) - (sqlite3:first-result - db - "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=?;" run-id)))) + (let* ((stmt "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=?;")) + (sqlite3:fold-row + (lambda (res val) val) + 0 (db:get-cache-stmth dbstruct db stmt) run-id))))) (define (db:get-count-tests-running-in-jobgroup dbstruct run-id jobgroup) (if (not jobgroup) 0 ;; (let ((testnames '())) @@ -3472,11 +3550,11 @@ (lambda (run-id) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))) (db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs))) run-ids))) -;; Get test data using test_id, run-id is not used +;; Get test data using test_id, run-id is not used - but it will be! ;; (define (db:get-test-info-by-id dbstruct run-id test-id) (db:with-db dbstruct #f ;; run-id Index: dcommon.scm ================================================================== --- dcommon.scm +++ dcommon.scm @@ -953,14 +953,13 @@ (+ yoffset (* y scalef))) ;; sizex, sizey - canvas size ;; originx, originy - canvas origin ;; -(define (dcommon:initial-draw-tests cnv xadj yadj sizex sizey sizexmm sizeymm originx originy tests-draw-state sorted-testnames test-records) - (let* ((dot-data ;; (map cdr (filter - ;; (lambda (x)(equal? "node" (car x))) - (map string-split (tests:lazy-dot test-records "plain" sizex sizey))) ;; (tests:easy-dot test-records "plain"))) +(define (dcommon:initial-draw-tests cnv xadj yadj sizex sizey sizexmm sizeymm originx originy + tests-draw-state sorted-testnames test-records) + (let* ((dot-data (tests:lazy-dot test-records "plain" sizex sizey 'munged)) (xoffset (dcommon:get-xoffset tests-draw-state sizex xadj)) (yoffset (dcommon:get-yoffset tests-draw-state sizey yadj)) (no-dot (configf:lookup *configdat* "setup" "nodot")) (boxh 15) (boxw 10) Index: docs/manual/megatest_manual.html ================================================================== --- docs/manual/megatest_manual.html +++ docs/manual/megatest_manual.html @@ -2671,10 +2671,25 @@
[setup]
 # this will automatically kill the test if it runs for more than 1h 2m and 3s
 runtimelim 1h 2m 3s
+
+
Post Run Hook
+

This runs script to-run.sh after all tests have been completed. It is +not necessary to use -run-wait as each test will check for other +running tests on completion and if there are none it will call the +post run hook.

+

Note that the output from the script call will be placed in a log file +in the logs directory with a file name derived by replacing / with _ +in post-hook-<target>-<runname>.log.

+
+
+
[runs]
+post-hook /path/to/script/to-run.sh
+
+

Tests browser view

The tests browser (see the Run Control tab on the dashboard) has two views for displaying the tests.

@@ -3247,10 +3262,21 @@

Ezsteps

+

Ezsteps is the recommended way to implement tests and automation in +Megatest.

+
+ + + +
+Note +Each ezstep must be a single line. Use the [scripts] mechanism +to create multiline scripts (see example below).
+
Example ezsteps with logpro rules
[ezsteps]
 lookittmp   ls /tmp
@@ -3258,20 +3284,76 @@
 [logpro]
 lookittmp ;; Note: config file format supports multi-line entries where leading whitespace is removed from each line
   ;;     a blank line indicates the end of the block of text
   (expect:required in "LogFileBody" > 0 "A file name that should never exist!" #/This is a awfully stupid file name that should never be found in the temp dir/)
-

To transfer the environment to the next step you can do the following:

+
+

Automatic environment propagation with Ezsteps

+

Turn on ezpropvars and environment variables will be propagated from +step to step. Use this to source script files that modify the +envionment where the modifications are needed in subsequent steps.

+
+ + + +
+Note +aliases and variables with strange whitespace or characters will +not propagate correctly. Put in a ticket on the +http://www.kiatoa.com/fossils/megatest site if you need support for a +specific strange character combination.
+
+
+
Turn on auto propagate for bash
+
+
[setup]
+ezpropvars sh
+
+
+
Write your ezsteps. The loadenv.csh step will use /bin/csh as its shell, other steps will use bash.
+
+
[ezsteps]
+loadenv.csh source $REF/ourenviron.csh
+compile make
+install make install
+
+

Bash and csh are supported. You can override the shell binary location +from the default /bin/bash and /bin/csh if needed.

+
+
Turn on auto propagate for csh
+
+
[setup]
+ezpropvars csh /bin/csh
+
+
+
Example of auto propagation using extensions
+
+
[ezsteps]
+step1.sh export SOMEVAR=$(ps -def | wc -l);ls /tmp
+# The next step will get the value of $SOMEVAR from step1.sh
+step2.sh echo $SOMEVAR
+
# if your upstream file is csh you can force csh like this # if your upstream is bash loadenv source $REF/ourenviron.sh
-
Propagate environment to next step
+
Example of multi-line script
-
$MT_MEGATEST -env2file .ezsteps/${stepname}
+
[scripts]
+tarresults tar cfvz $DEST/srcdir1.tar.gz srcdir1
+  tar cfvz $DEST/srcdir2.tar.gz srcdir2
+
+[setup]
+ezpropvars sh
+
+[ezsteps]
+step1 DEST=/tmp/targz;source tarresults
+

The above example will result in files; tarresults and ez_step1 being +created in the test dir.

+

Scripts

Specifying scripts inline (best used for only simple scripts)
@@ -3739,10 +3821,10 @@

Index: docs/manual/reference.txt ================================================================== --- docs/manual/reference.txt +++ docs/manual/reference.txt @@ -724,10 +724,16 @@ --------------------------- Ezsteps ~~~~~~~ +Ezsteps is the recommended way to implement tests and automation in +Megatest. + +NOTE: Each ezstep must be a single line. Use the [scripts] mechanism +to create multiline scripts (see example below). + .Example ezsteps with logpro rules ----------------- [ezsteps] lookittmp ls /tmp @@ -736,20 +742,72 @@ ;; a blank line indicates the end of the block of text (expect:required in "LogFileBody" > 0 "A file name that should never exist!" #/This is a awfully stupid file name that should never be found in the temp dir/) ----------------- -To transfer the environment to the next step you can do the following: +Automatic environment propagation with Ezsteps +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # if your upstream file is csh you can force csh like this # if your upstream is bash loadenv source $REF/ourenviron.sh -.Propagate environment to next step ----------------------------- -$MT_MEGATEST -env2file .ezsteps/${stepname} ----------------------------- +Turn on ezpropvars and environment variables will be propagated from +step to step. Use this to source script files that modify the +envionment where the modifications are needed in subsequent steps. + +NOTE: aliases and variables with strange whitespace or characters will +not propagate correctly. Put in a ticket on the +http://www.kiatoa.com/fossils/megatest site if you need support for a +specific strange character combination. + +.Turn on auto propagate for bash +--------------------------- +[setup] +ezpropvars sh +--------------------------- + +.Write your ezsteps. The loadenv.csh step will use /bin/csh as its shell, other steps will use bash. +--------------------------- +[ezsteps] +loadenv.csh source $REF/ourenviron.csh +compile make +install make install +--------------------------- + +Bash and csh are supported. You can override the shell binary location +from the default /bin/bash and /bin/csh if needed. + +.Turn on auto propagate for csh +--------------------------- +[setup] +ezpropvars csh /bin/csh +--------------------------- + +.Example of auto propagation using extensions +--------------------------- +[ezsteps] +step1.sh export SOMEVAR=$(ps -def | wc -l);ls /tmp +# The next step will get the value of $SOMEVAR from step1.sh +step2.sh echo $SOMEVAR +--------------------------- + +.Example of multi-line script +--------------------------- +[scripts] +tarresults tar cfvz $DEST/srcdir1.tar.gz srcdir1 + tar cfvz $DEST/srcdir2.tar.gz srcdir2 + +[setup] +ezpropvars sh + +[ezsteps] +step1 DEST=/tmp/targz;source tarresults +--------------------------- + +The above example will result in files; tarresults and ez_step1 being +created in the test dir. Scripts ~~~~~~~ .Specifying scripts inline (best used for only simple scripts) Index: env.scm ================================================================== --- env.scm +++ env.scm @@ -21,20 +21,22 @@ (declare (unit env)) (use sql-de-lite) ;; srfi-1 posix regex regex-case srfi-69 srfi-18 call-with-environment-variables) (define (env:open-db fname) - (let* ((db-exists (common:file-exists? fname)) + (let* ((db-exists (if (equal? fname ":memory:") + #f + (common:file-exists? fname))) (db (open-database fname))) (if (not db-exists) (begin - (exec (sql db "CREATE TABLE envvars ( - id INTEGER PRIMARY KEY, - context TEXT NOT NULL, - var TEXT NOT NULL, - val TEXT NOT NULL, - CONSTRAINT envvars_constraint UNIQUE (context,var))")))) + (exec (sql db "CREATE TABLE IF NOT EXISTS envvars ( + id INTEGER PRIMARY KEY, + context TEXT NOT NULL, + var TEXT NOT NULL, + val TEXT NOT NULL, + CONSTRAINT envvars_constraint UNIQUE (context,var))")))) (set-busy-handler! db (busy-timeout 10000)) db)) ;; save vars in given context, this is NOT incremental by default ;; @@ -77,10 +79,33 @@ val))))) (sql db "SELECT var,val FROM envvars WHERE context=?") context)) contexts) result)) + +;; envdelta: a-b (start=a, end=b, get the delta) +;; ofile: #f = write to stdout, else write to file with string name +;; +(define (env:envdelta db envdelta ofile) + (let ((match (string-split envdelta "-")));; (string-match "([a-z0-9_]+)=([a-z0-9_\\-,]+)" envdelta))) + (if (not (null? match)) + (let* ((parts match) ;; (string-split equn "-")) + (minuend (car parts)) + (subtraend (cadr parts)) + (added (env:get-added db minuend subtraend)) + (removed (env:get-removed db minuend subtraend)) + (changed (env:get-changed db minuend subtraend))) + ;; (pp (hash-table->alist added)) + ;; (pp (hash-table->alist removed)) + ;; (pp (hash-table->alist changed)) + (if (args:get-arg "-o") + (with-output-to-file + (args:get-arg "-o") + (lambda () + (env:print added removed changed))) + (env:print added removed changed))) + #f))) ;; get list of removed variables between two contexts ;; (define (env:get-removed db contexta contextb) (let ((result (make-hash-table))) Index: ezsteps.scm ================================================================== --- ezsteps.scm +++ ezsteps.scm @@ -34,21 +34,82 @@ (include "key_records.scm") (include "db_records.scm") (include "run_records.scm") -;;(rmt:get-test-info-by-id run-id test-id) -> testdat +(define (ezsteps:step-name->mode stepname) + (match (string-search "\\.([^\\.]+)$" stepname) + ((_ ext) (string->symbol ext)) + (else #f))) + +(define (ezsteps:create-step-script envdbf stepname prevstepname mode cmd shellexe) + (let* (#;(shebang (case mode + ((sh) "/bin/sh") + ((csh) "/bin/csh") + (else "/bin/bash"))) + (sourcef (conc ".ezsteps/vars_" prevstepname "." mode)) + (scriptn (conc "ez_" stepname))) ;; remember the name already has an extension .sh, .csh etc. + (with-output-to-file scriptn + (lambda () + ;; the shebang line + (print "#!" shellexe) + ;; save the env at start + (print "megatest -envcap "stepname"_start "envdbf) + ;; source vars from previous steps + (if (file-exists? sourcef) + (print "source " sourcef)) + ;; run the command + (print cmd) + (if (eq? mode 'csh) + (print "set ecode=$?") + (print "ecode=$?")) + ;; save the env at end + (print "megatest -envcap "stepname"_end "envdbf) + ;; write the delta + (print "megatest -envdelta "stepname"_start-"stepname"_end -dumpmode bash -o .ezsteps/vars_"stepname".sh "envdbf) + (print "megatest -envdelta "stepname"_start-"stepname"_end -dumpmode csh -o .ezsteps/vars_"stepname".csh "envdbf) + (print "exit $ecode"))) + (system (conc "chmod a+x " scriptn)))) + +(define (ezsteps:get-ezpropvars res) ;; testconfig) + ;; (let* ((res (configf:lookup testconfig "setup" "ezpropvars"))) + (if (string? res) + (let* ((dat (string-split res))) + (match dat + ((s shellexe) + (let ((shl (string->symbol s))) + `(,shl . ,shellexe))) + ((s) + (let* ((shl (string->symbol s)) + (shellexe (if (eq? shl 'csh) "/bin/csh" "/bin/bash"))) + `(,shl . ,shellexe))) + (else #f))) + #f)) -;; TODO: deprecate me in favor of ezsteps.scm +;; NOTE: returns logpro-used? ;; -(define (launch:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat) +(define (launch:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat prevstepname envdbf) (let* ((stepname (car ezstep)) ;; do stuff to run the step + (stepmode-n (ezsteps:step-name->mode stepname)) (stepinfo (cadr ezstep)) - ;; (let ((info (cadr ezstep))) - ;; (if (proc? info) "" info))) - ;; (stepproc (let ((info (cadr ezstep))) - ;; (if (proc? info) info #f))) + (shellmode (ezsteps:get-ezpropvars (configf:lookup testconfig "setup" "ezpropvars"))) ;; returns '(csh|sh . "/path/to/shell") + (stepmode (if stepmode-n ;; the .sh or .csh always wins + stepmode-n + (if shellmode + (car shellmode) + #f))) + (shellexe (if stepmode-n + (case stepmode + ((csh) "/bin/csh") + (else "/bin/bash")) + (if shellmode + (cdr shellmode) + "/bin/bash"))) + ;; (let ((info (cadr ezstep))) + ;; (if (proc? info) "" info))) + ;; (stepproc (let ((info (cadr ezstep))) + ;; (if (proc? info) info #f))) (stepparts (string-match (regexp "^(\\{([^\\}\\{]*)\\}\\s*|)(.*)$") stepinfo)) (stepparams (if (and (list? stepparts) (> (length stepparts) 1)) (list-ref stepparts 2) #f)) ;; for future use, {VAR=1,2,3}, run step for each @@ -80,13 +141,16 @@ ";;") (print tconfig-logpro))) (set! logpro-used #t))) ;; NB// can safely assume we are in test-area directory - (debug:print 4 *default-log-port* "ezsteps:\n stepname: " stepname " stepinfo: " stepinfo " stepparts: " stepparts + (debug:print 4 *default-log-port* "ezsteps:\n stepname: " stepname " stepinfo: " stepinfo + " stepparts: " stepparts " stepparams: " stepparams " stepcmd: " stepcmd) - + + (if stepmode (ezsteps:create-step-script envdbf stepname prevstepname stepmode stepcmd shellexe)) + ;; ;; first source the previous environment ;; (let ((prev-env (conc ".ezsteps/" prevstep (if (string-search (regexp "csh") ;; (get-environment-variable "SHELL")) ".csh" ".sh")))) ;; (if (and prevstep (common:file-exists? prev-env)) ;; (set! script (conc script "source " prev-env)))) @@ -98,18 +162,21 @@ (rmt:teststep-set-status! run-id test-id stepname "start" "-" #f #f) ;; now launch the actual process (call-with-environment-variables (list (cons "PATH" (conc (get-environment-variable "PATH") ":."))) (lambda () ;; (process-run "/bin/bash" "-c" "exec ls -l /tmp/foobar > /tmp/delme-more.log 2>&1") - (let* ((cmd (conc stepcmd " > " stepname ".log 2>&1")) ;; >outfile 2>&1 + (let* ((cmd (conc (if stepmode + (conc "ez_" stepname) + stepcmd) + " > " stepname ".log 2>&1")) ;; >outfile 2>&1 (pid #f)) (let ((proc (lambda () (set! pid (process-run "/bin/bash" (list "-c" cmd)))))) (if subrun (begin (debug:print-info 0 *default-log-port* "Running "cmd" without MT_.* environment variables.") - (common:without-vars proc "^MT_.*")) + (common:propogate-mt-vars-to-subrun proc '("MT_TARGET" "MT_LINKTREE" "MT_RUNNAME"))) (proc))) (with-output-to-file "Makefile.ezsteps" (lambda () (print stepname ".log :") Index: launch.scm ================================================================== --- launch.scm +++ launch.scm @@ -168,40 +168,44 @@ (append (or ezstepslst '()) (list (list "subrun" (conc "{subrun=true} " mt-cmd))))))) ;; process the ezsteps (if ezsteps - (let* ((all-steps-dat (make-hash-table))) ;; keep all the info around as stepname ==> alist; where 'params is the params list (add other stuff as needed) + (let* ((envdbf (conc "/tmp/."(current-user-name)"-"(current-process-id)"-"run-id"-"test-id".db")) + (all-steps-dat (make-hash-table))) ;; keep all the info around as stepname ==> alist; + ;;; where 'params is the params list (add other + ;;; stuff as needed) (if (not (common:file-exists? ".ezsteps"))(create-directory ".ezsteps")) ;; if ezsteps was defined then we are sure to have at least one step but check anyway (if (not (> (length ezstepslst) 0)) (debug:print-error 0 *default-log-port* "ezsteps defined but ezstepslst is zero length") (let ((all-step-names (map car ezstepslst)) (status-file (file-open "ezsteps.status" (+ open/append open/wronly open/creat))) ) (setenv "MT_STEP_NAMES" (string-intersperse all-step-names " ")) (let loop ((ezstep (car ezstepslst)) - (tal (cdr ezstepslst)) - (prevstep #f)) - (debug:print-info 0 *default-log-port* "Processing ezstep \"" (string-intersperse ezstep " ") "\"") - ;; check exit-info (vector-ref exit-info 1) - (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1) - (let* ((logpro-used (launch:runstep ezstep run-id test-id exit-info m tal testconfig all-steps-dat)) - (stepname (car ezstep)) - (stepparms (hash-table-ref all-steps-dat stepname))) - (setenv "MT_STEP_NAME" stepname) - (pp (hash-table->alist all-steps-dat)) - ;; if logpro-used read in the stepname.dat file - (if (and logpro-used (common:file-exists? (conc stepname ".dat"))) - (launch:load-logpro-dat run-id test-id stepname)) + (tal (cdr ezstepslst)) + (prevstep #f)) + (debug:print-info 0 *default-log-port* "Processing ezstep \"" (string-intersperse ezstep " ") "\"") + ;; check exit-info (vector-ref exit-info 1) + (if (launch:einf-exit-status exit-info) ;; (vector-ref exit-info 1) + (let* ((logpro-used (launch:runstep ezstep run-id test-id exit-info m + tal testconfig all-steps-dat prevstep envdbf)) + (stepname (car ezstep)) + (stepparms (hash-table-ref all-steps-dat stepname))) + (setenv "MT_STEP_NAME" stepname) + (pp (hash-table->alist all-steps-dat)) + ;; if logpro-used read in the stepname.dat file + (if (and logpro-used (common:file-exists? (conc stepname ".dat"))) + (launch:load-logpro-dat run-id test-id stepname)) (file-write status-file (conc stepname " " (launch:einf-exit-code exit-info) "\n")) - (if (steprun-good? logpro-used (launch:einf-exit-code exit-info) stepparms) - (if (not (null? tal)) - (loop (car tal) (cdr tal) stepname)) - (debug:print 0 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping"))) + (if (steprun-good? logpro-used (launch:einf-exit-code exit-info) stepparms) + (if (not (null? tal)) + (loop (car tal) (cdr tal) stepname)) + (debug:print 0 *default-log-port* "WARNING: step " (car ezstep) " failed. Stopping"))) (debug:print 0 *default-log-port* "WARNING: a prior step failed, stopping at " ezstep))) (file-close status-file) ) )))))) @@ -216,17 +220,19 @@ (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) - (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10) + (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync)) + ;; top of loop encountered at "(current-seconds)" with + ;; last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load @@ -253,25 +259,20 @@ (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) - (when do-sync - ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append) - ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes))))) - ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds))) - (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f) - ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))) - ) - + (if do-sync ;; save meta data about the running of this test + (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) + ) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this @@ -324,11 +325,14 @@ (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) - (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional + (tests:update-central-meta-info run-id test-id + (get-cpu-load) + (get-df (current-directory)) + (calc-minutes) #f #f update-db: #t)) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) @@ -410,11 +414,10 @@ ;; one more time, change to the work-area directory (change-directory work-area))) ) ;; let* (if contour (setenv "MT_CONTOUR" contour)) - ;; immediated set some key variables from CMDINFO data, yes, these will be set again below ... ;; (setenv "MT_TESTSUITENAME" areaname) (setenv "MT_RUN_AREA_HOME" top-path) (set! *toppath* top-path) @@ -477,10 +480,13 @@ (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.") (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. + ;;((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun + ;; (and (equal? (db:test-get-state test-info) "COMPLETED") ;; completed/abort => rerun if asked + ;; (member (db:test-get-status test-info) '("ABORT")))) ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) @@ -809,11 +815,13 @@ (loop (car tal) (cdr tal))))))))))) ;; replaced below with version that does not ssh if checking on localhost #;(define (launch:is-test-alive host pid) (if (and host pid (not (equal? host "n/a"))) - (let* ((cmd (conc "ssh " host " pstree -A " pid)) + (let* ((is-local (equal? host (get-host-name))) + (ssh-cmd (if is-local " " (conc "ssh " host " "))) + (cmd (conc ssh-cmd "pstree -A " pid)) (output (with-input-from-pipe cmd read-lines))) (debug:print 2 *default-log-port* "Running " cmd " received " output) (if (eq? (length output) 0) #f #t)) @@ -1419,12 +1427,20 @@ (debug:print 1 *default-log-port* "INFO: search and mark zombie tests") (rmt:set-var key (current-seconds)) (rmt:find-and-mark-incomplete run-id #f)))) - +(defstruct launch:ajt + (vars '()) + (exekey #f) + (host-type #f) + (test-sig #f) + (cmdline #f)) +;; append vars +(define (launch:ajt-add-vars dat vars) + (launch:ajt-vars-set! dat (append (launch:ajt-vars dat) vars))) ;; 1. look though disks list for disk with most space ;; 2. create run dir on disk, path name is meaningful ;; 3. create link from run dir to megatest runs area ;; 4. remotely run the test on allocated host @@ -1431,44 +1447,41 @@ ;; - could be ssh to host from hosts table (update regularly with load) ;; - could be netbatch ;; (launch-test db (cadr status) test-conf)) (define (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat params) (mutex-lock! *launch-setup-mutex*) ;; setting variables and processing the testconfig is NOT thread-safe, reuse the launch-setup mutex - (let* ( ;; (lock-key (conc "test-" test-id)) - ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) - ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds - ;; (if (car lock) - ;; #t - ;; (if (> (current-seconds) expire-time) - ;; (begin - ;; (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path) - ;; (rmt:no-sync-del! lock-key) ;; destroy the lock - ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; - ;; (begin - ;; (thread-sleep! 1) - ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)))))) + (let* (;; locking code removed from here commented out and pasted at end of file (item-path (item-list->path itemdat)) - (contour #f)) ;; NOT READY FOR THIS (args:get-arg "-contour"))) + (contour #f) ;; NOT READY FOR THIS (args:get-arg "-contour"))) + ;; launcher-mode will be 'adjutant or 'normal + (launcher-mode (string->symbol (or (configf:lookup *configdat* "jobtools" "mode") "normal"))) + (ajtdat (make-launch:ajt))) (let loop ((delta (- (current-seconds) *last-launch*)) (launch-delay (configf:lookup-number *configdat* "setup" "launch-delay" default: 0))) (if (> launch-delay delta) (begin (if (common:low-noise-print 1200 "test launch delay") ;; every two hours or so remind the user about launch delay. (debug:print-info 0 *default-log-port* "NOTE: test launches are delayed by " launch-delay " seconds. See megatest.config launch-delay setting to adjust.")) ;; launch of " test-name " for " (- launch-delay delta) " seconds")) (thread-sleep! (- launch-delay delta)) (loop (- (current-seconds) *last-launch*) launch-delay)))) (change-directory *toppath*) - (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute", *maybe* - the longer they are set the longer each launch takes (must be non-overlapping with the vars) - (append - (list - (list "MT_RUN_AREA_HOME" *toppath*) - (list "MT_TEST_NAME" test-name) - (list "MT_RUNNAME" runname) - (list "MT_ITEMPATH" item-path) - (list "MT_CONTOUR" contour) - ) - itemdat)) + (let ((var-list (append + (list + (list "MT_RUN_AREA_HOME" *toppath*) + (list "MT_TEST_NAME" test-name) + (list "MT_RUNNAME" runname) + (list "MT_ITEMPATH" item-path) + (list "MT_CONTOUR" contour) + ) + itemdat))) + ;; consolidate this code with the code in megatest.scm for + ;; "-execute", *maybe* - the longer they are set the longer + ;; each launch takes (must be non-overlapping with the vars) + (alist->env-vars var-list) + ;; the var-list into the ajtdat adjutant record whether it is needed or not. + (launch:ajt-add-vars ajtdat var-list)) + (let* ((tregistry (tests:get-all)) ;; third param (below) is system-allowed ;; for tconfig, why do we allow fallback to test-conf? (tconfig (or (tests:get-testconfig test-name item-path tregistry #t force-create: #t) (begin (debug:print 0 *default-log-port* "WARNING: falling back to pre-calculated testconfig. This is likely not desired.") @@ -1486,26 +1499,17 @@ ;; (memory (configf:lookup tconfig "requirements" "memory")) ;; (hosts (configf:lookup *configdat* "jobtools" "workhosts")) ;; I'm pretty sure this was never completed (remote-megatest (configf:lookup *configdat* "setup" "executable")) (run-time-limit (or (configf:lookup tconfig "requirements" "runtimelim") (configf:lookup *configdat* "setup" "runtimelim"))) - ;; FIXME SOMEDAY: not good how this is so obtuse, this hack is to - ;; allow running from dashboard. Extract the path - ;; from the called megatest and convert dashboard - ;; or dboard to megatest (local-megatest (common:find-local-megatest)) - #;(local-megatest (let* ((lm (car (argv))) - (dir (pathname-directory lm)) - (exe (pathname-strip-directory lm))) - (conc (if dir (conc dir "/") "") - (case (string->symbol exe) - ((dboard) "../megatest") - ((mtest) "../megatest") - ((dashboard) "megatest") - (else exe))))) - (launcher (common:get-launcher *configdat* test-name item-path)) ;; (configf:lookup *configdat* "jobtools" "launcher")) - (test-sig (conc (common:get-testsuite-name) ":" test-name ":" item-path)) ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path + (launcher (let ((l (common:get-launcher *configdat* test-name item-path launcher-mode))) + (if (string? l) + (string-split l) + l))) ;; some nonhomogenuity here. '(cmd param1 param2 ...) OR '(host-type launcher) + ;; (item-list->path itemdat))) ;; test-path is the full path including the item-path + (test-sig (conc (common:get-testsuite-name) ":" test-name ":" item-path)) (work-area #f) (toptest-work-area #f) ;; for iterated tests the top test contains data relevant for all (diskpath #f) (cmdparms #f) (fullcmd #f) ;; (define a (with-output-to-string (lambda ()(write x)))) @@ -1515,15 +1519,24 @@ (debug-param (append (if (args:get-arg "-debug") (list "-debug" (args:get-arg "-debug")) '()) (if (args:get-arg "-logging")(list "-logging") '()) (if (configf:lookup *configdat* "misc" "profilesw") (list (configf:lookup *configdat* "misc" "profilesw")) '())))) + ;; save the test-sig in the ajtdat record + (launch:ajt-test-sig-set! ajtdat test-sig) + ;; go ahead and figure out if we have a host-type from the + ;; launcher call above and save it in the ajtdat record + (if (and (eq? launcher-mode 'adjutant) + (list? launcher) + (> (length launcher) 1)) + (launch:ajt-host-type-set! ajtdat (car launcher))) + ;; (if hosts (set! hosts (string-split hosts))) ;; set the megatest to be called on the remote host (if (not remote-megatest)(set! remote-megatest local-megatest)) ;; "megatest")) (set! mt-bindir-path (pathname-directory remote-megatest)) - (if launcher (set! launcher (string-split launcher))) + ;; (if launcher (set! launcher (string-split launcher))) ;; yuk! ;; set up the run work area for this test (if (and (args:get-arg "-preclean") ;; user has requested to preclean for this run (not (member (db:test-get-rundir testinfo)(list "n/a" "/tmp/badname")))) ;; n/a is a placeholder and thus not a read dir (begin (debug:print-info 0 *default-log-port* "attempting to preclean directory " (db:test-get-rundir testinfo) " for test " test-name "/" item-path) @@ -1579,64 +1592,93 @@ (list 'set-vars (if params (hash-table-ref/default params "-setvars" #f))) (list 'runname runname) (list 'mt-bindir-path mt-bindir-path)))))))) (setenv "MT_CMDINFO" cmdparms) ;; setting this for use in nblauncher + + ;; save the cmdparms in the ajtdat + (launch:ajt-exekey-set! ajtdat cmdparms) ;; clean out step records from previous run if they exist ;; (rmt:delete-test-step-records run-id test-id) ;; if the dir does not exist we may have a itempath where individual variables are a path, launch anyway (if (common:file-exists? work-area) (change-directory work-area)) ;; so that log files from the launch process don't clutter the test dir - (cond - ;; ((and launcher hosts) ;; must be using ssh hostname - ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) - ;; (set! fullcmd (append launcher (car hosts)(list remote-megatest test-sig "-execute" cmdparms)))) + + ;; save the command line for adjutant mode (might never be needed but best to assemble it here) + (launch:ajt-cmdline-set! ajtdat (string-intersperse + (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) + (cond (launcher (set! fullcmd (append launcher (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param))) - ;; (set! fullcmd (append launcher (list remote-megatest test-sig "-execute" cmdparms)))) (else (if (not useshell)(debug:print 0 *default-log-port* "WARNING: internal launching will not work well without \"useshell yes\" in your [jobtools] section")) (set! fullcmd (append (list remote-megatest "-m" test-sig "-execute" cmdparms) debug-param (list (if useshell "&" "")))))) - ;; (set! fullcmd (list remote-megatest test-sig "-execute" cmdparms (if useshell "&" ""))))) + (if (args:get-arg "-xterm")(set! fullcmd (append fullcmd (list "-xterm")))) (debug:print 1 *default-log-port* "Launching " work-area) ;; set pre-launch-env-vars before launching, keep the vars in prevvals and put the envionment back when done (debug:print 4 *default-log-port* "fullcmd: " fullcmd) (set! *last-launch* (current-seconds)) ;; all that junk above takes time, set this as late as possible. - (let* ((commonprevvals (alist->env-vars - (hash-table-ref/default *configdat* "env-override" '()))) - (miscprevvals (alist->env-vars ;; consolidate this code with the code in megatest.scm for "-execute" - (append (list (list "MT_TEST_RUN_DIR" work-area) - (list "MT_TEST_NAME" test-name) - (list "MT_ITEM_INFO" (conc itemdat)) - (list "MT_RUNNAME" runname) - (list "MT_TARGET" mt_target) - (list "MT_ITEMPATH" item-path) - ) - itemdat))) - (testprevvals (alist->env-vars - (hash-table-ref/default tconfig "pre-launch-env-overrides" '()))) + (let* ((env-override-vars (hash-table-ref/default *configdat* "env-override" '())) + (commonprevvals (alist->env-vars env-override-vars)) + (misc-vars (append (list (list "MT_TEST_RUN_DIR" work-area) + (list "MT_TEST_NAME" test-name) + (list "MT_ITEM_INFO" (conc itemdat)) + (list "MT_RUNNAME" runname) + (list "MT_TARGET" mt_target) + (list "MT_ITEMPATH" item-path)) + itemdat)) + (miscprevvals (alist->env-vars misc-vars));; consolidate this code with the code in megatest.scm for "-execute" + (test-vars (hash-table-ref/default tconfig "pre-launch-env-overrides" '())) + (testprevvals (alist->env-vars test-vars)) + ;; Launchwait defaults to true, must override it to turn off wait (launchwait (if (equal? (configf:lookup *configdat* "setup" "launchwait") "no") #f #t)) - (launch-results-prev (apply (if launchwait ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed. - process:cmd-run-with-stderr-and-exitcode->list - process-run) - (if useshell - (let ((cmdstr (string-intersperse fullcmd " "))) - (if launchwait - cmdstr - (conc cmdstr " >> mt_launch.log 2>&1 &"))) - (car fullcmd)) - (if useshell - '() - (cdr fullcmd)))) + ;; BB: TODO: refactor this to examine return code of launcher, if nonzero, set state to launch failed. + (launch-results-prev (if (eq? launcher-mode 'adjutant) + '(#t 0) ;; just some fake data to fool downstream but non-applicable code + (apply (if launchwait + process:cmd-run-with-stderr-and-exitcode->list + process-run) + (if useshell + (let ((cmdstr (string-intersperse fullcmd " "))) + (if launchwait + cmdstr + (conc cmdstr " >> mt_launch.log 2>&1 &"))) + (car fullcmd)) + (if useshell + '() + (cdr fullcmd))))) (success (if launchwait (equal? 0 (cadr launch-results-prev)) #t)) (launch-results (if launchwait (car launch-results-prev) launch-results-prev))) - (if (not success) + + (launch:ajt-add-vars ajtdat env-override-vars) + (launch:ajt-add-vars ajtdat misc-vars) + (launch:ajt-add-vars ajtdat test-vars) + + ;; if in adjutant mode we register the job in the jobs_queue + ;; then fire off an adjutant runner + ;; + (if (eq? launcher-mode 'adjutant) + (let* ((adjutant-runner-cmd (append (cdr launcher) + (list remote-megatest "-adjutant" + (launch:ajt-host-type ajtdat) + "-start-dir" *toppath*))) + (adj-cmd (conc (string-intersperse (map conc adjutant-runner-cmd) " ") + "&"))) + (rmt:no-sync-add-job + (launch:ajt-host-type ajtdat) + (launch:ajt-vars ajtdat) + (launch:ajt-exekey ajtdat) + (launch:ajt-cmdline ajtdat)) + (print "adj-cmd: " adj-cmd) + (system adj-cmd) + )) + + (if (not success) (tests:test-set-status! run-id test-id "COMPLETED" "DEAD" "launcher failed; exited non-zero; check mt_launch.log" #f)) ;; (if launch-results launch-results "FAILED")) - (mutex-unlock! *launch-setup-mutex*) ;; yes, really should mutex all the way to here. Need to put this entire process into a fork. ;; (rmt:no-sync-del! lock-key) ;; release the lock for starting this test (if (not launchwait) ;; give the OS a little time to allow the process to start (thread-sleep! 0.01)) (with-output-to-file "mt_launch.log" (lambda () @@ -1658,10 +1700,14 @@ (process-signal (current-process-id) signal/kill) )) (alist->env-vars miscprevvals) (alist->env-vars testprevvals) (alist->env-vars commonprevvals) + ;; yes, really should mutex all the way to here. Need to put this entire process into a fork. + ;; the unlock previously was further up. This seemed wrong as we should not proceed until the + ;; vars have been reset. + (mutex-unlock! *launch-setup-mutex*) launch-results)) (change-directory *toppath*) (thread-sleep! (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.0)))) ;; recover a test where the top controlling mtest may have died @@ -1687,5 +1733,21 @@ ;; now wait on that process if all is correct ;; periodically update the db with runtime ;; when the process exits look at the db, if still RUNNING after 10 seconds set ;; state/status appropriately (process-wait pid))) + + + ;; (lock-key (conc "test-" test-id)) + ;; (got-lock (let loop ((lock (rmt:no-sync-get-lock lock-key)) + ;; (expire-time (+ (current-seconds) 15))) ;; give up on getting the lock and steal it after 15 seconds + ;; (if (car lock) + ;; #t + ;; (if (> (current-seconds) expire-time) + ;; (begin + ;; (debug:print-info 0 *default-log-port* "Timed out waiting for a lock to launch test " keyvals " " runname " " test-name " " test-path) + ;; (rmt:no-sync-del! lock-key) ;; destroy the lock + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)) ;; + ;; (begin + ;; (thread-sleep! 1) + ;; (loop (rmt:no-sync-get-lock lock-key) expire-time)))))) + Index: megatest.scm ================================================================== --- megatest.scm +++ megatest.scm @@ -39,10 +39,17 @@ (declare (uses mt)) (declare (uses api)) (declare (uses tasks)) ;; only used for debugging. (declare (uses env)) (declare (uses diff-report)) +(declare (uses mutils)) +(declare (uses adjutant)) +(import adjutant) + +(declare (uses mttop)) +(import mttop) + ;; (declare (uses ftail)) ;; (import ftail) (define *db* #f) ;; this is only for the repl, do not use in general!!!! @@ -51,18 +58,20 @@ (include "db_records.scm") (include "run_records.scm") (include "megatest-fossil-hash.scm") (use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (prefix base64 base64:) - readline apropos json http-client directory-utils typed-records - http-client srfi-18 extras format) + readline apropos json http-client directory-utils typed-records matchable + http-client srfi-18 extras format call-with-environment-variables) ;; Added for csv stuff - will be removed ;; (use sparse-vectors) -(require-library mutils) +(import mutils) + +;;(require-library mutils) (define *usage-log-file* #f) ;; put path to file for logging usage in this var in the ~/.megatestrc file (define *usage-use-seconds* #t) ;; for Epoc seconds in usage logging change this to #t in ~/.megatestrc file ;; load the ~/.megatestrc file, put (use trace)(trace-call-sites #t)(trace function-you-want-to-trace) in this file @@ -102,10 +111,11 @@ Usage: megatest [options] -h : this help -manual : show the Megatest user manual -version : print megatest version (currently " megatest-version ") + help : help for the new Megatest interface Launching and managing runs -run : run all tests or as specified by -testpatt -remove-runs : remove the data for a run, requires -runname and -testpatt Optionally use :state and :status, use -keep-records to remove only @@ -199,11 +209,11 @@ -update-meta : update the tests metadata for all tests -setvars VAR1=val1,VAR2=val2 : Add environment variables to a run NB// these are overwritten by values set in config files. -server -|hostname : start the server (reduces contention on megatest.db), use - to automatically figure out hostname - -adjutant C,M : start the server/adjutant with allocated cores C and Mem M (Gig), + -adjutant host-type : start the server/adjutant with given host-type use 0,0 to auto use full machine -transport http|rpc : use http or rpc for transport (default is http) -log logfile : send stdout and stderr to logfile -list-servers : list the servers -kill-servers : kill all servers @@ -268,10 +278,14 @@ Version " megatest-version ", built from " megatest-fossil-hash )) ;; -gui : start a gui interface ;; -config fname : override the runconfigs file with fname + +(mttop-run (command-line-arguments) + '("help")) + ;; process args (define remargs (args:get-args (argv) (list "-runtests" ;; run a specific test "-config" ;; override the config file name @@ -877,11 +891,14 @@ (let* ((db (env:open-db (if (null? remargs) "envdat.db" (car remargs))))) (env:save-env-vars db envcap) (env:close-database db) (set! *didsomething* #t)))) -;; delta "language" will eventually be res=a+b-c but for now it is just res=a-b +;; delta "language" will eventually be res=a+b-c but for now it is just res=a-b +;; +;; db file can be stuck on the end of the command line: +;; megatest -envdelta start-end -dumpmode bash -o .ezsteps/step5.sh /tmp/myfile.db ;; (let ((envdelta (args:get-arg "-envdelta"))) (if envdelta (let ((match (string-split envdelta "-")));; (string-match "([a-z0-9_]+)=([a-z0-9_\\-,]+)" envdelta))) (if (not (null? match)) @@ -918,16 +935,47 @@ (let ((tl (launch:setup)) (transport-type (string->symbol (or (args:get-arg "-transport") "http")))) (server:launch 0 transport-type) (set! *didsomething* #t))) + +(define (naylist->alist inlst) + (map (lambda (dat) + (cons (car dat) + (or (if (list? (cdr dat)) + (if (null? (cdr dat)) "" + (cadr dat)) + (cdr dat)) + ""))) ;; we need a string for call-with-environment-variables + inlst)) + ;; The adjutant is a bit different, it does NOT run (launch:setup) as it is not necessarily tied to ;; a specific Megatest area. Detail are being hashed out and this may change. ;; (if (args:get-arg "-adjutant") - (begin - (adjutant-run) + (let* ((host-type (args:get-arg "-adjutant"))) + (launch:setup) ;; dang it, wish this wasn't needed + (print "Running the adjutant!") + (let loop ((wait-count 0)) + (if (< wait-count 10) ;; 6 x 10 seconds = one minute + (let* ((dat (rmt:no-sync-take-job host-type))) + (match dat + ((id ht vars exekey cmdline state event-time last-update) + (let ((vars-alist (with-input-from-string vars read) + )) + (print "Vars:") + (pp vars-alist) + (call-with-environment-variables + (naylist->alist vars-alist) + (lambda () + (system cmdline)))) + (loop 0)) + (else + (thread-sleep! 10) + (loop (+ wait-count 1))))) + (print "I'm bored. Exiting."))) + ;; (adjutant-run (args:get-arg "-ajutant") rmt:no-sync-take-job) (set! *didsomething* #t))) (if (or (args:get-arg "-list-servers") (args:get-arg "-kill-servers")) (let ((tl (launch:setup))) ADDED mttop.scm Index: mttop.scm ================================================================== --- /dev/null +++ mttop.scm @@ -0,0 +1,55 @@ +;; Copyright 2006-2011, Matthew Welland. +;; +;; This program is made available under the GNU GPL version 2.0 or +;; greater. See the accompanying file COPYING for details. +;; +;; This program is distributed WITHOUT ANY WARRANTY; without even the +;; implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +;; PURPOSE. + +;; This is from the perl world, a hash of hashes is a super easy way to keep a handle on +;; lots of disparate data +;; + +(declare (unit mttop)) + +(module mttop + * + +(import chicken scheme + ;; data-structures posix + srfi-1 + ;; srfi-13 + srfi-69 + ports + extras + regex + posix + data-structures + matchable + ) + +(define (str-is-cmd cmd all-cmds) + (let* ((rx (regexp (conc "^" cmd ".*"))) + (mx (filter string? (map (lambda (x) + (let ((res (string-match rx x))) + (if res (car res) #f))) + all-cmds)))) + (if (eq? (length mx) 1) ;; have a command + (car mx) + #f))) + +(define (mttop-run args all-cmds) + ;; any path through this call must end in exit if it is NOT an old Megatest call + (if (null? args) + #f ;; continue on and do the old Megatest stuff + (let ((cmd (str-is-cmd (car args) all-cmds))) + (if cmd + (begin + (case (string->symbol cmd) + ((help)(print "New help")) + (else (print "Command " cmd " is not implemented yet."))) + (exit)) ;; always exit here + #f)))) ;; or continue on to Megatest old stuff here + +) Index: rmt.scm ================================================================== --- rmt.scm +++ rmt.scm @@ -56,18 +56,39 @@ ;;====================================================================== (define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id +(define *rmt-query-last-call-time* 0) +(define *rmt-query-last-rest-time* 0) ;; last time there was at least a 1/2 second rest - giving other processes access to the db + +;; NOTE: This query rest algorythm will not adapt to long query times. REDESIGN NEEDED. TODO. FIXME. +;; +(define (rmt:query-rest cmd rid params) + (let* ((now (current-milliseconds))) + (cond + ((> (- now *rmt-query-last-call-time*) 100) ;; it's been a while since last query - no need to rest + (set! *rmt-query-last-rest-time* now) + (set! *rmt-query-last-call-time* now)) + ((> (- now *rmt-query-last-rest-time*) 5000) ;; no natural rests have happened + (debug:print 0 *default-log-port* "query rest needed. blocking for 0.1 second. cmd="cmd", run id="rid", params="params) + (thread-sleep! 0.1) ;; force a rest of a half second + (set! *rmt-query-last-rest-time* now) + (set! *rmt-query-last-call-time* now)) + (else ;; sufficient rests have occurred, just record the last query time + (set! *rmt-query-last-call-time* now))))) + ;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname)) ;; (define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected #;(common:telemetry-log (conc "rmt:"(->string cmd)) payload: `((rid . ,rid) (params . ,params))) - + (if (not (equal? (configf:lookup *configdat* "setup" "query-rest") "no")) + (rmt:query-rest cmd rid params)) + (if (> attemptnum 2) (debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum)) (cond ((> attemptnum 2) (thread-sleep! 0.05)) @@ -386,12 +407,12 @@ (> (vector-length v) 1)) (let ((newvec (vector (vector-ref v 0)(vector-ref v 1)))) newvec) ;; by copying the vector while inside the error handler we should force the detection of a corrupted record (vector #t '())))) ;; we could also check that the returned types are valid (vector #t '()))) - (success (vector-ref resdat 0)) - (res (vector-ref resdat 1)) + (success (common:safe-vector-ref resdat 0 #f)) ;; (vector-ref resdat 0)) + (res (common:safe-vector-ref resdat 1 #f)) ;; (vector-ref resdat 1)) (duration (- (current-milliseconds) start))) (if (and read-only qry-is-write) (debug:print 0 *default-log-port* "ERROR: attempt to write to read-only database ignored. cmd=" cmd)) (if (not success) (if (> remretries 0) @@ -406,11 +427,11 @@ ;; (rmt:update-db-stats run-id cmd params duration) ;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it (if qry-is-write (let ((start-time (current-seconds))) (mutex-lock! *db-multi-sync-mutex*) -/ (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client) + (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client) (mutex-unlock! *db-multi-sync-mutex*))))) res)) (define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params) (let* ((run-id (if run-id run-id 0)) @@ -542,15 +563,32 @@ (rmt:general-call 'register-test run-id run-id test-name item-path)) (define (rmt:get-test-id run-id testname item-path) (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) -;; run-id is NOT used +;; run-id is NOT used - but it will be! ;; (define (rmt:get-test-info-by-id run-id test-id) (if (number? test-id) - (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)) + (let* ((testdat (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))) + (trundir (vector-ref testdat 10)) + (trundatf (conc trundir"/.mt_data/test-run.dat"))) + ;; now we can update a couple fields from the filesystem + (handle-exceptions + exn + (begin + (debug:print-info 0 *default-log-port* "Could not update testdat record from "trundatf", exn=" exn) + #f) + (if (and trundir + (file-exists? trundatf)) + (let* ((duration (vector-ref testdat 12)) ;; (db:test-get-run_duration testdat)) + (event-time (vector-ref testdat 5)) ;; (db:test-get-event_time testdat)) + (last-touch (file-modification-time trundatf)) + (new-duration (max duration (- last-touch event-time)))) + (vector-set! testdat 12 new-duration)))) + #;(db:test-set-run_duration! testdat (max duration (- last-touch event-time))) + testdat) (begin (debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) (print-call-chain (current-error-port)) #f))) @@ -679,11 +717,13 @@ (define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path #!key (mode '(normal))(itemmaps #f)) (rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps))) (define (rmt:get-count-tests-running-for-run-id run-id) - (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id))) + (if (number? run-id) + (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id)) + 0)) (define (rmt:get-not-completed-cnt run-id) (rmt:send-receive 'get-not-completed-cnt run-id (list run-id))) @@ -945,10 +985,19 @@ (define (rmt:no-sync-del! var) (rmt:send-receive 'no-sync-del! #f `(,var))) (define (rmt:no-sync-get-lock keyname) (rmt:send-receive 'no-sync-get-lock #f `(,keyname))) + +(define (rmt:no-sync-add-job host-type vars-list exekey cmdline) + (rmt:send-receive 'no-sync-add-job #f `(,host-type ,vars-list ,exekey ,cmdline))) + +(define (rmt:no-sync-take-job host-type) + (rmt:send-receive 'no-sync-take-job #f `(,host-type))) + +(define (rmt:no-sync-job-records-clean) + (rmt:set-receive 'no-sync-job-records-clean #f '())) ;;====================================================================== ;; A R C H I V E S ;;====================================================================== Index: runs.scm ================================================================== --- runs.scm +++ runs.scm @@ -61,11 +61,38 @@ (defstruct runs:testdat hed tal reg reruns test-record test-name item-path jobgroup waitons testmode newtal itemmaps prereqs-not-met) + +(module runsmod + ( + runs:wait-if-seen-recently + ) +(import scheme chicken data-structures extras files) +(import posix typed-records srfi-18 srfi-69 + md5 message-digest + regex srfi-1) + +(define *last-seen-ht* (make-hash-table)) + +(define (runs:wait-if-seen-recently wait-until . keys) + (let* ((full-key (string-intersperse keys "-")) + (last-seen (hash-table-ref/default *last-seen-ht* full-key 0)) + (now (current-seconds)) + (delta (- now last-seen)) + (needed (if (< delta wait-until) + 0 + (- wait-until delta)))) + (if (> needed 0)(thread-sleep! needed)) + (hash-table-set! *last-seen-ht* full-key (current-seconds)) + needed)) +) + +(import runsmod) + ;; look in the $MT_RUN_AREA_HOME/.softlocks directory for key-host-pid.softlock files ;; - remove any that are over 3600 seconds old ;; - if there are any that are younger than 10 seconds ;; * sleep 10 seconds ;; * touch my key-host-pid.softlock file @@ -492,10 +519,12 @@ ;; return #t when all items in waitors-upon list are represented in test-patt, #f otherwise. (define (runs:testpatts-mention-waitors-upon? test-patt waitors-upon) (null? (tests:filter-test-names-not-matched waitors-upon test-patt))) + +(define *find-and-mark-incomplete-last-run* (make-hash-table)) ;;====================================================================== ;; runs:run-tests is called from megatest.scm and itself ;;====================================================================== ;; @@ -664,17 +693,17 @@ (runs:run-pre-hook run-id) ;; mark all test launched flag as false in the meta table (rmt:set-var (conc "lunch-complete-" run-id) "no") (debug:print-info 1 *default-log-port* "Setting end-of-run to no") (let* ((config-reruns (let ((x (configf:lookup *configdat* "setup" "reruns"))) - (if x (string->number x) #f))) - (config-rerun-cnt (if config-reruns - config-reruns - 1))) - (if (eq? config-rerun-cnt run-count) - (rmt:set-var (conc "end-of-run-" run-id) "no"))) - + (if x (string->number x) #f))) + (config-rerun-cnt (if config-reruns + config-reruns + 1))) + (if (eq? config-rerun-cnt run-count) + (rmt:set-var (conc "end-of-run-" run-id) "no"))) + (rmt:set-run-state-status run-id "new" "n/a") ;; now add non-directly referenced dependencies (i.e. waiton) ;;====================================================================== ;; refactoring this block into tests:get-full-data ;; @@ -788,34 +817,27 @@ (debug:print-info 4 *default-log-port* "test-records=" (hash-table->alist test-records)) (let ((reglen (configf:lookup *configdat* "setup" "runqueue"))) (if (> (length (hash-table-keys test-records)) 0) (let* ((keep-going #t) (run-queue-retries 5) - #;(th1 (make-thread (lambda () - (handle-exceptions - exn - (begin - (print-call-chain) - (print " message: " ((condition-property-accessor 'exn 'message) exn))) - (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests - (any->number reglen) all-tests-registry))) - "runs:run-tests-queue")) (th2 (make-thread (lambda () ;; BBQ: why are we visiting ALL runs here? ;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ... (let ((run-ids (rmt:get-all-run-ids))) (for-each (lambda (run-id) (if keep-going (handle-exceptions - exn - (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn) - (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) + exn + (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn) + ;; lets run this only if a run has been NOT seen for more than 900 seconds + (if (> (- (current-seconds)(hash-table-ref/default *find-and-mark-incomplete-last-run* run-id 0)) 900) + (begin + (rmt:find-and-mark-incomplete run-id #f) + (hash-table-set! *find-and-mark-incomplete-last-run* run-id (current-seconds))) + )))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27) run-ids))) "runs: mark-incompletes"))) - ;; (thread-start! th1) (thread-start! th2) - ;; (thread-join! th1) - ;; just do the main stuff in the main thread (runs:run-tests-queue run-id runname test-records keyvals flags test-patts required-tests (any->number reglen) all-tests-registry) (set! keep-going #f) (thread-join! th2) ;; if run-count > 0 call, set -preclean and -rerun STUCK/DEAD @@ -824,12 +846,12 @@ (if (not (hash-table-ref/default flags "-preclean" #f)) (hash-table-set! flags "-preclean" #t)) (if (not (hash-table-ref/default flags "-rerun" #f)) (hash-table-set! flags "-rerun" "ABORT,STUCK/DEAD,n/a,ZERO_ITEMS")) ;; recursive call to self - (runs:run-tests target runname test-patts user flags run-count: (- run-count 1))) - (launch:end-of-run-check run-id))) + (runs:run-tests target runname test-patts user flags run-count: (- run-count 1))) + (launch:end-of-run-check run-id))) (debug:print-info 0 *default-log-port* "No tests to run"))) (debug:print-info 4 *default-log-port* "All done by here") ;; TODO: try putting post hook call here ; (debug:print-info 2 *default-log-port* " run-count " run-count) @@ -1270,11 +1292,11 @@ ((not have-resources) ;; simply try again after waiting a second (if (runs:lownoise "no resources" 60) (debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ...")) ;; Have gone back and forth on this but db starvation is an issue. ;; wait one second before looking again to run jobs. - (thread-sleep! 0.25) + (thread-sleep! 1) ;; changed back to 1 from 0.25 ;; could have done hed tal here but doing car/cdr of newtal to rotate tests (list (car newtal)(cdr newtal) reg reruns)) ;; This is the final stage, everything is in place so launch the test ;; @@ -1529,10 +1551,11 @@ (max-concurrent-jobs (configf:lookup-number *configdat* "setup" "max_concurrent_jobs" default: 50)) (reglen (if (number? reglen-in) reglen-in 1)) (last-time-incomplete (- (current-seconds) 900)) ;; force at least one clean up cycle (last-time-some-running (current-seconds)) ;; (tdbdat (tasks:open-db)) + (misc-data (make-hash-table)) ;; use as needed (runsdat (make-runs:dat ;; hed: hed ;; tal: tal ;; reg: reg ;; reruns: reruns @@ -1589,10 +1612,16 @@ (begin (set! last-time-incomplete (current-seconds)) ;; (rmt:find-and-mark-incomplete-all-runs) )) + ;; WAIT FOR TIME ON TIGHT LOOP + (if (< (- (current-milliseconds)(hash-table-ref/default misc-data "tight-loop-last-time" 0)) + 100) ;; less than 1/100 second since came through the loop + (thread-sleep! 0.1)) ;; wait a 1/100 seconds + (hash-table-set! misc-data "tight-loop-last-time" (current-milliseconds)) + ;; (print "Top of loop, hed=" hed ", tal=" tal " ,reruns=" reruns) (let* ((test-record (hash-table-ref test-records hed)) (test-name (tests:testqueue-get-testname test-record)) (tconfig (tests:testqueue-get-testconfig test-record)) (jobgroup (configf:lookup tconfig "test_meta" "jobgroup")) @@ -1602,11 +1631,11 @@ (priority (tests:testqueue-get-priority test-record)) (itemdat (tests:testqueue-get-itemdat test-record)) ;; itemdat can be a string, list or #f (items (tests:testqueue-get-items test-record)) (item-path (item-list->path itemdat)) (tfullname (db:test-make-full-name test-name item-path)) - ;; these are hard coded item-item waits test/item-path => test/item-path2 ... + ;; these are hard coded item-item waits test/item-path => test/item-path2 ... (extra-waits (let* ((section (configf:get-section (tests:testqueue-get-testconfig test-record) "waitons")) (myextra (alist-ref tfullname section equal?))) (if myextra (let ((extras (string-split (car myextra)))) (if (runs:lownoise (conc tfullname "extra-waitons" tfullname) 60) @@ -1614,11 +1643,11 @@ (for-each (lambda (extra) ;; (debug:print 0 *default-log-port* "FYI: extra = " extra " reruns = " reruns) (let ((basetestname (car (string-split extra "/")))) #;(if (not (member extra tal)) - (set! reruns (append tal (list extra)))) + (set! reruns (append tal (list extra)))) (if (not (member basetestname tal)) (set! reruns (append tal (list basetestname)))) )) extras) extras) @@ -1641,11 +1670,11 @@ newtal: newtal itemmaps: itemmaps ;; prereqs-not-met: prereqs-not-met ))) (runs:dat-regfull-set! runsdat regfull) - + (if (> num-running 0) (set! last-time-some-running (current-seconds))) (if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000))) (hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1))) @@ -1773,10 +1802,13 @@ (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) (- remtries 1))))))) ))))) ;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed + (let ((waited (runs:wait-if-seen-recently 5 "prereqs-not-met" hed item-path))) ;; if we've been down this path in the past 5 seconds - wait out the difference + (if (> waited 0)(debug:print 0 *default-log-port* "Waited for prereqs-not-met-"hed"-"item-path" for " waited "seconds."))) + (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)) ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) @@ -1852,11 +1884,11 @@ (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed) ) ) ;; if can't run more just loop with next possible test (loop (car newtal)(cdr newtal) reg reruns)))) - + ;; this case should not happen, added to help catch any bugs ((and (list? items) itemdat) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-5") (debug:print-error 0 *default-log-port* "Should not have a list of items in a test and the itemspath set - please report this") @@ -1886,11 +1918,11 @@ (debug:print-info 4 *default-log-port* "cond branch - " "rtq-9") (debug:print-info 4 *default-log-port* "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns)) ))) ;; end loop on sorted test names ;; this is the point where everything is launched and now you can mark the run in metadata table as all launched (rmt:set-var (conc "lunch-complete-" run-id) "yes") - + ;; now *if* -run-wait we wait for all tests to be done ;; Now wait for any RUNNING tests to complete (if in run-wait mode) ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) (thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle @@ -1901,29 +1933,31 @@ (equal? (configf:lookup *configdat* "setup" "run-wait") "yes")) (> num-running 0)) (begin ;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes ;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0)) - (if (> (current-seconds)(+ last-time-incomplete 900)) + (if (> (- (current-seconds)(hash-table-ref/default *find-and-mark-incomplete-last-run* run-id 0)) 900) (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id))) - (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id - ". Running as pid " (current-process-id) " on " (get-host-name)) - (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! - (rmt:find-and-mark-incomplete run-id #f) - (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running - " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " - (time->string (seconds->local-time (current-seconds)))))) - ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) - (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1)) - (wait-loop (rmt:get-count-tests-running-for-run-id run-id) - num-running)))) - ;; LET* ((test-record - ;; we get here on "drop through". All done! - ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. - ;; (debug:print-info 0 *default-log-port* "Calling Post Hook") - ;; (runs:run-post-hook run-id) - (debug:print-info 1 *default-log-port* "All tests launched"))) + (let ((actual-num-running num-running)) ;; (rmt:get-count-tests-running-for-run-id run-id))) ;; why call it again? + (debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id + ". Running as pid " (current-process-id) " on " (get-host-name)) + ;; (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set! + (rmt:find-and-mark-incomplete run-id #f) + (hash-table-set! *find-and-mark-incomplete-last-run* run-id (current-seconds)) + (debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running + " tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at " + (time->string (seconds->local-time (current-seconds))))) + ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat))) + (thread-sleep! 5)) ;; (if (>= num-running max-concurrent-jobs) 5 1)) + (wait-loop (rmt:get-count-tests-running-for-run-id run-id) + num-running))))) + ;; LET* ((test-record + ;; we get here on "drop through". All done! + ;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed. + ;; (debug:print-info 0 *default-log-port* "Calling Post Hook") + ;; (runs:run-post-hook run-id) + (debug:print-info 1 *default-log-port* "All tests launched"))) (define (runs:calc-fails prereqs-not-met) (filter (lambda (test) (and (vector? test) ;; not (string? test)) (member (db:test-get-state test) '("INCOMPLETE" "COMPLETED")) ;; TODO: pull from *common:stuff...* Index: server.scm ================================================================== --- server.scm +++ server.scm @@ -378,10 +378,11 @@ (define (server:wait-for-server-start-last-flag areapath) (let* ((start-flag (conc areapath "/logs/server-start-last")) ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds) (idletime (configf:lookup-number *configdat* "server" "idletime" default: 4)) (server-key (conc (get-host-name) "-" (current-process-id)))) + ;; (thread-sleep! (/ (random 500) 1000)) ;; I don't think this made a difference (if (file-exists? start-flag) (let* ((fmodtime (file-modification-time start-flag)) (delta (- (current-seconds) fmodtime)) (old-enough (> delta idletime)) (new-server-key "") Index: tasks.scm ================================================================== --- tasks.scm +++ tasks.scm @@ -443,23 +443,23 @@ (db:with-db dbstruct #f #t (lambda (db) (sqlite3:execute db (conc "DELETE FROM tasks_queue WHERE id IN (" task-ids ");"))))) -#;(define (tasks:process-queue dbstruct) - (let* ((task (tasks:snag-a-task dbstruct)) - (action (if task (tasks:task-get-action task) #f))) - (if action (print "tasks:process-queue task: " task)) - (if action - (case (string->symbol action) - ((run) (tasks:start-run dbstruct task)) - ((remove) (tasks:remove-runs dbstruct task)) - ((lock) (tasks:lock-runs dbstruct task)) - ;; ((monitor) (tasks:start-monitor db task)) - #;((rollup) (tasks:rollup-runs dbstruct task)) - ((updatemeta)(tasks:update-meta dbstruct task)) - #;((kill) (tasks:kill-monitors dbstruct task)))))) +;; (define (tasks:process-queue dbstruct) +;; (let* ((task (tasks:snag-a-task dbstruct)) +;; (action (if task (tasks:task-get-action task) #f))) +;; (if action (print "tasks:process-queue task: " task)) +;; (if action +;; (case (string->symbol action) +;; ((run) (tasks:start-run dbstruct task)) +;; ((remove) (tasks:remove-runs dbstruct task)) +;; ((lock) (tasks:lock-runs dbstruct task)) +;; ;; ((monitor) (tasks:start-monitor db task)) +;; #;((rollup) (tasks:rollup-runs dbstruct task)) +;; ((updatemeta)(tasks:update-meta dbstruct task)) +;; #;((kill) (tasks:kill-monitors dbstruct task)))))) (define (tasks:tasks->text tasks) (let ((fmtstr "~10a~10a~10a~12a~20a~12a~12a~10a")) (conc (format #f fmtstr "id" "action" "owner" "state" "target" "runname" "testpatts" "params") "\n" (string-intersperse @@ -742,11 +742,11 @@ ;; (define (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time) (let* ((runs-ht (hash-table-ref cached-info 'runs)) (runinf (hash-table-ref/default runs-ht run-id #f)) (area-id (vector-ref area-info 0))) - (if runinf + (if runinf runinf ;; already cached (let* ((run-dat (rmt:get-run-info run-id)) ;; NOTE: get-run-info returns a vector < row header > (run-name (rmt:get-run-name-from-id run-id)) (row (db:get-rows run-dat)) ;; yes, this returns a single row (header (db:get-header run-dat)) @@ -757,64 +757,64 @@ (comment (db:get-value-by-header row header "comment")) (fail-count (db:get-value-by-header row header "fail_count")) (pass-count (db:get-value-by-header row header "pass_count")) (db-contour (db:get-value-by-header row header "contour")) (contour (if (args:get-arg "-prepend-contour") - (if (and db-contour (not (equal? db-contour "")) (string? db-contour )) - (begin + (if (and db-contour (not (equal? db-contour "")) (string? db-contour )) + (begin (debug:print-info 10 *default-log-port* "db-contour" db-contour) - db-contour) - (args:get-arg "-contour")))) + db-contour) + (args:get-arg "-contour")))) (run-tag (if (args:get-arg "-run-tag") (args:get-arg "-run-tag") - "")) + "")) (last-update (db:get-value-by-header row header "last_update")) (keytarg (if (or (args:get-arg "-prepend-contour") (args:get-arg "-prefix-target")) - (conc "MT_CONTOUR/MT_AREA/" (string-intersperse (rmt:get-keys) "/")) (string-intersperse (rmt:get-keys) "/"))) ;; e.g. version/iteration/platform + (conc "MT_CONTOUR/MT_AREA/" (string-intersperse (rmt:get-keys) "/")) (string-intersperse (rmt:get-keys) "/"))) ;; e.g. version/iteration/platform (base-target (rmt:get-target run-id)) (target (if (or (args:get-arg "-prepend-contour") (args:get-arg "-prefix-target")) (conc (or (args:get-arg "-prefix-target") (conc contour "/" (common:get-area-name) "/")) base-target) base-target)) ;; e.g. v1.63/a3e1/ubuntu (spec-id (pgdb:get-ttype dbh keytarg)) (publish-time (if (args:get-arg "-cp-eventtime-to-publishtime") - event-time - (current-seconds))) + event-time + (current-seconds))) (new-run-id (if (and run-name base-target) (pgdb:get-run-id dbh spec-id target run-name area-id) #f))) - (if new-run-id - (begin ;; let ((run-record (pgdb:get-run-info dbh new-run-id)) - (hash-table-set! runs-ht run-id new-run-id) + (if new-run-id + (begin ;; let ((run-record (pgdb:get-run-info dbh new-run-id)) + (hash-table-set! runs-ht run-id new-run-id) ;; ensure key fields are up to date - ;; if last_update == pgdb_last_update do not update smallest-last-update-time - (let* ((pgdb-last-update (pgdb:get-run-last-update dbh new-run-id)) - (smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f))) - (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) - (hash-table-set! smallest-last-update-time "smallest-time" last-update))) + ;; if last_update == pgdb_last_update do not update smallest-last-update-time + (let* ((pgdb-last-update (pgdb:get-run-last-update dbh new-run-id)) + (smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f))) + (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time))) + (hash-table-set! smallest-last-update-time "smallest-time" last-update))) (pgdb:refresh-run-info dbh new-run-id state status owner event-time comment fail-count pass-count area-id last-update publish-time) (debug:print-info 4 *default-log-port* "Working on run-id " run-id " pgdb-id " new-run-id ) - (if (not (equal? run-tag "")) - (task:add-run-tag dbh new-run-id run-tag)) + (if (not (equal? run-tag "")) + (task:add-run-tag dbh new-run-id run-tag)) new-run-id) - + (if (or (not state) (equal? state "deleted")) - (begin - (debug:print-info 1 *default-log-port* "Warning: Run with id " run-id " was created after previous sync and deleted before the sync") #f) - (if (handle-exceptions - exn - (begin (print-call-chain) - (print ((condition-property-accessor 'exn 'message) exn)) + (begin + (debug:print-info 1 *default-log-port* "Warning: Run with id " run-id " was created after previous sync and deleted before the sync") #f) + (if (handle-exceptions + exn + (begin (print-call-chain) + (print ((condition-property-accessor 'exn 'message) exn)) #f) - - (pgdb:insert-run - dbh - spec-id target run-name state status owner event-time comment fail-count pass-count area-id last-update publish-time)) - (let* ((smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f))) - (if (or (not smallest-time) (< last-update smallest-time)) - (hash-table-set! smallest-last-update-time "smallest-time" last-update)) - (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time)) - #f))))))) + + (pgdb:insert-run + dbh + spec-id target run-name state status owner event-time comment fail-count pass-count area-id last-update publish-time)) + (let* ((smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f))) + (if (or (not smallest-time) (< last-update smallest-time)) + (hash-table-set! smallest-last-update-time "smallest-time" last-update)) + (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time)) + #f))))))) (define (task:add-run-tag dbh run-id tag) (let* ((tag-info (pgdb:get-tag-info-by-name dbh tag))) (if (not tag-info) (begin @@ -1018,11 +1018,11 @@ (define (tasks:sync-run-data dbh cached-info run-ids area-info smallest-last-update-time) (for-each (lambda (run-id) (debug:print-info 4 *default-log-port* "Check if run with " run-id " needs to be synced" ) (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time)) -run-ids)) + run-ids)) ;; get runs changed since last sync ;; (define (tasks:sync-test-data dbh cached-info area-info) ;; (let* (( Index: tests.scm ================================================================== --- tests.scm +++ tests.scm @@ -1716,10 +1716,27 @@ ;; (map car (sort data (lambda (a b) ;; (> (string->number (caddr a))(string->number (caddr b))))))) ;; )) (sort all-tests sort-fn1)))) ;; avoid dealing with deleted tests, look at the hash table +;; look up all waitons that are related to test "testname" +;; +(define (tests:get-mt-waitons testname flatten) + (let* ((mt-waitons (configf:get-section *configdat* "waitons")) + (my-waitons (filter + (lambda (x) + (string-match (conc "^(" testname "|" testname"/.*)$") (car x))) + mt-waitons))) + (if flatten + (map (lambda (w) + (car (string-split w "/"))) + (apply append (map (lambda (x) + (string-split (cadr x))) + my-waitons))) + my-waitons))) + +;; NOT USED (define (tests:easy-dot test-records outtype) (let-values (((fd temp-path) (file-mkstemp (conc "/tmp/" (current-user-name) ".XXXXXX")))) (let ((all-testnames (hash-table-keys test-records)) (temp-port (open-output-file* fd))) ;; (format temp-port "This file is ~A.~%" temp-path) @@ -1727,15 +1744,17 @@ (format temp-port " size=4,8\n") ;; (format temp-port " splines=none\n") (for-each (lambda (testname) (let* ((testrec (hash-table-ref test-records testname)) - (waitons (or (tests:testqueue-get-waitons testrec) '()))) + (waitons (or (tests:testqueue-get-waitons testrec) '())) + (my-mt-waitons (tests:get-mt-waitons testname #t))) + ;; (print "my-mt-waitons=" my-mt-waitons) (for-each (lambda (waiton) (format temp-port (conc " " waiton " -> " testname " [splines=ortho]\n"))) - waitons))) + (append waitons my-mt-waitons)))) all-testnames) (format temp-port "}\n") (close-output-port temp-port) (with-input-from-pipe (conc "env -i PATH=$PATH dot -T" outtype " < " temp-path) @@ -1760,17 +1779,19 @@ (conc " size=\"" (or sizex 11) "," (or sizey 11) "\";") " ratio=0.95;" ))) (let* ((testrec (hash-table-ref test-records hed)) (waitons (or (tests:testqueue-get-waitons testrec) '())) + (my-mt-waitons (tests:get-mt-waitons hed #t)) + (all-waitons (delete-duplicates (append waitons my-mt-waitons))) (newres (append res - (if (null? waitons) + (if (null? all-waitons) (list (conc " \"" hed "\" [shape=box];")) (map (lambda (waiton) (conc " \"" waiton "\" -> \"" hed "\" [shape=box];")) - waitons) - )))) + all-waitons))))) + ;; (debug:print 0 *default-log-port* "For test "hed" got "all-waitons) (if (null? tal) (append newres (list "}")) (loop (car tal)(cdr tal) newres) )))))) @@ -1788,27 +1809,34 @@ (close-input-port inp) res))) ;; read data from tmp file or create if not exists ;; if exists regen in background +;; mode: raw (return data as read) or munged (convert to list of lists and remove " from strings) ;; -(define (tests:lazy-dot testrecords outtype sizex sizey) +(define (tests:lazy-dot testrecords outtype sizex sizey mode) (let ((dfile (conc "/tmp/." (current-user-name) "-" (server:mk-signature) ".dot")) (fname (conc "/tmp/." (current-user-name) "-" (server:mk-signature) ".dotdat"))) (tests:write-dot-file testrecords dfile sizex sizey) - (if (common:file-exists? fname) - (let ((res (with-input-from-file fname - (lambda () - (read-lines))))) - (system (conc "env -i PATH=$PATH dot -T " outtype " < " dfile " > " fname "&")) - res) - (begin - (system (conc "env -i PATH=$PATH dot -T " outtype " < " dfile " > " fname)) - (with-input-from-file fname - (lambda () - (read-lines))))))) - + (let ((data (if (common:file-exists? fname) + (let ((res (with-input-from-file fname + (lambda () + (read-lines))))) + (system (conc "env -i PATH=$PATH dot -T " outtype " < " dfile " > " fname "&")) + res) + (begin + (system (conc "env -i PATH=$PATH dot -T " outtype " < " dfile " > " fname)) + (with-input-from-file fname + (lambda () + (read-lines))))))) + (if (eq? mode 'raw) + data + (map (lambda (inl) + (map (lambda (s) + (string-substitute "\"" "" s #t)) + (string-split inl))) + data))))) ;; for each test: ;; (define (tests:filter-non-runnable run-id testkeynames testrecordshash) (let ((runnables '())) @@ -1979,28 +2007,52 @@ tdb "SELECT count(id) FROM test_rundat;") res)) 0) -(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname) - (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1)) - (if (and cpuload diskfree) - (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id)) - (if minutes - (rmt:general-call 'update-run-duration run-id minutes test-id)) - (if (and uname hostname) - (rmt:general-call 'update-uname-host run-id uname hostname test-id))) +;; +(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname #!key (update-db #f)(tmpfree #f)) + (if (get-environment-variable "MT_TEST_RUN_DIR") + (let* ((dest-dir (conc (get-environment-variable "MT_TEST_RUN_DIR") "/.mt_data")) + (or-dash (lambda (instr) + (cond + ((not instr) "") ;; #f -> blank, indicates value unchanged since last measurement taken + ((string? instr)(if (string-search " " instr) (conc "\"" instr "\"") instr)) + (else instr)))) + (file-new (not (directory-exists? dest-dir)))) + (if file-new (create-directory dest-dir #t)) + (let* ((outp (open-output-file (conc dest-dir "/test-run.dat") #:append))) + (with-output-to-port outp + (lambda () + (if file-new + (print "epoch_time,run_id,test_id,cpuload,diskfree,tmpfree,run_minutes,hostname,uname")) + (print (current-seconds) "," (or-dash run-id) "," (or-dash test-id) "," + (or-dash cpuload) "," (or-dash diskfree) "," (or-dash tmpfree) "," + (or-dash minutes) "," (or-dash hostname) "," + (or-dash uname)))) ;; put uname last as it has spaces in it + (close-output-port outp))) + (begin + (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1)))) + (if update-db + (begin + (if (and cpuload diskfree) + (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id)) + (if minutes + (rmt:general-call 'update-run-duration run-id minutes test-id)) + (if (and uname hostname) + (rmt:general-call 'update-uname-host run-id uname hostname test-id))))) ;; This one is for running with no db access (i.e. via rmt: internally) -(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries) +(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries #!key (update-db #f)) ;; (define (tests:set-full-meta-info test-id run-id minutes work-area) ;; (let ((remtries 10)) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) + (tmpfree (get-df "/tmp")) (uname (get-uname "-srvpio")) (hostname (get-host-name))) - (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname))) + (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname update-db: update-db tmpfree: tmpfree))) ;; (define (tests:set-partial-meta-info test-id run-id minutes work-area) #;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) Index: utils/remrun ================================================================== --- utils/remrun +++ utils/remrun @@ -40,6 +40,12 @@ exit fi export NBFAKE_HOST=$1 shift -exec nbfake $* +cmd="" +for var in $(env | egrep "^(PARENT_|MT_)"|cut -d= -f1);do + new_var="`echo ${!var}`" + cmd="$cmd export $var=$new_var;" +done +cmd="$cmd $*" +exec nbfake $cmd