Index: Makefile
==================================================================
--- Makefile
+++ Makefile
@@ -93,11 +93,12 @@
csc $(CSCOPTS) $(OFILES) dashboard.scm $(GOFILES) $(MOFILES) $(MOIMPFILES) -o dboard
mtut: $(OFILES) $(MOFILES) megatest-fossil-hash.scm mtut.scm megatest-version.scm
csc $(CSCOPTS) $(OFILES) $(MOFILES) mtut.scm -o mtut
-# include makefile.inc
+include makefile.inc
+include chicken.makefile
TCMTOBJS = \
api.o \
archive.o \
cgisetup/models/pgdb.o \
Index: TODO
==================================================================
--- TODO
+++ TODO
@@ -16,10 +16,17 @@
# along with Megatest. If not, see .
TODO
====
+WW38
+. Add test_rundat to no-sync ==> correction, put in /.meta/test-run.dat
+. Add STATE/STATUS transitions to .meta/test-run.dat or similar
+. Swizzle update-test-rundat to operate on no-sync
+. Swizzle update-run-duration, -uname-host and cpuload-diskfree to no-sync
+. On state/status change update tests table with duration
+
WW15
. fill newview matrix with data, filter pipeline gui elements
. improve [script], especially indent handling
WW16
Index: api.scm
==================================================================
--- api.scm
+++ api.scm
@@ -154,21 +154,21 @@
((> *api-process-request-count* 20) ;; 20)
(debug:print 0 *default-log-port* "WARNING: api:execute-requests received an overloaded message.")
(set! *server-overloaded* #t)
(vector #f (vector #f 'overloaded))) ;; the inner vector is what gets returned. nope, don't know why. please refactor!
(else
- (let* ((cmd-in (vector-ref dat 0))
+ (let* ((cmd-in (common:safe-vector-ref dat 0 'nocmd))
(cmd (if (symbol? cmd-in)
cmd-in
(string->symbol cmd-in)))
- (params (vector-ref dat 1))
+ (params (common:safe-vector-ref dat 1 '()))
(start-t (current-milliseconds))
(readonly-mode (dbr:dbstruct-read-only dbstruct))
(readonly-command (member cmd api:read-only-queries))
(writecmd-in-readonly-mode (and readonly-mode (not readonly-command)))
(foo (begin
- (common:telemetry-log (conc "api-in:"(->string cmd))
+ #;(common:telemetry-log (conc "api-in:"(->string cmd))
payload: `((params . ,params)))
#t))
(res
(if writecmd-in-readonly-mode
@@ -175,10 +175,12 @@
(conc "attempt to run write command "cmd" on a read-only database")
(case cmd
;;===============================================
;; READ/WRITE QUERIES
;;===============================================
+
+ ((nocmd) '(#f "All broken!"))
((get-keys-write) (db:get-keys dbstruct)) ;; force a dummy "write" query to force server; for debug in -repl
;; SERVERS
((start-server) (apply server:kind-run params))
@@ -359,16 +361,16 @@
start-t)))
(hash-table-set! *db-api-call-time* cmd
(cons delta-t (hash-table-ref/default *db-api-call-time* cmd '()))))
(if writecmd-in-readonly-mode
(begin
- (common:telemetry-log (conc "api-out:"(->string cmd))
+ #;(common:telemetry-log (conc "api-out:"(->string cmd))
payload: `((params . ,params)
(ok-res . #t)))
(vector #f res))
(begin
- (common:telemetry-log (conc "api-out:"(->string cmd))
+ #;(common:telemetry-log (conc "api-out:"(->string cmd))
payload: `((params . ,params)
(ok-res . #f)))
(vector #t res))))))))
;; http-server send-response
@@ -381,12 +383,12 @@
(set! *api-process-request-count* (+ *api-process-request-count* 1))
(let* ((cmd ($ 'cmd))
(paramsj ($ 'params))
(params (db:string->obj paramsj transport: 'http)) ;; incoming data from the POST (or is it a GET?)
(resdat (api:execute-requests dbstruct (vector cmd params))) ;; process the request, resdat = #( flag result )
- (success (vector-ref resdat 0))
- (res (vector-ref resdat 1))) ;; (vector flag payload), get the payload, ignore the flag (why?)
+ (success (common:safe-vector-ref resdat 0 #f))
+ (res (common:safe-vector-ref resdat 1 #f))) ;; (vector flag payload), get the payload, ignore the flag (why?)
(if (not success)
(debug:print 0 *default-log-port* "ERROR: success flag is #f for " cmd " with params " params))
(if (> *api-process-request-count* *max-api-process-requests*)
(set! *max-api-process-requests* *api-process-request-count*))
(set! *api-process-request-count* (- *api-process-request-count* 1))
Index: chicken.makefile
==================================================================
--- chicken.makefile
+++ chicken.makefile
@@ -23,11 +23,11 @@
# CHICKEN_BIN_DIR=$(shell dirname $(shell which csi))
# if have csi on path use that, else use default
# CSIPATH=$(shell which csi)
# CKPATH=$(shell dirname $(shell dirname $(CSIPATH)))
-sCHICKEN_PREFIX=$(or $(CKPATH),$(PREFIX)/bin/.$(ARCHSTR))
+CHICKEN_PREFIX=$(or $(CKPATH),$(PREFIX)/bin/.$(ARCHSTR))
whatever :
@echo "CHICKEN_PREFIX=$(CHICKEN_PREFIX)"
tgz-$(USER)/postgresql-9.6.4.tar.gz :
@@ -66,10 +66,11 @@
cd tgz-$(USER)/nanomsg-1.0.0/build-$(USER); make; make install
$(CHICKEN_PREFIX)/chicken-4.13.0/LICENSE : tgz-$(USER)/chicken-4.13.0.tar.gz
mkdir -p build-$(USER)/eggs-installed
cd build-$(USER);tar xf ../tgz-$(USER)/chicken-4.13.0.tar.gz
+ if [[ -e $(CHICKEN_PREFIX)/chicken-4.13.0/LICENSE ]];then touch $(CHICKEN_PREFIX)/chicken-4.13.0/LICENSE;fi
tgz-$(USER)/opensrc.fossil :
cd tgz-$(USER); fossil clone http://www.kiatoa.com/fossils/opensrc opensrc.fossil
mkdir tgz-$(USER)/opensrc
cd tgz-$(USER)/opensrc; fossil open --nested ../opensrc.fossil; fossil up; fossil uv sync
@@ -112,26 +113,40 @@
chicken-uninstall csc csi feathers nanocat sqlite3 vacuumdb logpro \
refdb
CKBIN_WRAPPERS=$(addprefix $(PREFIX)/bin/,$(ALL_CKBIN))
-$(PREFIX)/bin/% : $(CHICKEN_PREFIX)/bin/% $(CHICKEN_PREFIX)/bin/csi $(EGGSTARG2)
+$(PREFIX)/bin/% : $(CHICKEN_PREFIX)/bin/% $(CHICKEN_PREFIX)/bin/csi
utils/mk_wrapper_tool $(PREFIX) $* $(PREFIX)/bin/$*
chmod a+x $(PREFIX)/bin/$*
$(PREFIX)/bin :
mkdir -p $(PREFIX)/bin $(CHICKEN_PREFIX)/bin
-chicken : $(PREFIX)/bin $(CHICKEN_PREFIX)/bin/csi binwrappers
+# For the future - binwrappers
+chicken : $(PREFIX)/bin $(CHICKEN_PREFIX)/bin/csi postgresql.done nanomsg.done iup.done canvas-draw.done sqlite3.done sql-de-lite.done dbi.done $(EGGSTARG2)
@echo "Fake target to build prefix chicken"
binwrappers : $(CKBIN_WRAPPERS)
-postgresql.done : $(CHICKEN_PREFIX)/bin/pg_config
+# make the dep a dummy if not requiring our own build of postgres
+ifeq ($(BUILD_POSTGRES),yes)
+PG_DEP=$(CHICKEN_PREFIX)/bin/pg_config
+else
+PG_DEP=$(CHICKEN_PREFIX)/bin/csi
+endif
+
+postgresql.done : $(PG_DEP)
CSC_OPTIONS="-I$(CHICKEN_PREFIX)/include -L$(CHICKEN_PREFIX)/lib -L$(CHICKEN_PREFIX)/lib64" $(CHICKEN_PREFIX)/bin/chicken-install postgresql > postgresql.done
-nanomsg.done : $(CHICKEN_PREFIX)/lib/libnanomsg.so
+ifeq ($(BUILD_NANOMSG),yes)
+NMSG_DEP=$(CHICKEN_PREFIX)/lib/libnanomsg.so
+else
+NMSG_DEP=$(CHICKEN_PREFIX)/bin/csi
+endif
+
+nanomsg.done : $(NMSG_DEP)
CSC_OPTIONS="-I$(CHICKEN_PREFIX)/include -L$(CHICKEN_PREFIX)/lib -L$(CHICKEN_PREFIX)/lib64" $(CHICKEN_PREFIX)/bin/chicken-install nanomsg > nanomsg.done
iup.done : $(CHICKEN_PREFIX)/lib/libcallback.a
CSC_OPTIONS="-I$(CHICKEN_PREFIX)/include -L$(CHICKEN_PREFIX)/lib" $(CHICKEN_PREFIX)/bin/chicken-install -D no-library-checks -feature disable-iup-web -feature disable-iup-pplot -feature disable-iup-matrixex iup > iup.done
Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -488,11 +488,20 @@
(copy daysfile wksfile)
(copy hrsfile daysfile))
#t)
#f))
-
+(define (common:safe-vector-ref vec indx default)
+ (if (vector? vec)
+ (handle-exceptions
+ exn
+ (begin
+ (debug:print-info 0 *default-log-port* "remote data issue: exn=" exn)
+ default)
+ (vector-ref vec indx))
+ default))
+
;; Rotate logs, logic:
;; if > 500k and older than 1 week:
;; remove previous compressed log and compress this log
;; WARNING: This proc operates assuming that it is in the directory above the
@@ -2707,10 +2716,39 @@
(hash-table-for-each
vars
(lambda (var val)
(setenv var val)))
vars))
+
+(define (common:propogate-mt-vars-to-subrun proc propogate-vars)
+ (let ((vars (make-hash-table))
+ (var-patt "^MT_.*"))
+ (for-each
+ (lambda (vardat) ;; each env var
+ ;(for-each
+ ;(lambda (var-patt)
+ (if (string-match var-patt (car vardat))
+ (let ((var (car vardat))
+ (val (cdr vardat)))
+ (hash-table-set! vars var val)
+ (if (member var propogate-vars)
+ (begin
+ (print var " " (string-substitute "MT_" "PARENT_" var))
+ (setenv (string-substitute "MT_" "PARENT_" var) val)))
+ (unsetenv var))))
+; var-patts))
+ (get-environment-variables))
+ (cond
+ ((string? proc)(system proc))
+ (proc (proc)))
+ (hash-table-for-each
+ vars
+ (lambda (var val)
+ (if (member var propogate-vars)
+ (unsetenv (string-substitute "MT_" "PARENT_" var)))
+ (setenv var val)))
+ vars))
(define (common:run-a-command cmd #!key (with-vars #f) (with-orig-env #f))
(let* ((pre-cmd (dtests:get-pre-command))
(post-cmd (dtests:get-post-command))
Index: configf.scm
==================================================================
--- configf.scm
+++ configf.scm
@@ -781,11 +781,11 @@
;; if
(define (configf:read-alist fname)
(handle-exceptions
exn
(begin
- (debug:print 0 *default-log-port* "read of alist " fname " failed. exn=" exn)
+ (debug:print-info 0 *default-log-port* "unable to read alist " fname ". exn=" exn)
#f)
(configf:alist->config
(with-input-from-file fname read))))
(define (configf:write-alist cdat fname)
Index: dashboard.scm
==================================================================
--- dashboard.scm
+++ dashboard.scm
@@ -1900,27 +1900,42 @@
;; (set! colnum (+ colnum 1))
))))
run-ids)))
(define (dashboard:tests-ht->tests-dat tests-ht)
- (reverse
- (sort
- (hash-table-values tests-ht)
- (lambda (a b)
- (let ((a-test-name (db:test-get-testname a))
- (a-item-path (db:test-get-item-path a))
- (b-test-name (db:test-get-testname b))
- (b-item-path (db:test-get-item-path b))
- (a-event-time (db:test-get-event_time a))
- (b-event-time (db:test-get-event_time b)))
- (if (not (equal? a-test-name b-test-name))
- (> a-event-time b-event-time)
- (cond
- ((< 0 (string-compare3 a-test-name b-test-name)) #t)
- ((> 0 (string-compare3 a-test-name b-test-name)) #f)
- ((< 0 (string-compare3 a-item-path b-item-path)) #t)
- (else #f))))))))
+ (let ((oldest-item (make-hash-table))) ;;
+ ;; populate the oldest-item table
+ (for-each
+ (lambda (tdat)
+ (let ((tname (db:test-get-testname tdat))
+ (etime (db:test-get-event_time tdat)))
+ (if (hash-table-exists? oldest-item tname)
+ (if (< (hash-table-ref oldest-item tname) etime)
+ (hash-table-set! oldest-item tname etime))
+ (hash-table-set! oldest-item tname etime))))
+ (hash-table-values tests-ht))
+ (reverse
+ (sort
+ (hash-table-values tests-ht)
+ (lambda (a b)
+ (let ((a-test-name (db:test-get-testname a))
+ (a-item-path (db:test-get-item-path a))
+ (b-test-name (db:test-get-testname b))
+ (b-item-path (db:test-get-item-path b))
+ (a-event-time (db:test-get-event_time a))
+ (b-event-time (db:test-get-event_time b)))
+ (if (equal? a-test-name b-test-name)
+ (> a-event-time b-event-time)
+ (> (hash-table-ref oldest-item a-test-name)
+ (hash-table-ref oldest-item b-test-name)))))))))
+;; (if (not (equal? a-test-name b-test-name))
+;; (> a-event-time b-event-time)
+;; (cond
+;; ((< 0 (string-compare3 a-test-name b-test-name)) #t)
+;; ((> 0 (string-compare3 a-test-name b-test-name)) #f)
+;; ((< 0 (string-compare3 a-item-path b-item-path)) #t)
+;; (else #f)))))))))
(define (dashboard:run-id->tests-mindat run-id tabdat runs-hash)
(let* ((run (hash-table-ref/default runs-hash run-id #f))
(key-vals (rmt:get-key-vals run-id))
@@ -2015,10 +2030,12 @@
(iup:attribute-set! run-matrix "NUMCOL" max-col ))
(let ((effective-max-row (if (< max-row max-visible) max-visible max-row)))
(if (> effective-max-row (string->number (iup:attribute run-matrix "NUMLIN")))
(iup:attribute-set! run-matrix "NUMLIN" effective-max-row )))
+
+ (iup:attribute-set! run-matrix "WIDTHDEF" 16)
;; Row labels
(for-each (lambda (ind)
(let* ((name (car ind))
(num (cadr ind))
@@ -2054,20 +2071,20 @@
(iup:attribute-set! run-matrix key (cadr value))
(iup:attribute-set! run-matrix (conc "BGCOLOR" key) (car value))))))
matrix-content)
;; Col labels - do after setting Cell contents so they are accounted for in the size calc.
-
+
(for-each (lambda (ind)
(let* ((name (car ind))
(num (cadr ind))
(key (conc "0:" num)))
(if (not (equal? (iup:attribute run-matrix key) name))
(begin
(set! changed #t)
- (iup:attribute-set! run-matrix key name)
- (if (<= num max-col)
+ (iup:attribute-set! run-matrix key name) ;; (list->string (intersperse (string->list name) #\newline))) ;; name)
+ #;(if (<= num max-col)
(iup:attribute-set! run-matrix "FITTOTEXT" (conc "C" num)))))))
col-indices)
(if (and (eq? pass-num 0) changed)
(loop 1 #t)) ;; force second pass due to column labels changing
Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -1776,10 +1776,33 @@
#f
)
(with-input-from-file infile read-lines)
)))
+;; check duration against test-run.dat file if it exists and update the value in
+;; the db if necessary
+;;
+(define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)
+ (let* ((datf (conc run-dir ".mt_data/test-run.dat"))
+ (modt (if (and (file-exists? datf)
+ (file-read-access? datf))
+ (file-modification-time datf)
+ #f)) ;; (+ event-time run-duration))))
+ (alt-run-duration (if modt
+ (- modt event-time)
+ #f)))
+ (if (and alt-run-duration
+ (> alt-run-duration run-duration))
+ (begin
+ (debug:print 0 *default-log-port* "Test " test-id " run duration mismatch. Setting to " alt-run-duration)
+ (db:with-db
+ dbstruct #f #f
+ (lambda (db)
+ (sqlite3:execute db "UPDATE tests SET run_duration=? WHERE id=?;" alt-run-duration test-id)
+ #t)))
+ #f))) ;; #f = we did NOT adjust the time
+
;; select end_time-now from
;; (select testname,item_path,event_time+run_duration as
;; end_time,strftime('%s','now') as now from tests where state in
;; ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));
@@ -1828,37 +1851,39 @@
;; (db:test-get-run_duration testdat)))
;; 600)
;; (db:delay-if-busy dbdat)
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path event-time run-duration)
- (if (and (equal? uname "n/a")
- (equal? item-path "")) ;; this is a toplevel test
- ;; what to do with toplevel? call rollup?
- (begin
- (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
- (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
- (begin
- (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
- (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
- test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
- " event-time="event-time" run-duration="run-duration))))
+ (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration))
+ (if (and (equal? uname "n/a")
+ (equal? item-path "")) ;; this is a toplevel test
+ ;; what to do with toplevel? call rollup?
+ (begin
+ (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+ (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+ (begin
+ (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
+ (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
+ test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
+ " event-time="event-time" run-duration="run-duration)))))
stmth1
run-id running-deadtime) ;; default time 720 seconds
-
+
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path event-time run-duration)
- (if (and (equal? uname "n/a")
- (equal? item-path "")) ;; this is a toplevel test
- ;; what to do with toplevel? call rollup?
- (begin
- (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
- (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
- (begin
- (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
- " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
- " run-duration="run-duration)
- (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
+ (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration))
+ (if (and (equal? uname "n/a")
+ (equal? item-path "")) ;; this is a toplevel test
+ ;; what to do with toplevel? call rollup?
+ (begin
+ (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+ (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+ (begin
+ (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
+ " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
+ " run-duration="run-duration)
+ (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))))
stmth2
run-id remotehoststart-deadtime) ;; default time 230 seconds
;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
;;
@@ -3215,21 +3240,19 @@
test-id))))))
(mt:process-triggers dbstruct run-id test-id newstate newstatus))
;; NEW BEHAVIOR: Count tests running in all runs!
;;
-(define (db:get-count-tests-running dbstruct run-id fastmode)
- (let* ((qry (if fastmode
- "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '') LIMIT 1;"
- "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '');")))
- (db:with-db
- dbstruct
- run-id
- #f
- (lambda (db)
- (let* ((stmth (db:get-cache-stmth dbstruct db qry)))
- (sqlite3:first-result stmth))))))
+(define (db:get-count-tests-running dbstruct run-id)
+ (let* ((qry "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND NOT (uname = 'n/a' AND item_path = '');"))
+ (db:with-db
+ dbstruct
+ run-id
+ #f
+ (lambda (db)
+ (let* ((stmth (db:get-cache-stmth dbstruct db qry)))
+ (sqlite3:first-result stmth))))))
;; NEW BEHAVIOR: Count tests running in only one run!
;;
(define (db:get-count-tests-actually-running dbstruct run-id)
(db:with-db
@@ -3245,21 +3268,23 @@
run-id)))) ;; NOT IN (SELECT id FROM runs WHERE state='deleted');")
;; NEW BEHAVIOR: Look only at single run with run-id
;;
;; (define (db:get-running-stats dbstruct run-id)
-(define (db:get-count-tests-running-for-run-id dbstruct run-id fastmode)
- (let* ((qry (if fastmode
- "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? LIMIT 1;"
- "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;")))
+(define (db:get-count-tests-running-for-run-id dbstruct run-id)
+ (let* ((qry "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;"))
(db:with-db
dbstruct
run-id
#f
(lambda (db)
(let* ((stmth (db:get-cache-stmth dbstruct db qry)))
- (sqlite3:first-result stmth run-id))))))
+ (sqlite3:fold-row
+ (lambda (res val) val)
+ 0 stmth run-id))))))
+
+;; (sqlite3:first-result stmth run-id))))))
;; For a given testname how many items are running? Used to determine
;; probability for regenerating html
;;
(define (db:get-count-tests-running-for-testname dbstruct run-id testname)
@@ -3268,23 +3293,23 @@
run-id
#f
(lambda (db)
(let* ((stmt "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? AND NOT (uname = 'n/a' AND item_path = '') AND testname=?;")
(stmth (db:get-cache-stmth dbstruct db stmt)))
- (sqlite3:first-result
- stmth run-id testname)))))
+ (sqlite3:fold-row
+ (lambda (res val) val) 0 stmth run-id testname)))))
(define (db:get-not-completed-cnt dbstruct run-id)
-(db:with-db
+ (db:with-db
dbstruct
run-id
#f
(lambda (db)
- ;(print "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=" run-id)
- (sqlite3:first-result
- db
- "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=?;" run-id))))
+ (let* ((stmt "SELECT count(id) FROM tests WHERE state not in ('COMPLETED', 'DELETED') AND run_id=?;"))
+ (sqlite3:fold-row
+ (lambda (res val) val)
+ 0 (db:get-cache-stmth dbstruct db stmt) run-id)))))
(define (db:get-count-tests-running-in-jobgroup dbstruct run-id jobgroup)
(if (not jobgroup)
0 ;;
(let ((testnames '()))
@@ -3467,11 +3492,11 @@
(lambda (run-id)
(let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)))
(db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs)))
run-ids)))
-;; Get test data using test_id, run-id is not used
+;; Get test data using test_id, run-id is not used - but it will be!
;;
(define (db:get-test-info-by-id dbstruct run-id test-id)
(db:with-db
dbstruct
#f ;; run-id
Index: dcommon.scm
==================================================================
--- dcommon.scm
+++ dcommon.scm
@@ -953,14 +953,13 @@
(+ yoffset (* y scalef)))
;; sizex, sizey - canvas size
;; originx, originy - canvas origin
;;
-(define (dcommon:initial-draw-tests cnv xadj yadj sizex sizey sizexmm sizeymm originx originy tests-draw-state sorted-testnames test-records)
- (let* ((dot-data ;; (map cdr (filter
- ;; (lambda (x)(equal? "node" (car x)))
- (map string-split (tests:lazy-dot test-records "plain" sizex sizey))) ;; (tests:easy-dot test-records "plain")))
+(define (dcommon:initial-draw-tests cnv xadj yadj sizex sizey sizexmm sizeymm originx originy
+ tests-draw-state sorted-testnames test-records)
+ (let* ((dot-data (tests:lazy-dot test-records "plain" sizex sizey 'munged))
(xoffset (dcommon:get-xoffset tests-draw-state sizex xadj))
(yoffset (dcommon:get-yoffset tests-draw-state sizey yadj))
(no-dot (configf:lookup *configdat* "setup" "nodot"))
(boxh 15)
(boxw 10)
Index: ezsteps.scm
==================================================================
--- ezsteps.scm
+++ ezsteps.scm
@@ -105,11 +105,11 @@
(let ((proc (lambda ()
(set! pid (process-run "/bin/bash" (list "-c" cmd))))))
(if subrun
(begin
(debug:print-info 0 *default-log-port* "Running without MT_.* environment variables.")
- (common:without-vars proc "^MT_.*"))
+ (common:propogate-mt-vars-to-subrun proc '("MT_TARGET" "MT_LINKTREE" "MT_RUNNAME")))
(proc)))
(with-output-to-file "Makefile.ezsteps"
(lambda ()
(print stepname ".log :")
Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -205,17 +205,19 @@
(current-seconds)
start-seconds)))))
(kill-tries 0))
;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
- (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
+ (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t)
(let loop ((minutes (calc-minutes))
(cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
(disk-free (get-df (current-directory)))
(last-sync (current-seconds)))
- (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
+ ;; (common:telemetry-log "zombie" (conc "launch:monitor-job -
+ ;; top of loop encountered at "(current-seconds)" with
+ ;; last-sync="last-sync))
(let* ((over-time (> (current-seconds) (+ last-sync update-period)))
(new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
(delta (abs (- load cpu-load))))
(if (> delta 0.1) ;; don't bother updating with small changes
load
@@ -233,33 +235,28 @@
(test-info (rmt:get-test-info-by-id run-id test-id))
(state (db:test-get-state test-info))
(status (db:test-get-status test-info))
(kill-reason "no kill reason specified")
(kill-job? #f))
- (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
+ #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
(cond
((test-get-kill-request run-id test-id)
(set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
(set! kill-job? #t))
((and runtlim (> (- (current-seconds) start-seconds) runtlim))
(set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
(set! kill-job? #t))
((equal? status "DEAD")
- (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
+ (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t)
(rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
(set! kill-job? #f)))
(debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
(launch:handle-zombie-tests run-id)
- (when do-sync
- ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
- ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
- (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds)))
- (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
- (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
-
+ (if do-sync ;; save meta data about the running of this test
+ (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
(if kill-job?
(begin
(debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
(mutex-lock! m)
;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
@@ -312,11 +309,11 @@
(if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
(loop (calc-minutes)
(or new-cpu-load cpu-load)
(or new-disk-free disk-free)
(if do-sync (current-seconds) last-sync)))))))
- (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
+ (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional
(define (launch:execute encoded-cmd)
(let* ((cmdinfo (common:read-encoded-string encoded-cmd))
(tconfigreg #f))
@@ -465,10 +462,13 @@
(debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.")
(exit))))
(test-pid (db:test-get-process_id test-info)))
(cond
;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag.
+ ;;((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
+ ;; (and (equal? (db:test-get-state test-info) "COMPLETED") ;; completed/abort => rerun if asked
+ ;; (member (db:test-get-status test-info) '("ABORT"))))
((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
(debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request")
;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
(rmt:general-call 'set-test-start-time #f test-id)
@@ -731,11 +731,11 @@
;; > 0 RUNNING and test_dead then send KILLREQ ==> COMPLETED
;; 0 RUNNING ==> this is actually the first condition, should not get here
(define (launch:end-of-run-check run-id )
(let* ((not-completed-cnt (rmt:get-not-completed-cnt run-id))
- (running-cnt (rmt:get-count-tests-running-for-run-id run-id #f)) ;; fastmode=no
+ (running-cnt (rmt:get-count-tests-running-for-run-id run-id))
(all-test-launched (rmt:get-var (conc "lunch-complete-" run-id)))
(current-state (rmt:get-run-state run-id))
(current-status (rmt:get-run-status run-id)))
;;get-vars run-id to query metadata table to check if all completed. if all-test-launched = yes then only not-completed-cnt = 0 means everyting is completed if no entry found in the table do nothing
(debug:print 0 *default-log-port* "Running test cnt :" running-cnt)
@@ -771,11 +771,13 @@
(if (not (null? tal))
(loop (car tal) (cdr tal)))))))))))
(define (launch:is-test-alive host pid)
(if (and host pid (not (equal? host "n/a")))
- (let* ((cmd (conc "ssh " host " pstree -A " pid))
+ (let* ((is-local (equal? host (get-host-name)))
+ (ssh-cmd (if is-local " " (conc "ssh " host " ")))
+ (cmd (conc ssh-cmd "pstree -A " pid))
(output (with-input-from-pipe cmd read-lines)))
(debug:print 2 *default-log-port* "Running " cmd " received " output)
(if (eq? (length output) 0)
#f
#t))
Index: megatest.scm
==================================================================
--- megatest.scm
+++ megatest.scm
@@ -523,10 +523,11 @@
"-show-cmdinfo"
"-cleanup-db"))
(no-watchdog-args-vals (filter (lambda (x) x)
(map args:get-arg no-watchdog-args)))
(start-watchdog (null? no-watchdog-args-vals)))
+ ;;(print "no-watchdog-args="no-watchdog-args "no-watchdog-args-vals="no-watchdog-args-vals " start-watchdog-specail-arg-val:" start-watchdog-specail-arg-val " start-watchdog:" start-watchdog)
;;(BB> "no-watchdog-args="no-watchdog-args "no-watchdog-args-vals="no-watchdog-args-vals)
(if start-watchdog
(thread-start! *watchdog*)))
Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -54,18 +54,39 @@
(client:setup areapath)
#f))))
(define *send-receive-mutex* (make-mutex)) ;; should have separate mutex per run-id
+(define *rmt-query-last-call-time* 0)
+(define *rmt-query-last-rest-time* 0) ;; last time there was at least a 1/2 second rest - giving other processes access to the db
+
+;; NOTE: This query rest algorythm will not adapt to long query times. REDESIGN NEEDED. TODO. FIXME.
+;;
+(define (rmt:query-rest cmd rid params)
+ (let* ((now (current-milliseconds)))
+ (cond
+ ((> (- now *rmt-query-last-call-time*) 500) ;; it's been a while since last query - no need to rest
+ (set! *rmt-query-last-rest-time* now)
+ (set! *rmt-query-last-call-time* now))
+ ((> (- now *rmt-query-last-rest-time*) 5000) ;; no natural rests have happened
+ (debug:print 0 *default-log-port* "query rest needed. blocking for 1/2 second. cmd="cmd", run id="rid", params="params)
+ (thread-sleep! 0.5) ;; force a rest of a half second
+ (set! *rmt-query-last-rest-time* now)
+ (set! *rmt-query-last-call-time* now))
+ (else ;; sufficient rests have occurred, just record the last query time
+ (set! *rmt-query-last-call-time* now)))))
+
;; RA => e.g. usage (rmt:send-receive 'get-var #f (list varname))
;;
(define (rmt:send-receive cmd rid params #!key (attemptnum 1)(area-dat #f)) ;; start attemptnum at 1 so the modulo below works as expected
#;(common:telemetry-log (conc "rmt:"(->string cmd))
payload: `((rid . ,rid)
(params . ,params)))
-
+ (if (not (equal? (configf:lookup *configdat* "setup" "query-rest") "no"))
+ (rmt:query-rest cmd rid params))
+
(if (> attemptnum 2)
(debug:print 0 *default-log-port* "INFO: attemptnum in rmt:send-receive is " attemptnum))
(cond
((> attemptnum 2) (thread-sleep! 0.05))
@@ -369,12 +390,12 @@
(> (vector-length v) 1))
(let ((newvec (vector (vector-ref v 0)(vector-ref v 1))))
newvec) ;; by copying the vector while inside the error handler we should force the detection of a corrupted record
(vector #t '())))) ;; we could also check that the returned types are valid
(vector #t '())))
- (success (vector-ref resdat 0))
- (res (vector-ref resdat 1))
+ (success (common:safe-vector-ref resdat 0 #f)) ;; (vector-ref resdat 0))
+ (res (common:safe-vector-ref resdat 1 #f)) ;; (vector-ref resdat 1))
(duration (- (current-milliseconds) start)))
(if (and read-only qry-is-write)
(debug:print 0 *default-log-port* "ERROR: attempt to write to read-only database ignored. cmd=" cmd))
(if (not success)
(if (> remretries 0)
@@ -389,11 +410,11 @@
;; (rmt:update-db-stats run-id cmd params duration)
;; mark this run as dirty if this was a write, the watchdog is responsible for syncing it
(if qry-is-write
(let ((start-time (current-seconds)))
(mutex-lock! *db-multi-sync-mutex*)
-/ (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client)
+ (set! *db-last-access* start-time) ;; THIS IS PROBABLY USELESS? (we are on a client)
(mutex-unlock! *db-multi-sync-mutex*)))))
res))
(define (rmt:send-receive-no-auto-client-setup connection-info cmd run-id params)
(let* ((run-id (if run-id run-id 0))
@@ -525,15 +546,32 @@
(rmt:general-call 'register-test run-id run-id test-name item-path))
(define (rmt:get-test-id run-id testname item-path)
(rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))
-;; run-id is NOT used
+;; run-id is NOT used - but it will be!
;;
(define (rmt:get-test-info-by-id run-id test-id)
(if (number? test-id)
- (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
+ (let* ((testdat (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)))
+ (trundir (vector-ref testdat 10))
+ (trundatf (conc trundir"/.mt_data/test-run.dat")))
+ ;; now we can update a couple fields from the filesystem
+ (handle-exceptions
+ exn
+ (begin
+ (debug:print-info 0 *default-log-port* "Could not update testdat record from "trundatf", exn=" exn)
+ #f)
+ (if (and trundir
+ (file-exists? trundatf))
+ (let* ((duration (vector-ref testdat 12)) ;; (db:test-get-run_duration testdat))
+ (event-time (vector-ref testdat 5)) ;; (db:test-get-event_time testdat))
+ (last-touch (file-modification-time trundatf))
+ (new-duration (max duration (- last-touch event-time))))
+ (vector-set! testdat 12 new-duration))))
+ #;(db:test-set-run_duration! testdat (max duration (- last-touch event-time)))
+ testdat)
(begin
(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
(print-call-chain (current-error-port))
#f)))
@@ -661,21 +699,23 @@
run-ids))))
(define (rmt:get-prereqs-not-met run-id waitons ref-test-name ref-item-path #!key (mode '(normal))(itemmaps #f))
(rmt:send-receive 'get-prereqs-not-met run-id (list run-id waitons ref-test-name ref-item-path mode itemmaps)))
-(define (rmt:get-count-tests-running-for-run-id run-id fastmode)
- (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id fastmode)))
+(define (rmt:get-count-tests-running-for-run-id run-id)
+ (if (number? run-id)
+ (rmt:send-receive 'get-count-tests-running-for-run-id run-id (list run-id))
+ 0))
(define (rmt:get-not-completed-cnt run-id)
(rmt:send-receive 'get-not-completed-cnt run-id (list run-id)))
;; Statistical queries
-(define (rmt:get-count-tests-running run-id fastmode)
- (rmt:send-receive 'get-count-tests-running run-id (list run-id fastmode)))
+(define (rmt:get-count-tests-running run-id)
+ (rmt:send-receive 'get-count-tests-running run-id (list run-id)))
(define (rmt:get-count-tests-running-for-testname run-id testname)
(rmt:send-receive 'get-count-tests-running-for-testname run-id (list run-id testname)))
(define (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)
Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -61,11 +61,38 @@
(defstruct runs:testdat
hed tal reg reruns test-record
test-name item-path jobgroup
waitons testmode newtal itemmaps prereqs-not-met)
+
+(module runsmod
+ (
+ runs:wait-if-seen-recently
+ )
+(import scheme chicken data-structures extras files)
+(import posix typed-records srfi-18 srfi-69
+ md5 message-digest
+ regex srfi-1)
+
+(define *last-seen-ht* (make-hash-table))
+
+(define (runs:wait-if-seen-recently wait-until . keys)
+ (let* ((full-key (string-intersperse keys "-"))
+ (last-seen (hash-table-ref/default *last-seen-ht* full-key 0))
+ (now (current-seconds))
+ (delta (- now last-seen))
+ (needed (if (< delta wait-until)
+ 0
+ (- wait-until delta))))
+ (if (> needed 0)(thread-sleep! needed))
+ (hash-table-set! *last-seen-ht* full-key (current-seconds))
+ needed))
+)
+
+(import runsmod)
+
;; look in the $MT_RUN_AREA_HOME/.softlocks directory for key-host-pid.softlock files
;; - remove any that are over 3600 seconds old
;; - if there are any that are younger than 10 seconds
;; * sleep 10 seconds
;; * touch my key-host-pid.softlock file
@@ -321,11 +348,11 @@
(args:get-arg "-one-pass"))
(exit 0))
(if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
- (let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no
+ (let* ((num-running (rmt:get-count-tests-running run-id))
(num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
(job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup)))
(if (string? jobg-count)
(string->number jobg-count)
jobg-count))))
@@ -435,10 +462,12 @@
(debug:print-info 0 *default-log-port* "post-hook \"" run-post-hook "\" took " (- (current-seconds) start-time) " seconds to run."))))))
;; return #t when all items in waitors-upon list are represented in test-patt, #f otherwise.
(define (runs:testpatts-mention-waitors-upon? test-patt waitors-upon)
(null? (tests:filter-test-names-not-matched waitors-upon test-patt)))
+
+(define *find-and-mark-incomplete-last-run* (make-hash-table))
;;======================================================================
;; runs:run-tests is called from megatest.scm and itself
;;======================================================================
;;
@@ -745,13 +774,18 @@
;; (rmt:find-and-mark-incomplete-all-runs))))) CAN'T INTERRUPT IT ...
(let ((run-ids (rmt:get-all-run-ids)))
(for-each (lambda (run-id)
(if keep-going
(handle-exceptions
- exn
- (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)
- (rmt:find-and-mark-incomplete run-id #f)))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
+ exn
+ (debug:print 0 *default-log-port* "error in calling find-and-mark-incomplete for run-id " run-id ", exn=" exn)
+ ;; lets run this only if a run has been NOT seen for more than 900 seconds
+ (if (> (- (current-seconds)(hash-table-ref/default *find-and-mark-incomplete-last-run* run-id 0)) 900)
+ (begin
+ (rmt:find-and-mark-incomplete run-id #f)
+ (hash-table-set! *find-and-mark-incomplete-last-run* run-id (current-seconds)))
+ )))) ;; ovr-deadtime))) ;; could be root of https://hsdes.intel.com/appstore/article/#/220546828/main -- Title: Megatest jobs show DEAD even though they are still running (1.64/27)
run-ids)))
"runs: mark-incompletes")))
;; (thread-start! th1)
(thread-start! th2)
;; (thread-join! th1)
@@ -1211,11 +1245,11 @@
((not have-resources) ;; simply try again after waiting a second
(if (runs:lownoise "no resources" 60)
(debug:print-info 1 *default-log-port* "no resources to run new tests, waiting ..."))
;; Have gone back and forth on this but db starvation is an issue.
;; wait one second before looking again to run jobs.
- (thread-sleep! 0.25)
+ (thread-sleep! 1) ;; changed back to 1 from 0.25
;; could have done hed tal here but doing car/cdr of newtal to rotate tests
(list (car newtal)(cdr newtal) reg reruns))
;; This is the final stage, everything is in place so launch the test
;;
@@ -1565,11 +1599,11 @@
extras)
'())))
(waitons (delete-duplicates (append (tests:testqueue-get-waitons test-record) extra-waits) equal?))
(newtal (append tal (list hed)))
(regfull (>= (length reg) reglen))
- (num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes
+ (num-running (rmt:get-count-tests-running-for-run-id run-id))
(testdat (make-runs:testdat
hed: hed
tal: tal
reg: reg
reruns: reruns
@@ -1715,10 +1749,13 @@
(loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)
(- remtries 1)))))))
)))))
;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed
+ (let ((waited (runs:wait-if-seen-recently 5 "prereqs-not-met" hed item-path))) ;; if we've been down this path in the past 5 seconds - wait out the difference
+ (if (> waited 0)(debug:print 0 *default-log-port* "Waited for prereqs-not-met-"hed"-"item-path" for " waited "seconds.")))
+
(runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed
(runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))
@@ -1831,31 +1868,33 @@
;; now *if* -run-wait we wait for all tests to be done
;; Now wait for any RUNNING tests to complete (if in run-wait mode)
;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
(thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle
- (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes
+ (let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id))
(prev-num-running 0))
;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running)
(if (and (or (args:get-arg "-run-wait")
(equal? (configf:lookup *configdat* "setup" "run-wait") "yes"))
(> num-running 0))
(begin
;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0))
- (if (> (current-seconds)(+ last-time-incomplete 900))
- (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no
+ (if (> (- (current-seconds)(hash-table-ref/default *find-and-mark-incomplete-last-run* run-id 0)) 900)
+ ;; (begin(if (> (current-seconds)(+ last-time-incomplete 900))
+ (let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id)))
(debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id
". Running as pid " (current-process-id) " on " (get-host-name))
- (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!
+ ;; (set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!
(rmt:find-and-mark-incomplete run-id #f)
+ (hash-table-set! *find-and-mark-incomplete-last-run* run-id (current-seconds))
(debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running
" tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at "
(time->string (seconds->local-time (current-seconds))))))
;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
(thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1))
- (wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes
+ (wait-loop (rmt:get-count-tests-running-for-run-id run-id)
num-running))))
;; LET* ((test-record
;; we get here on "drop through". All done!
;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed.
;; (debug:print-info 0 *default-log-port* "Calling Post Hook")
Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -326,10 +326,11 @@
(define (server:wait-for-server-start-last-flag areapath)
(let* ((start-flag (conc areapath "/logs/server-start-last"))
;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
(reftime (configf:lookup-number *configdat* "server" "idletime" default: 4))
(server-key (conc (get-host-name) "-" (current-process-id))))
+ ;; (thread-sleep! (/ (random 500) 1000)) ;; I don't think this made a difference
(if (file-exists? start-flag)
(let* ((fmodtime (file-modification-time start-flag))
(delta (- (current-seconds) fmodtime))
(all-go (> delta reftime)))
(if (and all-go
Index: tasks.scm
==================================================================
--- tasks.scm
+++ tasks.scm
@@ -443,23 +443,23 @@
(db:with-db
dbstruct #f #t
(lambda (db)
(sqlite3:execute db (conc "DELETE FROM tasks_queue WHERE id IN (" task-ids ");")))))
-#;(define (tasks:process-queue dbstruct)
- (let* ((task (tasks:snag-a-task dbstruct))
- (action (if task (tasks:task-get-action task) #f)))
- (if action (print "tasks:process-queue task: " task))
- (if action
- (case (string->symbol action)
- ((run) (tasks:start-run dbstruct task))
- ((remove) (tasks:remove-runs dbstruct task))
- ((lock) (tasks:lock-runs dbstruct task))
- ;; ((monitor) (tasks:start-monitor db task))
- #;((rollup) (tasks:rollup-runs dbstruct task))
- ((updatemeta)(tasks:update-meta dbstruct task))
- #;((kill) (tasks:kill-monitors dbstruct task))))))
+;; (define (tasks:process-queue dbstruct)
+;; (let* ((task (tasks:snag-a-task dbstruct))
+;; (action (if task (tasks:task-get-action task) #f)))
+;; (if action (print "tasks:process-queue task: " task))
+;; (if action
+;; (case (string->symbol action)
+;; ((run) (tasks:start-run dbstruct task))
+;; ((remove) (tasks:remove-runs dbstruct task))
+;; ((lock) (tasks:lock-runs dbstruct task))
+;; ;; ((monitor) (tasks:start-monitor db task))
+;; #;((rollup) (tasks:rollup-runs dbstruct task))
+;; ((updatemeta)(tasks:update-meta dbstruct task))
+;; #;((kill) (tasks:kill-monitors dbstruct task))))))
(define (tasks:tasks->text tasks)
(let ((fmtstr "~10a~10a~10a~12a~20a~12a~12a~10a"))
(conc (format #f fmtstr "id" "action" "owner" "state" "target" "runname" "testpatts" "params") "\n"
(string-intersperse
@@ -742,11 +742,11 @@
;;
(define (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time)
(let* ((runs-ht (hash-table-ref cached-info 'runs))
(runinf (hash-table-ref/default runs-ht run-id #f))
(area-id (vector-ref area-info 0)))
- (if runinf
+ (if runinf
runinf ;; already cached
(let* ((run-dat (rmt:get-run-info run-id)) ;; NOTE: get-run-info returns a vector < row header >
(run-name (rmt:get-run-name-from-id run-id))
(row (db:get-rows run-dat)) ;; yes, this returns a single row
(header (db:get-header run-dat))
@@ -755,65 +755,65 @@
(owner (db:get-value-by-header row header "owner"))
(event-time (db:get-value-by-header row header "event_time"))
(comment (db:get-value-by-header row header "comment"))
(fail-count (db:get-value-by-header row header "fail_count"))
(pass-count (db:get-value-by-header row header "pass_count"))
- (db-contour (db:get-value-by-header row header "contour"))
+ (db-contour (db:get-value-by-header row header "contour"))
(contour (if (args:get-arg "-prepend-contour")
- (if (and db-contour (not (equal? db-contour "")) (string? db-contour ))
- (begin
- (debug:print-info 1 *default-log-port* "db-contour")
- db-contour)
- (args:get-arg "-contour"))))
- (run-tag (if (args:get-arg "-run-tag")
+ (if (and db-contour (not (equal? db-contour "")) (string? db-contour ))
+ (begin
+ (debug:print-info 1 *default-log-port* "db-contour")
+ db-contour)
+ (args:get-arg "-contour"))))
+ (run-tag (if (args:get-arg "-run-tag")
(args:get-arg "-run-tag")
- ""))
- (last-update (db:get-value-by-header row header "last_update"))
+ ""))
+ (last-update (db:get-value-by-header row header "last_update"))
(keytarg (if (or (args:get-arg "-prepend-contour") (args:get-arg "-prefix-target"))
- (conc "MT_CONTOUR/MT_AREA/" (string-intersperse (rmt:get-keys) "/")) (string-intersperse (rmt:get-keys) "/"))) ;; e.g. version/iteration/platform
+ (conc "MT_CONTOUR/MT_AREA/" (string-intersperse (rmt:get-keys) "/")) (string-intersperse (rmt:get-keys) "/"))) ;; e.g. version/iteration/platform
(target (if (or (args:get-arg "-prepend-contour") (args:get-arg "-prefix-target"))
- (conc (or (args:get-arg "-prefix-target") (conc contour "/" (common:get-area-name) "/")) (rmt:get-target run-id)) (rmt:get-target run-id))) ;; e.g. v1.63/a3e1/ubuntu
+ (conc (or (args:get-arg "-prefix-target") (conc contour "/" (common:get-area-name) "/")) (rmt:get-target run-id)) (rmt:get-target run-id))) ;; e.g. v1.63/a3e1/ubuntu
(spec-id (pgdb:get-ttype dbh keytarg))
(publish-time (if (args:get-arg "-cp-eventtime-to-publishtime")
- event-time
- (current-seconds)))
+ event-time
+ (current-seconds)))
(new-run-id (pgdb:get-run-id dbh spec-id target run-name area-id)))
- (if new-run-id
- (begin ;; let ((run-record (pgdb:get-run-info dbh new-run-id))
- (hash-table-set! runs-ht run-id new-run-id)
+ (if new-run-id
+ (begin ;; let ((run-record (pgdb:get-run-info dbh new-run-id))
+ (hash-table-set! runs-ht run-id new-run-id)
;; ensure key fields are up to date
- ;; if last_update == pgdb_last_update do not update smallest-last-update-time
- (let* ((pgdb-last-update (pgdb:get-run-last-update dbh new-run-id))
- (smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f)))
- (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time)))
- (hash-table-set! smallest-last-update-time "smallest-time" last-update)))
+ ;; if last_update == pgdb_last_update do not update smallest-last-update-time
+ (let* ((pgdb-last-update (pgdb:get-run-last-update dbh new-run-id))
+ (smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f)))
+ (if (and (> last-update pgdb-last-update) (or (not smallest-time) (< last-update smallest-time)))
+ (hash-table-set! smallest-last-update-time "smallest-time" last-update)))
(pgdb:refresh-run-info
dbh
new-run-id
state status owner event-time comment fail-count pass-count area-id last-update publish-time)
- (debug:print-info 0 *default-log-port* "Working on run-id " run-id " pgdb-id " new-run-id )
- (if (not (equal? run-tag ""))
- (task:add-run-tag dbh new-run-id run-tag))
+ (debug:print-info 0 *default-log-port* "Working on run-id " run-id " pgdb-id " new-run-id )
+ (if (not (equal? run-tag ""))
+ (task:add-run-tag dbh new-run-id run-tag))
new-run-id)
-
+
(if (equal? state "deleted")
- (begin
- (debug:print-info 1 *default-log-port* "Warning: Run with id " run-id " was created after previous sync and deleted before the sync") #f)
- (if (handle-exceptions
- exn
- (begin (print-call-chain)
- (print ((condition-property-accessor 'exn 'message) exn))
+ (begin
+ (debug:print-info 1 *default-log-port* "Warning: Run with id " run-id " was created after previous sync and deleted before the sync") #f)
+ (if (handle-exceptions
+ exn
+ (begin (print-call-chain)
+ (print ((condition-property-accessor 'exn 'message) exn))
#f)
-
- (pgdb:insert-run
- dbh
- spec-id target run-name state status owner event-time comment fail-count pass-count area-id last-update publish-time))
- (let* ((smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f)))
- (if (or (not smallest-time) (< last-update smallest-time))
- (hash-table-set! smallest-last-update-time "smallest-time" last-update))
- (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time))
- #f)))))))
+
+ (pgdb:insert-run
+ dbh
+ spec-id target run-name state status owner event-time comment fail-count pass-count area-id last-update publish-time))
+ (let* ((smallest-time (hash-table-ref/default smallest-last-update-time "smallest-time" #f)))
+ (if (or (not smallest-time) (< last-update smallest-time))
+ (hash-table-set! smallest-last-update-time "smallest-time" last-update))
+ (tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time))
+ #f)))))))
(define (task:add-run-tag dbh run-id tag)
(let* ((tag-info (pgdb:get-tag-info-by-name dbh tag)))
(if (not tag-info)
(begin
@@ -1015,11 +1015,11 @@
(define (tasks:sync-run-data dbh cached-info run-ids area-info smallest-last-update-time)
(for-each
(lambda (run-id)
(debug:print-info 1 *default-log-port* "Check if run with " run-id " needs to be synced" )
(tasks:run-id->mtpg-run-id dbh cached-info run-id area-info smallest-last-update-time))
-run-ids))
+ run-ids))
;; get runs changed since last sync
;; (define (tasks:sync-test-data dbh cached-info area-info)
;; (let* ((
Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -1701,10 +1701,27 @@
;; (map car (sort data (lambda (a b)
;; (> (string->number (caddr a))(string->number (caddr b)))))))
;; ))
(sort all-tests sort-fn1)))) ;; avoid dealing with deleted tests, look at the hash table
+;; look up all waitons that are related to test "testname"
+;;
+(define (tests:get-mt-waitons testname flatten)
+ (let* ((mt-waitons (configf:get-section *configdat* "waitons"))
+ (my-waitons (filter
+ (lambda (x)
+ (string-match (conc "^(" testname "|" testname"/.*)$") (car x)))
+ mt-waitons)))
+ (if flatten
+ (map (lambda (w)
+ (car (string-split w "/")))
+ (apply append (map (lambda (x)
+ (string-split (cadr x)))
+ my-waitons)))
+ my-waitons)))
+
+;; NOT USED
(define (tests:easy-dot test-records outtype)
(let-values (((fd temp-path) (file-mkstemp (conc "/tmp/" (current-user-name) ".XXXXXX"))))
(let ((all-testnames (hash-table-keys test-records))
(temp-port (open-output-file* fd)))
;; (format temp-port "This file is ~A.~%" temp-path)
@@ -1712,15 +1729,17 @@
(format temp-port " size=4,8\n")
;; (format temp-port " splines=none\n")
(for-each
(lambda (testname)
(let* ((testrec (hash-table-ref test-records testname))
- (waitons (or (tests:testqueue-get-waitons testrec) '())))
+ (waitons (or (tests:testqueue-get-waitons testrec) '()))
+ (my-mt-waitons (tests:get-mt-waitons testname #t)))
+ ;; (print "my-mt-waitons=" my-mt-waitons)
(for-each
(lambda (waiton)
(format temp-port (conc " " waiton " -> " testname " [splines=ortho]\n")))
- waitons)))
+ (append waitons my-mt-waitons))))
all-testnames)
(format temp-port "}\n")
(close-output-port temp-port)
(with-input-from-pipe
(conc "env -i PATH=$PATH dot -T" outtype " < " temp-path)
@@ -1745,17 +1764,19 @@
(conc " size=\"" (or sizex 11) "," (or sizey 11) "\";")
" ratio=0.95;"
)))
(let* ((testrec (hash-table-ref test-records hed))
(waitons (or (tests:testqueue-get-waitons testrec) '()))
+ (my-mt-waitons (tests:get-mt-waitons hed #t))
+ (all-waitons (delete-duplicates (append waitons my-mt-waitons)))
(newres (append res
- (if (null? waitons)
+ (if (null? all-waitons)
(list (conc " \"" hed "\" [shape=box];"))
(map (lambda (waiton)
(conc " \"" waiton "\" -> \"" hed "\" [shape=box];"))
- waitons)
- ))))
+ all-waitons)))))
+ ;; (debug:print 0 *default-log-port* "For test "hed" got "all-waitons)
(if (null? tal)
(append newres (list "}"))
(loop (car tal)(cdr tal) newres)
))))))
@@ -1773,27 +1794,34 @@
(close-input-port inp)
res)))
;; read data from tmp file or create if not exists
;; if exists regen in background
+;; mode: raw (return data as read) or munged (convert to list of lists and remove " from strings)
;;
-(define (tests:lazy-dot testrecords outtype sizex sizey)
+(define (tests:lazy-dot testrecords outtype sizex sizey mode)
(let ((dfile (conc "/tmp/." (current-user-name) "-" (server:mk-signature) ".dot"))
(fname (conc "/tmp/." (current-user-name) "-" (server:mk-signature) ".dotdat")))
(tests:write-dot-file testrecords dfile sizex sizey)
- (if (common:file-exists? fname)
- (let ((res (with-input-from-file fname
- (lambda ()
- (read-lines)))))
- (system (conc "env -i PATH=$PATH dot -T " outtype " < " dfile " > " fname "&"))
- res)
- (begin
- (system (conc "env -i PATH=$PATH dot -T " outtype " < " dfile " > " fname))
- (with-input-from-file fname
- (lambda ()
- (read-lines)))))))
-
+ (let ((data (if (common:file-exists? fname)
+ (let ((res (with-input-from-file fname
+ (lambda ()
+ (read-lines)))))
+ (system (conc "env -i PATH=$PATH dot -T " outtype " < " dfile " > " fname "&"))
+ res)
+ (begin
+ (system (conc "env -i PATH=$PATH dot -T " outtype " < " dfile " > " fname))
+ (with-input-from-file fname
+ (lambda ()
+ (read-lines)))))))
+ (if (eq? mode 'raw)
+ data
+ (map (lambda (inl)
+ (map (lambda (s)
+ (string-substitute "\"" "" s #t))
+ (string-split inl)))
+ data)))))
;; for each test:
;;
(define (tests:filter-non-runnable run-id testkeynames testrecordshash)
(let ((runnables '()))
@@ -1944,28 +1972,52 @@
tdb
"SELECT count(id) FROM test_rundat;")
res))
0)
-(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)
- (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))
- (if (and cpuload diskfree)
- (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
- (if minutes
- (rmt:general-call 'update-run-duration run-id minutes test-id))
- (if (and uname hostname)
- (rmt:general-call 'update-uname-host run-id uname hostname test-id)))
+;;
+(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname #!key (update-db #f)(tmpfree #f))
+ (if (get-environment-variable "MT_TEST_RUN_DIR")
+ (let* ((dest-dir (conc (get-environment-variable "MT_TEST_RUN_DIR") "/.mt_data"))
+ (or-dash (lambda (instr)
+ (cond
+ ((not instr) "") ;; #f -> blank, indicates value unchanged since last measurement taken
+ ((string? instr)(if (string-search " " instr) (conc "\"" instr "\"") instr))
+ (else instr))))
+ (file-new (not (directory-exists? dest-dir))))
+ (if file-new (create-directory dest-dir #t))
+ (let* ((outp (open-output-file (conc dest-dir "/test-run.dat") #:append)))
+ (with-output-to-port outp
+ (lambda ()
+ (if file-new
+ (print "epoch_time,run_id,test_id,cpuload,diskfree,tmpfree,run_minutes,hostname,uname"))
+ (print (current-seconds) "," (or-dash run-id) "," (or-dash test-id) ","
+ (or-dash cpuload) "," (or-dash diskfree) "," (or-dash tmpfree) ","
+ (or-dash minutes) "," (or-dash hostname) ","
+ (or-dash uname)))) ;; put uname last as it has spaces in it
+ (close-output-port outp)))
+ (begin
+ (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))))
+ (if update-db
+ (begin
+ (if (and cpuload diskfree)
+ (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
+ (if minutes
+ (rmt:general-call 'update-run-duration run-id minutes test-id))
+ (if (and uname hostname)
+ (rmt:general-call 'update-uname-host run-id uname hostname test-id)))))
;; This one is for running with no db access (i.e. via rmt: internally)
-(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries)
+(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries #!key (update-db #f))
;; (define (tests:set-full-meta-info test-id run-id minutes work-area)
;; (let ((remtries 10))
(let* ((cpuload (get-cpu-load))
(diskfree (get-df (current-directory)))
+ (tmpfree (get-df "/tmp"))
(uname (get-uname "-srvpio"))
(hostname (get-host-name)))
- (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)))
+ (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname update-db: update-db tmpfree: tmpfree)))
;; (define (tests:set-partial-meta-info test-id run-id minutes work-area)
#;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries)
(let* ((cpuload (get-cpu-load))
(diskfree (get-df (current-directory)))