Index: TODO
==================================================================
--- TODO
+++ TODO
@@ -16,10 +16,17 @@
# along with Megatest. If not, see .
TODO
====
+WW38
+. Add test_rundat to no-sync ==> correction, put in /.meta/test-run.dat
+. Add STATE/STATUS transitions to .meta/test-run.dat or similar
+. Swizzle update-test-rundat to operate on no-sync
+. Swizzle update-run-duration, -uname-host and cpuload-diskfree to no-sync
+. On state/status change update tests table with duration
+
WW15
. fill newview matrix with data, filter pipeline gui elements
. improve [script], especially indent handling
WW16
@@ -35,11 +42,10 @@
. break command line into sections; all, run control, queries, utilities etc.
. pull in ftfplan (not integrated, just code pulled in)
WW20
. ./configure => ubuntu, sles11, sles12, rh7
-. Jenkins junit XML support
. Add output flushing in teamcity support
. Switch to using simple runs query everywhere
. Add end_time to runs and add a rollup call that sets state, status and end_time
Future
Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -570,11 +570,11 @@
(let* ((fullname (conc "logs/" file)))
(if (directory? fullname)
(debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
(handle-exceptions
exn
- (debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn)
+ (debug:print-info 0 *default-log-port* "failed to remove " fullname ", exn=" exn)
(delete-file* fullname)))))
files)
(debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files."))))))
;; Force a megatest cleanup-db if version is changed and skip-version-check not specified
@@ -1315,25 +1315,32 @@
(debug:print-info 0 *default-log-port* "using testpatt " args-testpatt " rtestpatt:" rtestpatt)
args-testpatt))))
-(define (common:false-on-exception thunk #!key (message #f))
- (handle-exceptions exn
+;;
+(define (common:false-on-exception thunk #!key (message #f)(tries 1))
+ (handle-exceptions
+ exn
(begin
(if message
- (debug:print-info 0 *default-log-port* message))
- #f) (thunk) ))
-
-(define (common:file-exists? path-string #!key (silent #f))
- ;; this avoids stack dumps in the case where
-
- ;;;; TODO: catch permission denied exceptions and emit appropriate warnings, eg: system error while trying to access file: "/nfs/pdx/disks/icf_env_disk001/bjbarcla/gwa/issues/mtdev/randy-slow/reproduce/q...
- (common:false-on-exception (lambda () (file-exists? path-string))
+ (debug:print-info 0 *default-log-port* message " exn=" exn))
+ (if (> tries 1)
+ (begin
+ (thread-sleep! 1)
+ (common:false-on-exception thunk message: message tries: (- tries 1)))
+ #f))
+ (thunk)))
+
+(define (common:file-exists? path-string #!key (silent #f)(tries 1))
+ ;; this avoids stack dumps in the case where NFS is slow or flakey
+ (common:false-on-exception
+ (lambda ()(file-exists? path-string))
message: (if (not silent)
(conc "Unable to access path: " path-string)
#f)
+ tries: tries
))
(define (common:directory-exists? path-string)
;;;; TODO: catch permission denied exceptions and emit appropriate warnings, eg: system error while trying to access file: "/nfs/pdx/disks/icf_env_disk001/bjbarcla/gwa/issues/mtdev/randy-slow/reproduce/q...
(common:false-on-exception (lambda () (directory-exists? path-string))
Index: dashboard.scm
==================================================================
--- dashboard.scm
+++ dashboard.scm
@@ -212,10 +212,12 @@
(hash-table-set!
(dboard:commondat-tabdats commondat)
tabnum
tabdat))
+(define *updater-running* #f) ;; move this into one of the stucts
+
;; gets and calls updater list based on curr-tab-num
;;
(define (dboard:common-run-curr-updaters commondat #!key (tab-num #f))
(if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat
(let* ((tnum (or tab-num (dboard:commondat-curr-tab-num commondat)))
@@ -240,11 +242,13 @@
tnum
(cons updater curr-updaters))))
;; data for each specific tab goes here
;;
-(defstruct dboard:tabdat
+(defstruct dboard:tabdat
+ ;; flags
+ ((already-running #f) : boolean) ;; the updater is already running. skip
;; runs
((allruns '()) : list) ;; list of dboard:rundat records
((allruns-by-id (make-hash-table)) : hash-table) ;; hash of run-id -> dboard:rundat records
((done-runs '()) : list) ;; list of runs already drawn
((not-done-runs '()) : list) ;; list of runs not yet drawn
@@ -645,11 +649,11 @@
;;
(define (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals)
(let* ((start-time (current-seconds))
(access-mode (dboard:tabdat-access-mode tabdat))
(num-to-get (string->number (or (configf:lookup *configdat* "setup" "num-tests-to-get")
- "200")))
+ "50"))) ;; was 200, which is fine in a normal run area.
(states (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat)))
(statuses (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat)))
(do-not-use-db-file-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab
(do-not-use-query-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-query-timestamps")) ;; this no longer troubles runs-summary-tab
(sort-info (get-curr-sort))
@@ -716,10 +720,11 @@
(if got-all
(begin
(dboard:rundat-last-update-set! run-dat (- start-time 2))
(dboard:rundat-run-data-offset-set! run-dat 0))
(begin
+ ;;; (thread-sleep! 0.25) ;; give the rest of the gui some time to update. <-- this did NOT help
(dboard:rundat-run-data-offset-set! run-dat
(+ num-to-get (dboard:rundat-run-data-offset run-dat)))))
(for-each
(lambda (tdat)
@@ -833,11 +838,13 @@
;;
;; create a virtual table of all the tests
;; keypatts: ( (KEY1 "abc%def")(KEY2 "%") )
;;
(define (dboard:update-rundat tabdat runnamepatt numruns testnamepatt keypatts)
- (let* ((access-mode (dboard:tabdat-access-mode tabdat))
+ (dboard:tabdat-already-running-set! tabdat #t)
+ (let* (;; (already-running (dboard:tabdat-already-running tabdat))
+ (access-mode (dboard:tabdat-access-mode tabdat))
(keys (dboard:tabdat-keys tabdat)) ;; (db:dispatch-query access-mode rmt:get-keys db:get-keys)))
(last-runs-update (- (dboard:tabdat-last-runs-update tabdat) 2))
(allruns (rmt:get-runs runnamepatt numruns (dboard:tabdat-start-run-offset tabdat) keypatts))
;;(allruns-tree (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f))
(allruns-tree (rmt:get-runs-by-patt keys "%" #f #f #f #f 0)) ;; last-runs-update));;'("id" "runname")
@@ -901,27 +908,26 @@
(hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id)
(hash-table-set! (dboard:tabdat-allruns-by-id tabdat) run-id run-struct))
(if (or (null? tal)
(> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update
(begin
- (when (> elapsed-time 2)
+ #;(when (> elapsed-time 2)
(debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")
(let* ((old-val (iup:attribute *tim* "TIME"))
(new-val (number->string (inexact->exact (floor (* 2 (string->number old-val)))))))
(if (< (string->number new-val) 5000)
- ((debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
- (iup:attribute-set! *tim* "TIME" new-val))))
-
-
- )
+ (begin
+ (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
+ (iup:attribute-set! *tim* "TIME" new-val)))))
(dboard:tabdat-allruns-set! tabdat new-res)
maxtests)
(if (> (dboard:rundat-run-data-offset run-struct) 0)
(loop run tal new-res newmaxtests) ;; not done getting data for this run
(loop (car tal)(cdr tal) new-res newmaxtests)))))))
(dboard:tabdat-filters-changed-set! tabdat #f)
- (dboard:update-tree tabdat runs-hash header tb)))
+ (dboard:update-tree tabdat runs-hash header tb)
+ (dboard:tabdat-already-running-set! tabdat #f)))
(define *collapsed* (make-hash-table))
(define (toggle-hide lnum uidat) ; fulltestname)
(let* ((btn (vector-ref (dboard:uidat-get-lftcol uidat) lnum))
@@ -2505,14 +2511,10 @@
;; insert extra widget here
(if extra-widget
extra-widget
(iup:hbox)) ;; empty widget
-
-
-
-
)))
(let* ((status-toggles (map (lambda (status)
(iup:toggle (conc status)
#:fontsize 8 ;; btn-fontsz ;; "10"
@@ -3725,28 +3727,36 @@
;; (define (tabdat-values tabdat)
;; runs update-rundat using the various filters from the gui
;;
(define (dashboard:do-update-rundat tabdat)
+ ;; this seems like a good place to check for already running and skip if so
+ ;;
+ ;; (set! *updater-running* #t)
+;;(if (dboard:tabdat-already-running tabdat)
+;; (begin
+;; (debug:print-info 0 *default-log-port* "Dashboard overloaded - updates will be slow, skipping update.")
+;; (dboard:tabdat-target tabdat))
(dboard:update-rundat
tabdat
(hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "runname" "%")
(dboard:tabdat-numruns tabdat)
(hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%")
;; generate key patterns from the target stored in tabdat
(let* ((dbkeys (dboard:tabdat-dbkeys tabdat)))
(let ((fres (if (dboard:tabdat-target tabdat)
- (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%"))))
- (map (lambda (k v)(list k v)) dbkeys ptparts))
- (let ((res '()))
- (for-each (lambda (key)
- (if (not (equal? key "runname"))
- (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f)))
- (if val (set! res (cons (list key val) res))))))
- dbkeys)
- res))))
- fres))))
+ (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%"))))
+ (map (lambda (k v)(list k v)) dbkeys ptparts))
+ (let ((res '()))
+ (for-each (lambda (key)
+ (if (not (equal? key "runname"))
+ (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f)))
+ (if val (set! res (cons (list key val) res))))))
+ dbkeys)
+ res))))
+ fres)))
+ #;(set! *updater-running* #f))
(define (dashboard:runs-tab-updater commondat tab-num)
(debug:catch-and-dump
(lambda ()
(let* ((tabdat (dboard:common-get-tabdat commondat tab-num: tab-num))
@@ -3801,22 +3811,25 @@
(dashboard:runs-tab-updater commondat 1))
tab-num: 2)
(iup:callback-set! *tim*
"ACTION_CB"
(lambda (time-obj)
- (let ((update-is-running #f))
- (mutex-lock! (dboard:commondat-update-mutex commondat))
- (set! update-is-running (dboard:commondat-updating commondat))
- (if (not update-is-running)
- (dboard:commondat-updating-set! commondat #t))
- (mutex-unlock! (dboard:commondat-update-mutex commondat))
- (if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
- (begin
- (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)
- (mutex-lock! (dboard:commondat-update-mutex commondat))
- (dboard:commondat-updating-set! commondat #f)
- (mutex-unlock! (dboard:commondat-update-mutex commondat)))
+ (if (not *updater-running*)
+ (begin
+ ;; (mutex-lock! (dboard:commondat-update-mutex commondat))
+ ;; (set! update-is-running (dboard:commondat-updating commondat))
+ ;;(if (not update-is-running)
+ ;; (dboard:commondat-updating-set! commondat #t))
+ ;;(mutex-unlock! (dboard:commondat-update-mutex commondat))
+ ;;(if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
+ ;; (begin
+ (set! *updater-running* #t)
+ (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)
+ (set! *updater-running* #f)
+ ;; (mutex-lock! (dboard:commondat-update-mutex commondat))
+ ;; (dboard:commondat-updating-set! commondat #f)
+ ;; (mutex-unlock! (dboard:commondat-update-mutex commondat)))
))
1))))
(let ((th1 (make-thread (lambda ()
(thread-sleep! 1)
Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -1776,15 +1776,33 @@
#f
)
(with-input-from-file infile read-lines)
)))
-;; select end_time-now from
-;; (select testname,item_path,event_time+run_duration as
-;; end_time,strftime('%s','now') as now from tests where state in
-;; ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));
-
+;; check duration against test-run.dat file if it exists and update the value in
+;; the db if necessary
+;;
+(define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)
+ (let* ((datf (conc run-dir ".mt_data/test-run.dat"))
+ (modt (if (and (file-exists? datf)
+ (file-read-access? datf))
+ (file-modification-time datf)
+ #f)) ;; (+ event-time run-duration))))
+ (alt-run-duration (if modt
+ (- modt event-time)
+ #f)))
+ (if (and alt-run-duration
+ (> alt-run-duration run-duration))
+ (begin
+ (debug:print 0 *default-log-port* "Test " test-id " run duration mismatch. Setting to " alt-run-duration)
+ (db:with-db
+ dbstruct #f #f
+ (lambda (db)
+ (sqlite3:execute db "UPDATE tests SET run_duration=? WHERE id=?;" alt-run-duration test-id)
+ #t)))
+ #f))) ;; #f = we did NOT adjust the time
+
(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
(let* ((incompleted '())
(oldlaunched '())
(toplevels '())
;; The default running-deadtime is 720 seconds = 12 minutes.
@@ -1828,37 +1846,39 @@
;; (db:test-get-run_duration testdat)))
;; 600)
;; (db:delay-if-busy dbdat)
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path event-time run-duration)
- (if (and (equal? uname "n/a")
- (equal? item-path "")) ;; this is a toplevel test
- ;; what to do with toplevel? call rollup?
- (begin
- (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
- (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
- (begin
- (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
- (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
- test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
- " event-time="event-time" run-duration="run-duration))))
+ (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration))
+ (if (and (equal? uname "n/a")
+ (equal? item-path "")) ;; this is a toplevel test
+ ;; what to do with toplevel? call rollup?
+ (begin
+ (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+ (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+ (begin
+ (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
+ (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
+ test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
+ " event-time="event-time" run-duration="run-duration)))))
stmth1
run-id running-deadtime) ;; default time 720 seconds
-
+
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path event-time run-duration)
- (if (and (equal? uname "n/a")
- (equal? item-path "")) ;; this is a toplevel test
- ;; what to do with toplevel? call rollup?
- (begin
- (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
- (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
- (begin
- (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
- " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
- " run-duration="run-duration)
- (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
+ (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration))
+ (if (and (equal? uname "n/a")
+ (equal? item-path "")) ;; this is a toplevel test
+ ;; what to do with toplevel? call rollup?
+ (begin
+ (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+ (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+ (begin
+ (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
+ " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
+ " run-duration="run-duration)
+ (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))))
stmth2
run-id remotehoststart-deadtime) ;; default time 230 seconds
;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
;;
@@ -3047,11 +3067,11 @@
;;
;; 1. cache tests-match-qry
;; 2. compile qry and store in hash
;; 3. convert for-each-row to fold
;;
-(define (db:get-tests-for-run-state-status dbstruct run-id testpatt)
+#;(define (db:get-tests-for-run-state-status dbstruct run-id testpatt)
(db:with-db
dbstruct run-id #f
(lambda (db)
(let* ((res '())
(stmt-cache (dbr:dbstruct-stmt-cache dbstruct))
@@ -3467,11 +3487,11 @@
(lambda (run-id)
(let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)))
(db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs)))
run-ids)))
-;; Get test data using test_id, run-id is not used
+;; Get test data using test_id, run-id is not used - but it will be!
;;
(define (db:get-test-info-by-id dbstruct run-id test-id)
(db:with-db
dbstruct
#f ;; run-id
Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -205,17 +205,19 @@
(current-seconds)
start-seconds)))))
(kill-tries 0))
;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
- (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
+ (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t)
(let loop ((minutes (calc-minutes))
(cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
(disk-free (get-df (current-directory)))
(last-sync (current-seconds)))
- (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
+ ;; (common:telemetry-log "zombie" (conc "launch:monitor-job -
+ ;; top of loop encountered at "(current-seconds)" with
+ ;; last-sync="last-sync))
(let* ((over-time (> (current-seconds) (+ last-sync update-period)))
(new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
(delta (abs (- load cpu-load))))
(if (> delta 0.1) ;; don't bother updating with small changes
load
@@ -233,33 +235,28 @@
(test-info (rmt:get-test-info-by-id run-id test-id))
(state (db:test-get-state test-info))
(status (db:test-get-status test-info))
(kill-reason "no kill reason specified")
(kill-job? #f))
- (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
+ #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
(cond
((test-get-kill-request run-id test-id)
(set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
(set! kill-job? #t))
((and runtlim (> (- (current-seconds) start-seconds) runtlim))
(set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
(set! kill-job? #t))
((equal? status "DEAD")
- (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
+ (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t)
(rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
(set! kill-job? #f)))
(debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
(launch:handle-zombie-tests run-id)
- (when do-sync
- ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
- ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
- (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds)))
- (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
- (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
-
+ (if do-sync ;; save meta data about the running of this test
+ (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
(if kill-job?
(begin
(debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
(mutex-lock! m)
;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
@@ -312,11 +309,11 @@
(if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
(loop (calc-minutes)
(or new-cpu-load cpu-load)
(or new-disk-free disk-free)
(if do-sync (current-seconds) last-sync)))))))
- (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
+ (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional
(define (launch:execute encoded-cmd)
(let* ((cmdinfo (common:read-encoded-string encoded-cmd))
(tconfigreg #f))
@@ -465,11 +462,13 @@
(debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.")
(exit))))
(test-pid (db:test-get-process_id test-info)))
(cond
;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag.
- ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
+ ((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun
+ (and (equal? (db:test-get-state test-info) "COMPLETED") ;; completed/abort => rerun if asked
+ (member (db:test-get-status test-info) '("ABORT"))))
(debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request")
;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a")
(rmt:general-call 'set-test-start-time #f test-id)
(rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f)
@@ -771,11 +770,13 @@
(if (not (null? tal))
(loop (car tal) (cdr tal)))))))))))
(define (launch:is-test-alive host pid)
(if (and host pid (not (equal? host "n/a")))
- (let* ((cmd (conc "ssh " host " pstree -A " pid))
+ (let* ((is-local (equal? host (get-host-name)))
+ (ssh-cmd (if is-local " " (conc "ssh " host " ")))
+ (cmd (conc ssh-cmd "pstree -A " pid))
(output (with-input-from-pipe cmd read-lines)))
(debug:print 2 *default-log-port* "Running " cmd " received " output)
(if (eq? (length output) 0)
#f
#t))
@@ -1306,12 +1307,12 @@
(debug:print 2 *default-log-port* " - creating run area in " test-path)
(handle-exceptions
exn
(begin
(debug:print-error 0 *default-log-port* " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn)
- ", exiting, exn=" exn)
- (exit 1))
+ ", continuing (might cause downstream issues?), exn=" exn)
+ #f)
(create-directory test-path #t))
(debug:print 2 *default-log-port*
" - creating link from: " test-path "\n"
" to: " lnktarget)
Index: megatest-version.scm
==================================================================
--- megatest-version.scm
+++ megatest-version.scm
@@ -18,6 +18,6 @@
;; Always use two or four digit decimal
;; 1.01, 1.02...1.10,1.11,1.1101 ... 1.99,2.00..
;; (declare (unit megatest-version))
-(define megatest-version 1.6569)
+(define megatest-version 1.6572)
Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -22,10 +22,12 @@
(declare (unit rmt))
(declare (uses api))
(declare (uses http-transport))
(include "common_records.scm")
+(include "db_records.scm")
+
;; (declare (uses rmtmod))
;; (import rmtmod)
;;
@@ -525,15 +527,24 @@
(rmt:general-call 'register-test run-id run-id test-name item-path))
(define (rmt:get-test-id run-id testname item-path)
(rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))
-;; run-id is NOT used
+;; run-id is NOT used - but it will be!
;;
(define (rmt:get-test-info-by-id run-id test-id)
(if (number? test-id)
- (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
+ (let* ((testdat (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)))
+ (trundatf (conc (db:test-get-rundir testdat) "/.mt_data/test-run.dat")))
+ ;; now we can update a couple fields from the filesystem
+ (if (and (db:test-get-rundir testdat)
+ (file-exists? trundatf))
+ (let* ((duration (db:test-get-run_duration testdat))
+ (event-time (db:test-get-event_time testdat))
+ (last-touch (file-modification-time trundatf)))
+ (db:test-set-run_duration! testdat (max duration (- last-touch event-time)))))
+ testdat)
(begin
(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
(print-call-chain (current-error-port))
#f)))
Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -60,11 +60,15 @@
)
(defstruct runs:testdat
hed tal reg reruns test-record
test-name item-path jobgroup
- waitons testmode newtal itemmaps prereqs-not-met)
+ waitons testmode newtal
+ itemmaps
+ (prereqs-not-met #f)
+ (last-update 0) ;;
+ )
;; look in the $MT_RUN_AREA_HOME/.softlocks directory for key-host-pid.softlock files
;; - remove any that are over 3600 seconds old
;; - if there are any that are younger than 10 seconds
;; * sleep 10 seconds
@@ -828,10 +832,24 @@
;; reg - list of previously visited tests
;; tal - list of never visited tests
;; prefer next hed to be from reg than tal.
(define runs:nothing-left-in-queue-count 0)
+
+;; cache the result of get-prereqs-not-met and don't call it if called in past 10 seconds
+;; NOTE: This is assuming that testdat is highly specific to this test
+;;
+(define (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path #!key (mode '(normal))(itemmaps #f))
+ ;; mode: testmode itemmaps: itemmaps)
+ (if (and (runs:testdat-prereqs-not-met testdat)
+ (< (- (current-seconds) (runs:testdat-last-update testdat)) 10)) ;;; only refresh for this test if
+ ;;; it has been at least 10 seconds
+ (runs:testdat-prereqs-not-met testdat) ;; return the cached result
+ (let* ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: mode itemmaps: itemmaps)))
+ (runs:testdat-prereqs-not-met-set! testdat res)
+ (runs:testdat-last-update-set! testdat (current-seconds))
+ res)))
;;======================================================================
;; runs:expand-items is called by runs:run-tests-queue
;;======================================================================
;;
@@ -838,22 +856,18 @@
;; return value of runs:expand-items is passed back to runs-tests-queue and is fed to named loop with this signature:
;; (let loop ((hed (car sorted-test-names))
;; (tal (cdr sorted-test-names))
;; (reg '()) ;; registered, put these at the head of tal
;; (reruns '()))
-(define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps)
+(define (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs
+ run-id waitons item-path testmode test-record
+ can-run-more items runname tconfig reglen test-registry
+ test-records itemmaps testdat)
(let* ((loop-list (list hed tal reg reruns))
- (prereqs-not-met (let ((res (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps)))
- (if (list? res)
- res
- (begin
- (debug:print 0 *default-log-port*
- "ERROR: rmt:get-prereqs-not-met returned non-list!\n"
- " res=" res " run-id=" run-id " waitons=" waitons " hed=" hed " item-path=" item-path " testmode=" testmode " itemmaps=" itemmaps)
- '()))))
- (have-itemized (not (null? (lset-intersection eq? testmode '(itemmatch itemwait)))))
- ;; (prereqs-not-met (mt:lazy-get-prereqs-not-met run-id waitons item-path mode: testmode itemmap: itemmap))
+ (prereqs-not-met (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path
+ mode: testmode itemmaps: itemmaps))
+ (have-itemized (not (null? (lset-intersection eq? testmode '(itemmatch itemwait)))))
(fails (runs:calc-fails prereqs-not-met))
(prereq-fails (runs:calc-prereq-fail prereqs-not-met))
(non-completed (runs:calc-not-completed prereqs-not-met))
(runnables (runs:calc-runnable prereqs-not-met))
(unexpanded-prereqs
@@ -1715,11 +1729,13 @@
(loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)
(- remtries 1)))))))
)))))
;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed
- (runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
+ (runs:lazy-get-prereqs-not-met testdat run-id waitons hed item-path
+ mode: testmode
+ itemmaps: itemmaps)
;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed
(runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))
(let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running
@@ -1784,18 +1800,19 @@
;; - but only do that if resources exist to kick off the job
;; EXPAND ITEMS
((or (procedure? items)(eq? items 'have-procedure))
(debug:print-info 4 *default-log-port* "cond branch - " "rtq-4")
(let ((can-run-more #f)) ;; (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)))
- (if (not can-run-more) #;(and (list? can-run-more)
+ (if (not can-run-more) #;(and (list? can-run-more) ;; IDEA, this mechanism may have had some value, make it configurable to test pros/cons TODO
(car can-run-more))
- (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) ;; itemized test expanded here
+ (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs
+ run-id waitons item-path testmode test-record
+ can-run-more items runname tconfig reglen test-registry
+ test-records itemmaps testdat)))
(if loop-list
(apply loop loop-list)
- (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)
- )
- )
+ (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)))
;; if can't run more just loop with next possible test
(loop (car newtal)(cdr newtal) reg reruns))))
;; this case should not happen, added to help catch any bugs
((and (list? items) itemdat)
Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -1944,52 +1944,53 @@
tdb
"SELECT count(id) FROM test_rundat;")
res))
0)
-(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)
- (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))
- (if (and cpuload diskfree)
- (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
- (if minutes
- (rmt:general-call 'update-run-duration run-id minutes test-id))
- (if (and uname hostname)
- (rmt:general-call 'update-uname-host run-id uname hostname test-id)))
+;;
+(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname #!key (update-db #f)(tmpfree #f))
+ (if (get-environment-variable "MT_TEST_RUN_DIR")
+ (let* ((dest-dir (conc (get-environment-variable "MT_TEST_RUN_DIR") "/.mt_data"))
+ (or-dash (lambda (instr)
+ (cond
+ ((not instr) "") ;; #f -> blank, indicates value unchanged since last measurement taken
+ ((string? instr)(if (string-search " " instr) (conc "\"" instr "\"") instr))
+ (else instr))))
+ (file-new (not (directory-exists? dest-dir))))
+ (if file-new (create-directory dest-dir #t))
+ (let* ((outp (open-output-file (conc dest-dir "/test-run.dat") #:append)))
+ (with-output-to-port outp
+ (lambda ()
+ (if file-new
+ (print "epoch_time,run_id,test_id,cpuload,diskfree,tmpfree,run_minutes,hostname,uname"))
+ (print (current-seconds) "," (or-dash run-id) "," (or-dash test-id) ","
+ (or-dash cpuload) "," (or-dash diskfree) "," (or-dash tmpfree) ","
+ (or-dash minutes) "," (or-dash hostname) ","
+ (or-dash uname)))) ;; put uname last as it has spaces in it
+ (close-output-port outp)))
+ (begin
+ (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))))
+ (if update-db
+ (begin
+ (if (and cpuload diskfree)
+ (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
+ (if minutes
+ (rmt:general-call 'update-run-duration run-id minutes test-id))
+ (if (and uname hostname)
+ (rmt:general-call 'update-uname-host run-id uname hostname test-id)))))
;; This one is for running with no db access (i.e. via rmt: internally)
-(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries)
+(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries #!key (update-db #f))
;; (define (tests:set-full-meta-info test-id run-id minutes work-area)
;; (let ((remtries 10))
(let* ((cpuload (get-cpu-load))
(diskfree (get-df (current-directory)))
+ (tmpfree (get-df "/tmp"))
(uname (get-uname "-srvpio"))
(hostname (get-host-name)))
- (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)))
-
-;; (define (tests:set-partial-meta-info test-id run-id minutes work-area)
-#;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries)
- (let* ((cpuload (get-cpu-load))
- (diskfree (get-df (current-directory)))
- (remtries 10))
- (handle-exceptions
- exn
- (if (> remtries 0)
- (begin
- (print-call-chain (current-error-port))
- (debug:print-info 0 *default-log-port* "WARNING: failed to set meta info. Will try " remtries " more times")
- (set! remtries (- remtries 1))
- (thread-sleep! 10)
- (tests:set-full-meta-info db test-id run-id minutes work-area (- remtries 1)))
- (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn)))
- (debug:print-error 0 *default-log-port* "tried for over a minute to update meta info and failed. Giving up")
- (debug:print 0 *default-log-port* "EXCEPTION: database probably overloaded or unreadable.")
- (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn))
- (debug:print 5 *default-log-port* "exn=" (condition->list exn))
- (debug:print 0 *default-log-port* " status: " ((condition-property-accessor 'sqlite3 'status) exn))
- (print-call-chain (current-error-port))))
- (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes)
- )))
+ (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname update-db: update-db tmpfree: tmpfree)))
+
;;======================================================================
;; A R C H I V I N G
;;======================================================================