Index: TODO
==================================================================
--- TODO
+++ TODO
@@ -16,10 +16,17 @@
# along with Megatest. If not, see .
TODO
====
+WW38
+. Add test_rundat to no-sync ==> correction, put in /.meta/test-run.dat
+. Add STATE/STATUS transitions to .meta/test-run.dat or similar
+. Swizzle update-test-rundat to operate on no-sync
+. Swizzle update-run-duration, -uname-host and cpuload-diskfree to no-sync
+. On state/status change update tests table with duration
+
WW15
. fill newview matrix with data, filter pipeline gui elements
. improve [script], especially indent handling
WW16
@@ -35,11 +42,10 @@
. break command line into sections; all, run control, queries, utilities etc.
. pull in ftfplan (not integrated, just code pulled in)
WW20
. ./configure => ubuntu, sles11, sles12, rh7
-. Jenkins junit XML support
. Add output flushing in teamcity support
. Switch to using simple runs query everywhere
. Add end_time to runs and add a rollup call that sets state, status and end_time
Future
Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -543,11 +543,11 @@
(handle-exceptions
exn
#f
(if (directory? fullname)
(begin
- (debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
+ (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
(inc-stat "directories"))
(begin
(delete-file* fullname)
(inc-stat "deleted")))
(hash-table-delete! all-files file)))))))
@@ -567,11 +567,11 @@
(- num-logs max-allowed))))
(for-each
(lambda (file)
(let* ((fullname (conc "logs/" file)))
(if (directory? fullname)
- (debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
+ (debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
(handle-exceptions
exn
(debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn)
(delete-file* fullname)))))
files)
@@ -1766,24 +1766,37 @@
;; - load-in, load as from uptime, NOT normalized
;; - numcpus, number of cpus, ideally use the real cpus, not threads
;;
(define (common:get-delay load-in numcpus)
(let* ((ratio (/ load-in numcpus))
- (new-option (configf:lookup *configdat* "load" "new-load-method")))
+ (new-option (configf:lookup *configdat* "load" "new-load-method"))
+ (paramstr (or (configf:lookup *configdat* "load" "exp-params")
+ "15 12 1281453987.9543 0.75")) ;; 5 4 10 1"))
+ (paramlst (map string->number (string-split paramstr))))
(if new-option
- (begin
- (cond ((and (>= ratio 0) (< ratio .5))
- 0
- )
- ((and (>= ratio 0.5) (<= ratio .9))
- (* ratio (/ 5 .9)))
- ((and (> ratio .9) (<= ratio 1.1))
- (+ 5 (* (- ratio .9) (/ 55 .2))))
- ((> ratio 1.1)
- 60)))
- (max (/ (expt 5 (* 4 ratio)) 10) 0))))
-
+ (begin
+ (cond ((and (>= ratio 0) (< ratio .5))
+ 0)
+ ((and (>= ratio 0.5) (<= ratio .9))
+ (* ratio (/ 5 .9)))
+ ((and (> ratio .9) (<= ratio 1.1))
+ (+ 5 (* (- ratio .9) (/ 55 .2))))
+ ((> ratio 1.1)
+ 60)))
+ (match paramlst
+ ((r1 r2 s1 s2)
+ (debug:print 3 *default-log-port* "Using params r1=" r1 " r2=" r2 " s1=" s1 " s2=" s2)
+ (min (max (/ (expt r1 (* r2 s2 ratio)) s1) 0) 30))
+ (else
+ (debug:print 0 *default-log-port* "BAD exp-params, should be \"r1 r2 s1 s2\" but got " paramstr)
+ 30)))))
+
+(define (common:print-delay-table)
+ (let loop ((x 0))
+ (print x "," (common:get-delay x 1))
+ (if (< x 2)
+ (loop (+ x 0.1)))))
(define (get-cpu-load #!key (remote-host #f))
(car (common:get-cpu-load remote-host)))
;; (let* ((load-res (process:cmd-run->list "uptime"))
;; (load-rx (regexp "load average:\\s+(\\d+)"))
@@ -1800,30 +1813,44 @@
;; get values from cached info from dropping file in logs dir
;; e.g. key is host and dtype is normalized-load
;;
(define (common:get-cached-info key dtype #!key (age 10))
(if *toppath*
- (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")))
+ (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log"))
+ (delfile (lambda ()
+ (debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn)
+ (delete-file* fullpath)
+ #f)))
(if (and (file-exists? fullpath)
(file-read-access? fullpath))
(handle-exceptions
exn
(begin
(debug:print 0 *default-log-port* "failed to get cached info from " fullpath ", exn=" exn)
#f)
(debug:print 2 *default-log-port* "reading file " fullpath)
- (let ((real-age (- (current-seconds)(file-change-time fullpath))))
+ (let ((real-age (- (current-seconds)
+ (handle-exceptions
+ exn
+ (begin
+ (debug:print 1 *default-log-port* "Failed to read mod time on file "
+ fullpath ", using 0, exn=" exn)
+ 0)
+ (file-change-time fullpath)))))
(if (< real-age age)
(handle-exceptions
exn
- (begin
- (debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn)
- (delete-file* fullpath)
- #f)
- (with-input-from-file fullpath read))
+ (delfile)
+ (let* ((res (with-input-from-file fullpath read)))
+ (if (eof-object? res)
+ (begin
+ (delfile)
+ #f)
+ res)))
(begin
- (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it")
+ (debug:print-info 2 *default-log-port* "file " fullpath
+ " is too old (" real-age" seconds) to trust, skipping reading it")
#f))))
(begin
(debug:print 2 *default-log-port* "not reading file " fullpath)
#f)))
#f))
@@ -2089,33 +2116,38 @@
#f
(common:get-homehost)))
(hh (if hh-dat (car hh-dat) #f)))
(common:wait-for-normalized-load maxnormload msg hh)))
+(define *numcpus-cache* (make-hash-table))
(define (common:get-num-cpus remote-host)
(let* ((actual-host (or remote-host (get-host-name))))
- (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) ;; hosts had better not be changing the number of cpus too often!
- (let* ((proc (lambda ()
- (let loop ((numcpu 0)
- (inl (read-line)))
- (if (eof-object? inl)
- (if (> numcpu 0)
- numcpu
- #f) ;; if zero return #f so caller knows that things are not working
- (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl)
- (+ numcpu 1)
- numcpu)
- (read-line))))))
- (result (if remote-host
- (with-input-from-pipe
- (conc "ssh " remote-host " cat /proc/cpuinfo")
- proc)
- (with-input-from-file "/proc/cpuinfo" proc))))
- (if (and (number? result)
- (> result 0))
- (common:write-cached-info actual-host "num-cpus" result))
- result))))
+ ;; hosts had better not be changing the number of cpus too often!
+ (or (hash-table-ref/default *numcpus-cache* actual-host #f)
+ (let* ((numcpus (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600)))
+ (let* ((proc (lambda ()
+ (let loop ((numcpu 0)
+ (inl (read-line)))
+ (if (eof-object? inl)
+ (if (> numcpu 0)
+ numcpu
+ #f) ;; if zero return #f so caller knows that things are not working
+ (loop (if (string-match "^processor\\s+:\\s+\\d+$" inl)
+ (+ numcpu 1)
+ numcpu)
+ (read-line))))))
+ (result (if remote-host
+ (with-input-from-pipe
+ (conc "ssh " remote-host " cat /proc/cpuinfo")
+ proc)
+ (with-input-from-file "/proc/cpuinfo" proc))))
+ (if (and (number? result)
+ (> result 0))
+ (common:write-cached-info actual-host "num-cpus" result))
+ result))))
+ (hash-table-set! *numcpus-cache* actual-host numcpus)
+ numcpus))))
;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5))
(let ((num-cpus (common:get-num-cpus remote-host)))
@@ -2155,11 +2187,14 @@
(effective-host (or remote-host "localhost"))
(normalized-effective-load (/ effective-load numcpus))
(will-wait (> normalized-effective-load maxnormload)))
(if (> recommended-delay 0)
(let* ((actual-delay (min recommended-delay 30)))
- (debug:print-info 0 *default-log-port* "Load control, delaying " actual-delay " seconds to maintain safe load.")
+ (if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load"))
+ (debug:print-info 0 *default-log-port* "Load control, delaying "
+ actual-delay " seconds to maintain safe load. current normalized effective load is "
+ normalized-effective-load"."))
(thread-sleep! actual-delay)))
(cond
;; bad data, try again to get the data
((not will-wait)
Index: dashboard.scm
==================================================================
--- dashboard.scm
+++ dashboard.scm
@@ -212,10 +212,12 @@
(hash-table-set!
(dboard:commondat-tabdats commondat)
tabnum
tabdat))
+(define *updater-running* #f) ;; move this into one of the stucts
+
;; gets and calls updater list based on curr-tab-num
;;
(define (dboard:common-run-curr-updaters commondat #!key (tab-num #f))
(if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat
(let* ((tnum (or tab-num (dboard:commondat-curr-tab-num commondat)))
@@ -240,11 +242,13 @@
tnum
(cons updater curr-updaters))))
;; data for each specific tab goes here
;;
-(defstruct dboard:tabdat
+(defstruct dboard:tabdat
+ ;; flags
+ ((already-running #f) : boolean) ;; the updater is already running. skip
;; runs
((allruns '()) : list) ;; list of dboard:rundat records
((allruns-by-id (make-hash-table)) : hash-table) ;; hash of run-id -> dboard:rundat records
((done-runs '()) : list) ;; list of runs already drawn
((not-done-runs '()) : list) ;; list of runs not yet drawn
@@ -645,11 +649,11 @@
;;
(define (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals)
(let* ((start-time (current-seconds))
(access-mode (dboard:tabdat-access-mode tabdat))
(num-to-get (string->number (or (configf:lookup *configdat* "setup" "num-tests-to-get")
- "200")))
+ "50"))) ;; was 200, which is fine in a normal run area.
(states (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat)))
(statuses (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat)))
(do-not-use-db-file-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab
(do-not-use-query-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-query-timestamps")) ;; this no longer troubles runs-summary-tab
(sort-info (get-curr-sort))
@@ -716,10 +720,11 @@
(if got-all
(begin
(dboard:rundat-last-update-set! run-dat (- start-time 2))
(dboard:rundat-run-data-offset-set! run-dat 0))
(begin
+ ;;; (thread-sleep! 0.25) ;; give the rest of the gui some time to update. <-- this did NOT help
(dboard:rundat-run-data-offset-set! run-dat
(+ num-to-get (dboard:rundat-run-data-offset run-dat)))))
(for-each
(lambda (tdat)
@@ -833,11 +838,13 @@
;;
;; create a virtual table of all the tests
;; keypatts: ( (KEY1 "abc%def")(KEY2 "%") )
;;
(define (dboard:update-rundat tabdat runnamepatt numruns testnamepatt keypatts)
- (let* ((access-mode (dboard:tabdat-access-mode tabdat))
+ (dboard:tabdat-already-running-set! tabdat #t)
+ (let* (;; (already-running (dboard:tabdat-already-running tabdat))
+ (access-mode (dboard:tabdat-access-mode tabdat))
(keys (dboard:tabdat-keys tabdat)) ;; (db:dispatch-query access-mode rmt:get-keys db:get-keys)))
(last-runs-update (- (dboard:tabdat-last-runs-update tabdat) 2))
(allruns (rmt:get-runs runnamepatt numruns (dboard:tabdat-start-run-offset tabdat) keypatts))
;;(allruns-tree (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f))
(allruns-tree (rmt:get-runs-by-patt keys "%" #f #f #f #f 0)) ;; last-runs-update));;'("id" "runname")
@@ -901,27 +908,26 @@
(hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id)
(hash-table-set! (dboard:tabdat-allruns-by-id tabdat) run-id run-struct))
(if (or (null? tal)
(> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update
(begin
- (when (> elapsed-time 2)
+ #;(when (> elapsed-time 2)
(debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")
(let* ((old-val (iup:attribute *tim* "TIME"))
(new-val (number->string (inexact->exact (floor (* 2 (string->number old-val)))))))
(if (< (string->number new-val) 5000)
- ((debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
- (iup:attribute-set! *tim* "TIME" new-val))))
-
-
- )
+ (begin
+ (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
+ (iup:attribute-set! *tim* "TIME" new-val)))))
(dboard:tabdat-allruns-set! tabdat new-res)
maxtests)
(if (> (dboard:rundat-run-data-offset run-struct) 0)
(loop run tal new-res newmaxtests) ;; not done getting data for this run
(loop (car tal)(cdr tal) new-res newmaxtests)))))))
(dboard:tabdat-filters-changed-set! tabdat #f)
- (dboard:update-tree tabdat runs-hash header tb)))
+ (dboard:update-tree tabdat runs-hash header tb)
+ (dboard:tabdat-already-running-set! tabdat #f)))
(define *collapsed* (make-hash-table))
(define (toggle-hide lnum uidat) ; fulltestname)
(let* ((btn (vector-ref (dboard:uidat-get-lftcol uidat) lnum))
@@ -2505,14 +2511,10 @@
;; insert extra widget here
(if extra-widget
extra-widget
(iup:hbox)) ;; empty widget
-
-
-
-
)))
(let* ((status-toggles (map (lambda (status)
(iup:toggle (conc status)
#:fontsize 8 ;; btn-fontsz ;; "10"
@@ -3725,28 +3727,36 @@
;; (define (tabdat-values tabdat)
;; runs update-rundat using the various filters from the gui
;;
(define (dashboard:do-update-rundat tabdat)
+ ;; this seems like a good place to check for already running and skip if so
+ ;;
+ ;; (set! *updater-running* #t)
+;;(if (dboard:tabdat-already-running tabdat)
+;; (begin
+;; (debug:print-info 0 *default-log-port* "Dashboard overloaded - updates will be slow, skipping update.")
+;; (dboard:tabdat-target tabdat))
(dboard:update-rundat
tabdat
(hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "runname" "%")
(dboard:tabdat-numruns tabdat)
(hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%")
;; generate key patterns from the target stored in tabdat
(let* ((dbkeys (dboard:tabdat-dbkeys tabdat)))
(let ((fres (if (dboard:tabdat-target tabdat)
- (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%"))))
- (map (lambda (k v)(list k v)) dbkeys ptparts))
- (let ((res '()))
- (for-each (lambda (key)
- (if (not (equal? key "runname"))
- (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f)))
- (if val (set! res (cons (list key val) res))))))
- dbkeys)
- res))))
- fres))))
+ (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%"))))
+ (map (lambda (k v)(list k v)) dbkeys ptparts))
+ (let ((res '()))
+ (for-each (lambda (key)
+ (if (not (equal? key "runname"))
+ (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f)))
+ (if val (set! res (cons (list key val) res))))))
+ dbkeys)
+ res))))
+ fres)))
+ #;(set! *updater-running* #f))
(define (dashboard:runs-tab-updater commondat tab-num)
(debug:catch-and-dump
(lambda ()
(let* ((tabdat (dboard:common-get-tabdat commondat tab-num: tab-num))
@@ -3801,22 +3811,25 @@
(dashboard:runs-tab-updater commondat 1))
tab-num: 2)
(iup:callback-set! *tim*
"ACTION_CB"
(lambda (time-obj)
- (let ((update-is-running #f))
- (mutex-lock! (dboard:commondat-update-mutex commondat))
- (set! update-is-running (dboard:commondat-updating commondat))
- (if (not update-is-running)
- (dboard:commondat-updating-set! commondat #t))
- (mutex-unlock! (dboard:commondat-update-mutex commondat))
- (if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
- (begin
- (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)
- (mutex-lock! (dboard:commondat-update-mutex commondat))
- (dboard:commondat-updating-set! commondat #f)
- (mutex-unlock! (dboard:commondat-update-mutex commondat)))
+ (if (not *updater-running*)
+ (begin
+ ;; (mutex-lock! (dboard:commondat-update-mutex commondat))
+ ;; (set! update-is-running (dboard:commondat-updating commondat))
+ ;;(if (not update-is-running)
+ ;; (dboard:commondat-updating-set! commondat #t))
+ ;;(mutex-unlock! (dboard:commondat-update-mutex commondat))
+ ;;(if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
+ ;; (begin
+ (set! *updater-running* #t)
+ (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)
+ (set! *updater-running* #f)
+ ;; (mutex-lock! (dboard:commondat-update-mutex commondat))
+ ;; (dboard:commondat-updating-set! commondat #f)
+ ;; (mutex-unlock! (dboard:commondat-update-mutex commondat)))
))
1))))
(let ((th1 (make-thread (lambda ()
(thread-sleep! 1)
Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -459,11 +459,11 @@
(print "Attempt to safely close sqlite3 db failed. Trying again. exn=" exn)
(thread-sleep! 3)
(sqlite3:interrupt! db)
(db:safely-close-sqlite3-db db stmtcache try-num: (- try-num 1)))
(if (sqlite3:database? db)
- (let* ((stmts (hash-table-ref/default stmt-cache db #f)))
+ (let* ((stmts (and stmt-cache (hash-table-ref/default stmt-cache db #f))))
(if stmts (map sqlite3:finalize! (hash-table-values stmts)))
(sqlite3:finalize! db)
#t)
#f))))
@@ -826,16 +826,17 @@
)
;; first pass implementation, just insert all changed rows
(for-each
(lambda (targdb)
- (let* ((db (db:dbdat-get-db targdb))
- (drp-trigger (if (member "last_update" field-names)
- (db:drop-trigger db tablename)
- #f))
- (is-trigger-dropped (if (member "last_update" field-names)
- (db:is-trigger-dropped db tablename) #f))
+ (let* ((db (db:dbdat-get-db targdb))
+ (drp-trigger (if (member "last_update" field-names)
+ (db:drop-trigger db tablename)
+ #f))
+ (is-trigger-dropped (if (member "last_update" field-names)
+ (db:is-trigger-dropped db tablename)
+ #f))
(stmth (sqlite3:prepare db full-ins)))
;; (db:delay-if-busy targdb) ;; NO WAITING
(if (member "last_update" field-names)
(debug:print-info 3 *default-log-port* "is-trigger-dropped: " is-trigger-dropped))
(for-each
@@ -1224,29 +1225,29 @@
FOR EACH ROW
BEGIN
UPDATE runs SET last_update=(strftime('%s','now'))
WHERE id=old.id;
END;" )
- (list "update_run_stats_trigger" "CREATE TRIGGER IF NOT EXISTS update_run_stats_trigger AFTER UPDATE ON run_stats
+ (list "update_run_stats_trigger" "CREATE TRIGGER IF NOT EXISTS update_run_stats_trigger AFTER UPDATE ON run_stats
FOR EACH ROW
BEGIN
UPDATE run_stats SET last_update=(strftime('%s','now'))
WHERE id=old.id;
END;" )
- (list "update_tests_trigger" "CREATE TRIGGER IF NOT EXISTS update_tests_trigger AFTER UPDATE ON tests
+ (list "update_tests_trigger" "CREATE TRIGGER IF NOT EXISTS update_tests_trigger AFTER UPDATE ON tests
FOR EACH ROW
BEGIN
UPDATE tests SET last_update=(strftime('%s','now'))
WHERE id=old.id;
END;" )
- (list "update_teststeps_trigger" "CREATE TRIGGER IF NOT EXISTS update_teststeps_trigger AFTER UPDATE ON test_steps
+ (list "update_teststeps_trigger" "CREATE TRIGGER IF NOT EXISTS update_teststeps_trigger AFTER UPDATE ON test_steps
FOR EACH ROW
BEGIN
UPDATE test_steps SET last_update=(strftime('%s','now'))
WHERE id=old.id;
END;" )
- (list "update_test_data_trigger" "CREATE TRIGGER IF NOT EXISTS update_test_data_trigger AFTER UPDATE ON test_data
+ (list "update_test_data_trigger" "CREATE TRIGGER IF NOT EXISTS update_test_data_trigger AFTER UPDATE ON test_data
FOR EACH ROW
BEGIN
UPDATE test_data SET last_update=(strftime('%s','now'))
WHERE id=old.id;
END;" )))
@@ -1261,40 +1262,43 @@
(for-each (lambda (key)
(sqlite3:execute db (cadr key)))
db:trigger-list))
(define (db:drop-all-triggers dbstruct)
-(db:with-db
+ (db:with-db
dbstruct #f #f
(lambda (db)
-(db:drop-triggers db))))
+ (db:drop-triggers db))))
(define (db:is-trigger-dropped db tbl-name)
- (let* ((trigger-name (if (equal? tbl-name "test_steps")
- "update_teststeps_trigger"
- (conc "update_" tbl-name "_trigger"))))
- (sqlite3:for-each-row
- (lambda (name)
- ;(print name)
- (set! res (vector name)))
- db
- "select name from sqlite_master where type = 'trigger' ;"
- )))
+ (let* ((trigger-name (if (equal? tbl-name "test_steps")
+ "update_teststeps_trigger"
+ (conc "update_" tbl-name "_trigger")))
+ (res #f))
+ (sqlite3:for-each-row
+ (lambda (name)
+ (if (equal? name trigger-name)
+ (set! res #t)))
+ db
+ "SELECT name FROM sqlite_master WHERE type = 'trigger' ;"
+ )))
(define (db:drop-triggers db)
- (for-each (lambda (key)
- (sqlite3:execute db (conc "drop trigger " (car key))))
- db:trigger-list))
+ (for-each
+ (lambda (key)
+ (sqlite3:execute db (conc "drop trigger if exists " (car key))))
+ db:trigger-list))
(define (db:drop-trigger db tbl-name)
- (let* ((trigger-name (if (equal? tbl-name "test_steps")
- "update_teststeps_trigger"
- (conc "update_" tbl-name "_trigger"))))
- (for-each (lambda (key)
- (if (equal? (car key) trigger-name)
- (sqlite3:execute db (conc "drop trigger " trigger-name))))
- db:trigger-list)))
+ (let* ((trigger-name (if (equal? tbl-name "test_steps")
+ "update_teststeps_trigger"
+ (conc "update_" tbl-name "_trigger"))))
+ (for-each
+ (lambda (key)
+ (if (equal? (car key) trigger-name)
+ (sqlite3:execute db (conc "drop trigger if exists " trigger-name))))
+ db:trigger-list)))
(define (db:create-trigger db tbl-name)
(let* ((trigger-name (if (equal? tbl-name "test_steps")
"update_teststeps_trigger"
(conc "update_" tbl-name "_trigger"))))
@@ -1530,11 +1534,11 @@
state TEXT DEFAULT 'new',
status TEXT DEFAULT 'n/a',
archive_type TEXT DEFAULT 'bup',
du INTEGER,
archive_path TEXT);")))
- (print "creating trigges from init")
+ ;; (print "creating trigges from init")
(db:create-triggers db)
db)) ;; )
;;======================================================================
;; A R C H I V E S
@@ -1772,15 +1776,33 @@
#f
)
(with-input-from-file infile read-lines)
)))
-;; select end_time-now from
-;; (select testname,item_path,event_time+run_duration as
-;; end_time,strftime('%s','now') as now from tests where state in
-;; ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));
-
+;; check duration against test-run.dat file if it exists and update the value in
+;; the db if necessary
+;;
+(define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)
+ (let* ((datf (conc run-dir ".mt_data/test-run.dat"))
+ (modt (if (and (file-exists? datf)
+ (file-read-access? datf))
+ (file-modification-time datf)
+ #f)) ;; (+ event-time run-duration))))
+ (alt-run-duration (if modt
+ (- modt event-time)
+ #f)))
+ (if (and alt-run-duration
+ (> alt-run-duration run-duration))
+ (begin
+ (debug:print 0 *default-log-port* "Test " test-id " run duration mismatch. Setting to " alt-run-duration)
+ (db:with-db
+ dbstruct #f #f
+ (lambda (db)
+ (sqlite3:execute db "UPDATE tests SET run_duration=? WHERE id=?;" alt-run-duration test-id)
+ #t)))
+ #f))) ;; #f = we did NOT adjust the time
+
(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
(let* ((incompleted '())
(oldlaunched '())
(toplevels '())
;; The default running-deadtime is 720 seconds = 12 minutes.
@@ -1824,37 +1846,39 @@
;; (db:test-get-run_duration testdat)))
;; 600)
;; (db:delay-if-busy dbdat)
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path event-time run-duration)
- (if (and (equal? uname "n/a")
- (equal? item-path "")) ;; this is a toplevel test
- ;; what to do with toplevel? call rollup?
- (begin
- (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
- (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
- (begin
- (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
- (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
- test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
- " event-time="event-time" run-duration="run-duration))))
+ (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration))
+ (if (and (equal? uname "n/a")
+ (equal? item-path "")) ;; this is a toplevel test
+ ;; what to do with toplevel? call rollup?
+ (begin
+ (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+ (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+ (begin
+ (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
+ (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
+ test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
+ " event-time="event-time" run-duration="run-duration)))))
stmth1
run-id running-deadtime) ;; default time 720 seconds
-
+
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path event-time run-duration)
- (if (and (equal? uname "n/a")
- (equal? item-path "")) ;; this is a toplevel test
- ;; what to do with toplevel? call rollup?
- (begin
- (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
- (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
- (begin
- (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
- " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
- " run-duration="run-duration)
- (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
+ (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration))
+ (if (and (equal? uname "n/a")
+ (equal? item-path "")) ;; this is a toplevel test
+ ;; what to do with toplevel? call rollup?
+ (begin
+ (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+ (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+ (begin
+ (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
+ " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
+ " run-duration="run-duration)
+ (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))))
stmth2
run-id remotehoststart-deadtime) ;; default time 230 seconds
;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
;;
@@ -2180,12 +2204,12 @@
(if newres
newres
res))
res)))
-(define (db:no-sync-close-db db stmtcache)
- (db:safely-close-sqlite3-db db stmtcache))
+(define (db:no-sync-close-db db stmt-cache)
+ (db:safely-close-sqlite3-db db stmt-cache))
;; transaction protected lock aquisition
;; either:
;; fails returns (#f . lock-creation-time)
;; succeeds (returns (#t . lock-creation-time)
@@ -2196,13 +2220,14 @@
(sqlite3:with-transaction
db
(lambda ()
(handle-exceptions
exn
- (let ((lock-time (current-seconds)))
- (sqlite3:execute db "INSERT INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-time)
- `(#t . ,lock-time))
+ (let ((lock-time (current-seconds)))
+ (debug:print-info 2 *default-log-port* "db:no-sync-get-lock keyname=" keyname ", lock-time=" lock-time ", exn=" exn)
+ (sqlite3:execute db "INSERT INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-time)
+ `(#t . ,lock-time))
`(#f . ,(sqlite3:first-result db "SELECT val FROM no_sync_metadat WHERE var=?;" keyname)))))))
;; use a global for some primitive caching, it is just silly to
@@ -2238,11 +2263,12 @@
(n 0))
(if (equal? hed field)
(handle-exceptions
exn
(begin
- (debug:print 0 *default-log-port* "WARNING: attempt to read non-existant field, row=" row " header=" header " field=" field ", exn=" exn)
+ (debug:print 0 *default-log-port* "WARNING: attempt to read non-existant field, row="
+ row " header=" header " field=" field ", exn=" exn)
#f)
(vector-ref row n))
(if (null? tal) #f (loop (car tal)(cdr tal)(+ n 1)))))))
;; Accessors for the header/data structure
@@ -3461,11 +3487,11 @@
(lambda (run-id)
(let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)))
(db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs)))
run-ids)))
-;; Get test data using test_id, run-id is not used
+;; Get test data using test_id, run-id is not used - but it will be!
;;
(define (db:get-test-info-by-id dbstruct run-id test-id)
(db:with-db
dbstruct
#f ;; run-id
@@ -4479,11 +4505,11 @@
(db (db:dbdat-get-db dbdat)) ;; we'll return this so (db:delay--if-busy can be called inline
(dbfj (conc dbpath "-journal")))
(if (handle-exceptions
exn
(begin
- (debug:print-info 0 *default-log-port* "WARNING: failed to test for existance of " dbfj)
+ (debug:print-info 0 *default-log-port* "WARNING: failed to test for existance of " dbfj ", exn=" exn)
(thread-sleep! 1)
(db:delay-if-busy count (- count 1)))
(common:file-exists? dbfj))
(case count
((6)
Index: http-transport.scm
==================================================================
--- http-transport.scm
+++ http-transport.scm
@@ -566,17 +566,17 @@
(debug:print 0 *default-log-port* msg)
(if (common:file-exists? full-serv-fname)
(system (conc "sleep 1;mv -f " full-serv-fname " " new-serv-fname))
(debug:print 0 *default-log-port* "INFO: cannot move " full-serv-fname " to " new-serv-fname))
(exit)))))
- (if (and (not start-time-old) ;; last server start try was less than five seconds ago
+ #;(if (and (not start-time-old) ;; last server start try was less than five seconds ago
(not server-starting))
(begin
(cleanup-proc "NOT starting server, there is either a recently started server or a server in process of starting")
(exit)))
;; lets not even bother to start if there are already three or more server files ready to go
- (let* ((num-alive (server:get-num-alive (server:get-list *toppath*))))
+ #;(let* ((num-alive (server:get-num-alive (server:get-list *toppath*))))
(if (> num-alive 3)
(begin
(cleanup-proc (conc "ERROR: Aborting server start because there are already " num-alive " possible servers either running or starting up"))
(exit))))
(common:save-pkt `((action . start)
Index: launch.scm
==================================================================
--- launch.scm
+++ launch.scm
@@ -205,17 +205,19 @@
(current-seconds)
start-seconds)))))
(kill-tries 0))
;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area)
;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area)
- (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10)
+ (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t)
(let loop ((minutes (calc-minutes))
(cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
(disk-free (get-df (current-directory)))
(last-sync (current-seconds)))
- (common:telemetry-log "zombie" (conc "launch:monitor-job - top of loop encountered at "(current-seconds)" with last-sync="last-sync))
+ ;; (common:telemetry-log "zombie" (conc "launch:monitor-job -
+ ;; top of loop encountered at "(current-seconds)" with
+ ;; last-sync="last-sync))
(let* ((over-time (> (current-seconds) (+ last-sync update-period)))
(new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f)))
(delta (abs (- load cpu-load))))
(if (> delta 0.1) ;; don't bother updating with small changes
load
@@ -233,33 +235,28 @@
(test-info (rmt:get-test-info-by-id run-id test-id))
(state (db:test-get-state test-info))
(status (db:test-get-status test-info))
(kill-reason "no kill reason specified")
(kill-job? #f))
- (common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
+ #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period))
(cond
((test-get-kill-request run-id test-id)
(set! kill-reason "KILLING TEST since received kill request (KILLREQ)")
(set! kill-job? #t))
((and runtlim (> (- (current-seconds) start-seconds) runtlim))
(set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim))
(set! kill-job? #t))
((equal? status "DEAD")
- (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
+ (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t)
(rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
(set! kill-job? #f)))
(debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
(launch:handle-zombie-tests run-id)
- (when do-sync
- ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
- ;; (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
- (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync started at "(current-seconds)))
- (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
- (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
-
+ (if do-sync ;; save meta data about the running of this test
+ (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))
(if kill-job?
(begin
(debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
(mutex-lock! m)
;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
@@ -312,11 +309,11 @@
(if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta
(loop (calc-minutes)
(or new-cpu-load cpu-load)
(or new-disk-free disk-free)
(if do-sync (current-seconds) last-sync)))))))
- (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f))) ;; NOTE: Checking twice for keep-going is intentional
+ (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional
(define (launch:execute encoded-cmd)
(let* ((cmdinfo (common:read-encoded-string encoded-cmd))
(tconfigreg #f))
@@ -1304,12 +1301,12 @@
(debug:print 2 *default-log-port* " - creating run area in " test-path)
(handle-exceptions
exn
(begin
(debug:print-error 0 *default-log-port* " Failed to create directory " test-path ((condition-property-accessor 'exn 'message) exn)
- ", exiting, exn=" exn)
- (exit 1))
+ ", continuing (might cause downstream issues?), exn=" exn)
+ #f)
(create-directory test-path #t))
(debug:print 2 *default-log-port*
" - creating link from: " test-path "\n"
" to: " lnktarget)
Index: rmt.scm
==================================================================
--- rmt.scm
+++ rmt.scm
@@ -22,10 +22,12 @@
(declare (unit rmt))
(declare (uses api))
(declare (uses http-transport))
(include "common_records.scm")
+(include "db_records.scm")
+
;; (declare (uses rmtmod))
;; (import rmtmod)
;;
@@ -525,15 +527,24 @@
(rmt:general-call 'register-test run-id run-id test-name item-path))
(define (rmt:get-test-id run-id testname item-path)
(rmt:send-receive 'get-test-id run-id (list run-id testname item-path)))
-;; run-id is NOT used
+;; run-id is NOT used - but it will be!
;;
(define (rmt:get-test-info-by-id run-id test-id)
(if (number? test-id)
- (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))
+ (let* ((testdat (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id)))
+ (trundatf (conc (db:test-get-rundir testdat) "/.mt_data/test-run.dat")))
+ ;; now we can update a couple fields from the filesystem
+ (if (and (db:test-get-rundir testdat)
+ (file-exists? trundatf))
+ (let* ((duration (db:test-get-run_duration testdat))
+ (event-time (db:test-get-event_time testdat))
+ (last-touch (file-modification-time trundatf)))
+ (db:test-set-run_duration! testdat (max duration (- last-touch event-time)))))
+ testdat)
(begin
(debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id)
(print-call-chain (current-error-port))
#f)))
Index: runs.scm
==================================================================
--- runs.scm
+++ runs.scm
@@ -16,11 +16,12 @@
;; along with Megatest. If not, see .
;; strftime('%m/%d/%Y %H:%M:%S','now','localtime')
(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18)
- posix-extras directory-utils pathname-expand typed-records format sxml-serializer sxml-modifications)
+ posix-extras directory-utils pathname-expand typed-records format sxml-serializer
+ sxml-modifications matchable)
(declare (unit runs))
(declare (uses db))
(declare (uses common))
(declare (uses items))
@@ -45,14 +46,19 @@
(defstruct runs:dat
reglen regfull
runname max-concurrent-jobs run-id
test-patts required-tests test-registry
registry-mutex flags keyvals run-info all-tests-registry
- can-run-more-tests
+ ;; stores results from last runs:can-run-more-tests
+ (can-run-more-tests #f) ;; (list can-run-more-flag num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit)
((can-run-more-tests-count 0) : fixnum)
(last-fuel-check 0) ;; time when we last checked fuel
(beginning-of-time (current-seconds))
+ (load-mgmt-function #f)
+ (wait-for-jobs-function #f)
+ (last-load-check-time 0)
+ (last-jobs-check-time 0)
)
(defstruct runs:testdat
hed tal reg reruns test-record
test-name item-path jobgroup
@@ -89,11 +95,11 @@
lock-files)))
(if fresh-locks
(begin
(if (runs:lownoise "runners-softlock-wait" 360)
(debug:print-info 0 *default-log-port* "Other runners in flight, giving up some time..."))
- (thread-sleep! 10))
+ (thread-sleep! 2))
(begin
(if (runs:lownoise "runners-softlock-nowait" 360)
(debug:print-info 0 *default-log-port* "No runners in flight, updating softlock"))
(let* ((ouf (open-output-file my-lock-file)))
(with-output-to-port ouf
@@ -313,25 +319,12 @@
;; Take advantage of a good place to exit if running the one-pass methodology
(if (and (> (runs:dat-can-run-more-tests-count runsdat) 20)
(args:get-arg "-one-pass"))
(exit 0))
- (thread-sleep! (cond ;; BB: check with Matt. Should this sleep move
- ;; to cond clauses below where we determine we
- ;; have too many jobs running rather than each
- ;; time the and condition above is true (which
- ;; seems like always)?
- ((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way
- 0)
- ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time
- (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ..."))
- 10) ;; obviously haven't had any work to do for a while
- (else 0)))
-;; ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero?
-;; (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)
-;; )))
-
+ (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+
(let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no
(num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
(job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup)))
(if (string? jobg-count)
(string->number jobg-count)
@@ -539,11 +532,11 @@
;; register this run in monitor.db
(rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params)
(rmt:tasks-set-state-given-param-key task-key "running")
- (common:telemetry-log "run-tests"
+ #;(common:telemetry-log "run-tests"
payload:
`( (target . ,target)
(run-name . ,runname)
(test-patts . ,test-patts) ) )
@@ -991,13 +984,17 @@
(null? non-completed))
(debug:print-info 4 *default-log-port* "cond branch - " "ei-4")
(if (runs:can-keep-running? hed 20)
(begin
(runs:inc-cant-run-tests hed)
- (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;;
+ (debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0) ", going to wait 60 sec.") ;;
;; getting here likely means the system is way overloaded, kill a full minute before continuing
- (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
+ ;; (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) CHECKTHIS!!!
+ ;; No runsdat, can't do this yet
+ ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+ ;;
+ (thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
;; num-retries code was here
;; we use this opportunity to move contents of reg to tal
(list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
(begin
(debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
@@ -1123,11 +1120,29 @@
(conc " WARNING: t is not a vector=" t )))
prereqs-not-met)
", ") ") fails: " fails
"\nregistered? " (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f))
-
+ ;; well, first lets see if cpu load throttling is enabled. If so wait around until the
+ ;; average cpu load is under the threshold before continuing
+ ;;
+ (if (runs:dat-load-mgmt-function runsdat)
+ ((runs:dat-load-mgmt-function runsdat))
+ (runs:dat-load-mgmt-function-set!
+ runsdat
+ (lambda ()
+ ;; jobtools maxload is useful for where the full Megatest run is done on one machine
+ (if (and (not (common:on-homehost?))
+ maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
+ (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
+
+ ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
+ (if maxhomehostload
+ (common:wait-for-homehost-load maxhomehostload
+ (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))))))
+
+
(if (and (not (null? prereqs-not-met))
(runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60))
(debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", ")))
@@ -1212,29 +1227,17 @@
;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path))
;; we are going to reset all the counters for test retries by setting a new hash table
;; this means they will increment only when nothing can be run
(set! *max-tries-hash* (make-hash-table))
- ;; well, first lets see if cpu load throttling is enabled. If so wait around until the
- ;; average cpu load is under the threshold before continuing
-
- ;; jobtools maxload is useful for where the full Megatest run is done on one machine
- (if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
- (common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
-
- ;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
- (if maxhomehostload
- (common:wait-for-homehost-load maxhomehostload
- (conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))
-
- (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry)
+ (run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat testdat)
(runs:incremental-print-results run-id)
(hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running)
(runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?)
;; (thread-sleep! *global-delta*)
(if (or (not (null? tal))(not (null? reg)))
- (runs:loop-values tal reg reglen regfull reruns)
+ (runs:loop-values tal reg reglen regfull reruns) ;; hed should be dropped at this time
#f))
;; must be we have unmet prerequisites
;;
(else
@@ -1251,19 +1254,18 @@
(member 'toplevel testmode))
(begin
;; couldn't run, take a breather
(if (runs:lownoise "Waiting for more work to do..." 60)
(debug:print-info 0 *default-log-port* "Waiting for more work to do..."))
- (thread-sleep! 1)
+
+ ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+ (thread-sleep! 5)
(list (car newtal)(cdr newtal) reg reruns))
;; the waiton is FAIL so no point in trying to run hed ever again
(begin
(let ((my-test-id (rmt:get-test-id run-id test-name item-path)))
- (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2"))
-
-
-
+ (mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2"))
(if (or (not (null? reg))(not (null? tal)))
(if (vector? hed)
(begin
(debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path
" from the launch list as it has prerequistes that are FAIL")
@@ -1581,19 +1583,10 @@
itemmaps: itemmaps
;; prereqs-not-met: prereqs-not-met
)))
(runs:dat-regfull-set! runsdat regfull)
- ;; -- removed BB 17ww28 - no longer needed.
- ;; every 15 minutes verify the server is there for this run
- ;; (if (and (common:low-noise-print 240 "try start server" run-id)
- ;; (not (or (and *runremote*
- ;; (remote-server-url *runremote*)
- ;; (server:ping (remote-server-url *runremote*)))
- ;; (server:check-if-running *toppath*))))
- ;; (server:kind-run *toppath*))
-
(if (> num-running 0)
(set! last-time-some-running (current-seconds)))
(if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))
(hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
@@ -1636,11 +1629,11 @@
"\n regfull: " regfull
"\n reglen: " reglen
"\n length reg: " (length reg)
)
- (runs:parallel-runners-mgmt runsdat)
+ ;; (runs:parallel-runners-mgmt runsdat)
;; check for hed in waitons => this would be circular, remove it and issue an
;; error
(if (member test-name waitons)
(begin
@@ -1672,19 +1665,67 @@
(debug:print-info 4 *default-log-port* "cond branch - " "rtq-2")
(debug:print-info 4 *default-log-port* "OUTER COND: (not items)")
(if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests))
(not (null? tal)))
(loop (car tal)(cdr tal) reg reruns))
+ ;;(runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
+
+ ;; gonna try a strategy change here.
+ ;;
+ ;; check if can run more tests. if yes, continue, if no, rest until can run more
+ ;; look at the test jobgroup and tot jobs running
+ ;;
+ ;; NOTE: This does NOT actually gate here, only captures the proc to be called later
+ ;;
+ (if (not (runs:dat-wait-for-jobs-function runsdat))
+ (runs:dat-wait-for-jobs-function-set!
+ runsdat
+ (lambda (testdat-in)
+ (let* ((jobgroup (runs:testdat-jobgroup testdat-in))
+ (can-run-more-tests (runs:dat-can-run-more-tests runsdat))
+ (last-jobs-check-time (runs:dat-last-jobs-check-time runsdat))
+ (should-check-jobs (match can-run-more-tests
+ ((can-run-more-flag num-running nr-in-jobgroup max-concurrent-jobs . params)
+ (if (< (- max-concurrent-jobs num-running) 25)
+ (begin
+ (debug:print-info 0 *default-log-port*
+ "less than 20 jobs headroom, ("max-concurrent-jobs
+ "-"num-running")>20. Forcing prelaunch check.")
+ #t)
+ #f))
+ (else #f)))) ;; no record yet
+ (if should-check-jobs
+ (let loop-can-run-more
+ ((res (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))
+ (remtries 1440)) ;; we can wait for up to two hours for jobs to get done
+ (match res
+ ((run-more num-running . rem)
+ (if (or run-more
+ (< remtries 1))
+ (begin
+ (if (runs:lownoise "num-running" 30)
+ (debug:print-info 0 *default-log-port* "Have "num-running" tests of max " max-concurrent-jobs))
+ (runs:dat-can-run-more-tests-set! runsdat res)) ;; capture the result and then drop through
+ (begin
+ (if (runs:lownoise "num-running" 10)
+ (debug:print-info 0 *default-log-port* "Can't run more tests, have "num-running" tests of "
+ max-concurrent-jobs " allowed."))
+ (thread-sleep! 5) ;; if we've hit max concurrent jobs take a breather, nb// make this configurable
+
+ ;; wait for load here
+ (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+ (loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)
+ (- remtries 1)))))))
+ )))))
+
+ ;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed
(runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
- ;; This would be a good place to block on homehost load
-
+ ;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed
+ (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))
-
-
- (runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
- (let ((loop-list (runs:process-expanded-tests runsdat testdat)))
+ (let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running
(if loop-list (apply loop loop-list))))
;; items processed into a list but not came in as a list been processed
;;
((and (list? items) ;; thus we know our items are already calculated
@@ -1743,19 +1784,20 @@
;; if items is a proc then need to run items:get-items-from-config, get the list and loop
;; - but only do that if resources exist to kick off the job
;; EXPAND ITEMS
((or (procedure? items)(eq? items 'have-procedure))
(debug:print-info 4 *default-log-port* "cond branch - " "rtq-4")
- (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)))
+ (let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)))
(if (and (list? can-run-more)
- (car can-run-more))
- (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) ;; itemized test expanded here
+ (car can-run-more)) ;; itemized test expanded here
+ (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup
+ max-concurrent-jobs run-id waitons item-path
+ testmode test-record can-run-more items runname
+ tconfig reglen test-registry test-records itemmaps)))
(if loop-list
(apply loop loop-list)
- (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)
- )
- )
+ (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)))
;; if can't run more just loop with next possible test
(loop (car newtal)(cdr newtal) reg reruns))))
;; this case should not happen, added to help catch any bugs
((and (list? items) itemdat)
@@ -1788,11 +1830,13 @@
;; this is the point where everything is launched and now you can mark the run in metadata table as all launched
(rmt:set-var (conc "lunch-complete-" run-id) "yes")
;; now *if* -run-wait we wait for all tests to be done
;; Now wait for any RUNNING tests to complete (if in run-wait mode)
+ ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
(thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle
+
(let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes
(prev-num-running 0))
;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running)
(if (and (or (args:get-arg "-run-wait")
(equal? (configf:lookup *configdat* "setup" "run-wait") "yes"))
@@ -1807,11 +1851,12 @@
(set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!
(rmt:find-and-mark-incomplete run-id #f)
(debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running
" tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at "
(time->string (seconds->local-time (current-seconds))))))
- (thread-sleep! 5)
+ ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+ (thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1))
(wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes
num-running))))
;; LET* ((test-record
;; we get here on "drop through". All done!
;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed.
@@ -1866,11 +1911,11 @@
(conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t))))
lst))
;; parent-test is there as a placeholder for when parent-tests can be run as a setup step
;;
-(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry)
+(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry runsdat testdat-rec)
;; All these vars might be referenced by the testconfig file reader
;;
;; NEED to reprocess testconfig here, ensuring that item variables are available.
;; This is for Tal's issue with item-specific env vars not being set for use in skip.
;; HSD https://hsdes.intel.com/appstore/icf/index.html#/article?articleId=1408763273
@@ -1944,12 +1989,13 @@
(set! test-id (rmt:get-test-id run-id test-name item-path))))
(debug:print-info 4 *default-log-port* "test-id=" test-id ", run-id=" run-id ", test-name=" test-name ", item-path=\"" item-path "\"")
(set! testdat (rmt:get-test-info-by-id run-id test-id))
(if (not testdat)
(begin
- (debug:print-info 0 *default-log-port* "WARNING: server is overloaded, trying again in one second")
- (thread-sleep! 1)
+ (debug:print-info 0 *default-log-port* "WARNING: server is overloaded, trying again in two seconds")
+ ;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
+ (thread-sleep! 2)
(loop)))))
(if (not testdat) ;; should NOT happen
(debug:print-error 0 *default-log-port* "failed to get test record for test-id " test-id))
(set! test-id (db:test-get-id testdat))
(if (common:file-exists? test-path)
@@ -2056,15 +2102,23 @@
(mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test)
(debug:print-info 1 *default-log-port* "SKIPPING Test " full-test-name " due to " skip-test))
;;
;; Here the test is handed off to launch.scm for launch-test to complete the launch process
;;
- (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags))
- (begin
- (print "ERROR: Failed to launch the test. Exiting as soon as possible")
- (set! *globalexitstatus* 1) ;;
- (process-signal (current-process-id) signal/kill))))))))
+ (begin
+ ;; wait for less than max jobs here
+ (if (runs:dat-wait-for-jobs-function runsdat)
+ ((runs:dat-wait-for-jobs-function runsdat) testdat-rec))
+
+ (if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags))
+ (begin
+ (print "ERROR: Failed to launch the test. Exiting as soon as possible")
+ (set! *globalexitstatus* 1) ;;
+ (process-signal (current-process-id) signal/kill))
+ )
+ ;; wait again here?
+ ))))))
((KILLED)
(debug:print 1 *default-log-port* "NOTE: " full-test-name " is already running or was explictly killed, use -force to launch it.")
(hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)) ;; KILLED))
((LAUNCHED REMOTEHOSTSTART RUNNING)
(debug:print 2 *default-log-port* "NOTE: " test-name " is already running"))
@@ -2535,11 +2589,11 @@
(if (not (null? tal))
(loop (car tal)(cdr tal))))
((run-wait)
;; BB TODO - manage has-subrun case
(debug:print-info 2 *default-log-port* "still waiting, " (length tests) " tests still running")
- (thread-sleep! 10)
+ (thread-sleep! 5)
(let ((new-tests (proc-get-tests run-id)))
(if (null? new-tests)
(debug:print-info 1 *default-log-port* "Run completed according to zero tests matching provided criteria.")
(loop (car new-tests)(cdr new-tests)))))
((archive)
Index: server.scm
==================================================================
--- server.scm
+++ server.scm
@@ -202,13 +202,13 @@
(if (if (directory-exists? (conc areapath "/logs"))
'()
(if (file-write-access? areapath)
(begin
(condition-case
- (create-directory (conc areapath "/logs") #t)
- (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
- (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list.")))
+ (create-directory (conc areapath "/logs") #t)
+ (exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
+ (exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn)))
(directory-exists? (conc areapath "/logs")))
'()))
(let* ((server-logs (glob (conc areapath "/logs/server-*.log")))
(num-serv-logs (length server-logs)))
(if (null? server-logs)
@@ -215,15 +215,15 @@
'()
(let loop ((hed (car server-logs))
(tal (cdr server-logs))
(res '()))
(let* ((mod-time (handle-exceptions
- exn
- (begin
- (print "failed to get modification time on " hed ", exn=" exn)
- (current-seconds)) ;; 0
- (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
+ exn
+ (begin
+ (print "failed to get modification time on " hed ", exn=" exn)
+ (current-seconds)) ;; 0
+ (file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
(down-time (- (current-seconds) mod-time))
(serv-dat (if (or (< num-serv-logs 10)
(< down-time 900)) ;; day-seconds))
(server:logf-get-start-info hed)
'())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at
@@ -231,16 +231,16 @@
(fmatch (string-match fname-rx hed))
(pid (if fmatch (string->number (list-ref fmatch 2)) #f))
(new-res (if (null? serv-dat)
res
(cons (append serv-rec (list pid)) res))))
- (if (null? tal)
- (if (and limit
- (> (length new-res) limit))
- new-res ;; (take new-res limit) <= need intelligent sorting before this will work
- new-res)
- (loop (car tal)(cdr tal) new-res)))))))))
+ (if (null? tal)
+ (if (and limit
+ (> (length new-res) limit))
+ new-res ;; (take new-res limit) <= need intelligent sorting before this will work
+ new-res)
+ (loop (car tal)(cdr tal) new-res)))))))))
(define (server:get-num-alive srvlst)
(let ((num-alive 0))
(for-each
(lambda (server)
@@ -274,14 +274,15 @@
(mod-time (list-ref rec 0)))
;; (print "start-time: " start-time " mod-time: " mod-time)
(and start-time mod-time
(> (- now start-time) 0) ;; been running at least 0 seconds
(< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds
- (< (- now start-time)
- (+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600"))
- 180)
- (random 360))) ;; under one hour running time +/- 180
+ (or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set
+ (< (- now start-time)
+ (+ (- (string->number (configf:lookup *configdat* "server" "runtime"))
+ 180)
+ (random 360)))) ;; under one hour running time +/- 180
))
#f))
srvlst)
(lambda (a b)
(< (list-ref a 3)
@@ -321,25 +322,39 @@
*my-client-signature*)))
;; wait for server=start-last to be three seconds old
;;
(define (server:wait-for-server-start-last-flag areapath)
- (let* ((start-flag (conc areapath "/logs/server-start-last")))
+ (let* ((start-flag (conc areapath "/logs/server-start-last"))
+ ;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
+ (reftime (configf:lookup-number *configdat* "server" "idletime" default: 4))
+ (server-key (conc (get-host-name) "-" (current-process-id))))
(if (file-exists? start-flag)
(let* ((fmodtime (file-modification-time start-flag))
- (reftime (+ 3 (random 5)))
- (delta (- (current-seconds) fmodtime)))
- (if (> delta reftime) ;; good enough
- (begin
- (debug:print-info 0 *default-log-port* "Ready to start server, last start: "
- fmodtime ", delta: " delta ", reftime: " reftime)
- (system (conc "touch " start-flag))) ;; lazy but safe
- (begin
- (thread-sleep! 5)
- (server:wait-for-server-start-last-flag areapath))))
- (system (conc "touch " start-flag)))))
-
+ (delta (- (current-seconds) fmodtime))
+ (all-go (> delta reftime)))
+ (if (handle-exceptions
+ exn
+ (begin
+ (debug:print-info 0 *default-log-port* "Something not right with soft server locking: exn=" exn)
+ #t)
+ (and all-go
+ (begin
+ (with-output-to-file start-flag
+ (lambda ()
+ (print server-key)))
+ (thread-sleep! 0.25)
+ (let ((res (with-input-from-file start-flag
+ (lambda ()
+ (read-line)))))
+ (equal? server-key res)))))
+ #t ;; (system (conc "touch " start-flag)) ;; lazy but safe
+ (begin
+ (debug:print-info 0 *default-log-port* "Gating server start, last start: "
+ fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
+ (thread-sleep! reftime)
+ (server:wait-for-server-start-last-flag areapath)))))))
;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
;;
(define (server:kind-run areapath)
@@ -356,12 +371,13 @@
((2) 300)
(else 600))
(random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
(lock-file (conc areapath "/logs/server-start.lock")))
(if (> (- (current-seconds) when-run) run-delay)
- (begin
+ (let* ((start-flag (conc areapath "/logs/server-start-last")))
(common:simple-file-lock-and-wait lock-file expire-time: 15)
+ (system (conc "touch " start-flag)) ;; lazy but safe
(server:run areapath)
(thread-sleep! 2) ;; don't release the lock for at least a few seconds
(common:simple-file-release-lock lock-file)))
(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))
@@ -574,18 +590,26 @@
(if (not (configf:lookup *configdat* "server" "disable-db-snapshot"))
(common:snapshot-file mtdbfile subdir: ".db-snapshot"))
(delete-file* staging-file)
(let* ((start-time (current-milliseconds))
(res (system sync-cmd))
+ (dbbackupfile (conc mtdbfile ".backup"))
(res2
(cond
- ((eq? 0 res)
- (delete-file* (conc mtdbfile ".backup"))
+ ((eq? 0 res )
+ (handle-exceptions
+ exn
+ (begin
+ (debug:print-info 0 *default-log-port* "Problem making backup db file, exn=" exn)
+ #t)
+ (if (file-exists? dbbackupfile)
+ (delete-file* dbbackupfile)
+ )
(if (eq? 0 (file-size sync-log))
(delete-file sync-log))
(system (conc "/bin/mv " staging-file " " mtdbfile))
-
+ )
(set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000))
(set! off-time (calculate-off-time
last-sync-seconds
(cond
((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1))
Index: tests.scm
==================================================================
--- tests.scm
+++ tests.scm
@@ -1944,28 +1944,52 @@
tdb
"SELECT count(id) FROM test_rundat;")
res))
0)
-(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)
- (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))
- (if (and cpuload diskfree)
- (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
- (if minutes
- (rmt:general-call 'update-run-duration run-id minutes test-id))
- (if (and uname hostname)
- (rmt:general-call 'update-uname-host run-id uname hostname test-id)))
+;;
+(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname #!key (update-db #f)(tmpfree #f))
+ (if (get-environment-variable "MT_TEST_RUN_DIR")
+ (let* ((dest-dir (conc (get-environment-variable "MT_TEST_RUN_DIR") "/.mt_data"))
+ (or-dash (lambda (instr)
+ (cond
+ ((not instr) "") ;; #f -> blank, indicates value unchanged since last measurement taken
+ ((string? instr)(if (string-search " " instr) (conc "\"" instr "\"") instr))
+ (else instr))))
+ (file-new (not (directory-exists? dest-dir))))
+ (if file-new (create-directory dest-dir #t))
+ (let* ((outp (open-output-file (conc dest-dir "/test-run.dat") #:append)))
+ (with-output-to-port outp
+ (lambda ()
+ (if file-new
+ (print "epoch_time,run_id,test_id,cpuload,diskfree,tmpfree,run_minutes,hostname,uname"))
+ (print (current-seconds) "," (or-dash run-id) "," (or-dash test-id) ","
+ (or-dash cpuload) "," (or-dash diskfree) "," (or-dash tmpfree) ","
+ (or-dash minutes) "," (or-dash hostname) ","
+ (or-dash uname)))) ;; put uname last as it has spaces in it
+ (close-output-port outp)))
+ (begin
+ (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))))
+ (if update-db
+ (begin
+ (if (and cpuload diskfree)
+ (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
+ (if minutes
+ (rmt:general-call 'update-run-duration run-id minutes test-id))
+ (if (and uname hostname)
+ (rmt:general-call 'update-uname-host run-id uname hostname test-id)))))
;; This one is for running with no db access (i.e. via rmt: internally)
-(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries)
+(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries #!key (update-db #f))
;; (define (tests:set-full-meta-info test-id run-id minutes work-area)
;; (let ((remtries 10))
(let* ((cpuload (get-cpu-load))
(diskfree (get-df (current-directory)))
+ (tmpfree (get-df "/tmp"))
(uname (get-uname "-srvpio"))
(hostname (get-host-name)))
- (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)))
+ (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname update-db: update-db tmpfree: tmpfree)))
;; (define (tests:set-partial-meta-info test-id run-id minutes work-area)
#;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries)
(let* ((cpuload (get-cpu-load))
(diskfree (get-df (current-directory)))