Index: api.scm
==================================================================
--- api.scm
+++ api.scm
@@ -157,10 +157,15 @@
(params (vector-ref dat 1))
(start-t (current-milliseconds))
(readonly-mode (dbr:dbstruct-read-only dbstruct))
(readonly-command (member cmd api:read-only-queries))
(writecmd-in-readonly-mode (and readonly-mode (not readonly-command)))
+ (foo (begin
+ (common:telemetry-log (conc "api-in:"(->string cmd))
+ payload: `((params . ,params)))
+
+ #t))
(res
(if writecmd-in-readonly-mode
(conc "attempt to run write command "cmd" on a read-only database")
(case cmd
;;===============================================
@@ -327,19 +332,28 @@
;; TASKS
((find-task-queue-records) (apply tasks:find-task-queue-records dbstruct params))
(else
(debug:print 0 *default-log-port* "ERROR: bad api call " cmd)
(conc "ERROR: BAD api call " cmd))))))
+
;; save all stats
(let ((delta-t (- (current-milliseconds)
start-t)))
(hash-table-set! *db-api-call-time* cmd
(cons delta-t (hash-table-ref/default *db-api-call-time* cmd '()))))
(if writecmd-in-readonly-mode
- (vector #f res)
- (vector #t res)))))))
+ (begin
+ (common:telemetry-log (conc "api-out:"(->string cmd))
+ payload: `((params . ,params)
+ (ok-res . #t)))
+ (vector #f res))
+ (begin
+ (common:telemetry-log (conc "api-out:"(->string cmd))
+ payload: `((params . ,params)
+ (ok-res . #f)))
+ (vector #t res))))))))
;; http-server send-response
;; api:process-request
;; db:*
;;
Index: common.scm
==================================================================
--- common.scm
+++ common.scm
@@ -17,11 +17,11 @@
;; along with Megatest. If not, see .
;;======================================================================
(use srfi-1 data-structures posix regex-case (prefix base64 base64:)
- format dot-locking csv-xml z3 ;; sql-de-lite
+ format dot-locking csv-xml z3 udp ;; sql-de-lite
hostinfo md5 message-digest typed-records directory-utils stack
matchable regex posix (srfi 18) extras ;; tcp
(prefix nanomsg nmsg:)
(prefix sqlite3 sqlite3:)
pkts (prefix dbi dbi:)
@@ -81,10 +81,11 @@
(length (glob (conc "/proc/" pid "/fd/*")))
(length (filter identity (map socket? (glob (conc "/proc/" pid "/fd/*")))))
)
)
+
;; GLOBALS
;; CONTEXTS
(defstruct cxt
@@ -887,10 +888,11 @@
(debug:print-info 13 *default-log-port* "watchdog done."))
(debug:print-info 13 *default-log-port* "no need for watchdog on non-homehost"))))
(define (std-exit-procedure)
+ ;;(common:telemetry-log-close)
(on-exit (lambda () 0))
;;(debug:print-info 13 *default-log-port* "std-exit-procedure called; *time-to-exit*="*time-to-exit*)
(let ((no-hurry (if *time-to-exit* ;; hurry up
#f
(begin
@@ -3049,5 +3051,69 @@
exn
#t ;; just ignore it, it might have died in the meantime so joining it will throw an exception
(thread-join! thread))
)))
(hash-table-keys *common:thread-punchlist*)))
+
+(define *common:telemetry-log-state* 'startup)
+(define *common:telemetry-log-socket* #f)
+
+(define (common:telemetry-log-open)
+ (if (eq? *common:telemetry-log-state* 'startup)
+ (let* ((serverhost (configf:lookup *configdat* "telemetry" "host"))
+ (serverport (configf:lookup-number *configdat* "telemetry" "port"))
+ (user (or (get-environment-variable "USER") "unknown"))
+ (host (or (get-environment-variable "HOST") "unknown")))
+ (set! *common:telemetry-log-state*
+ (handle-exceptions
+ exn
+ (begin
+ (debug:print-info 0 *default-log-port* "common-telemetry-log open udp port failure")
+ 'broken)
+ (if (and serverhost serverport user host)
+ (let* ((s (udp-open-socket)))
+ ;;(udp-bind! s #f 0)
+ (udp-connect! s serverhost serverport)
+ (set! *common:telemetry-log-socket* s)
+ 'open)
+ 'not-needed))))))
+
+(define (common:telemetry-log event #!key (payload '()))
+ (if (eq? *common:telemetry-log-state* 'startup)
+ (common:telemetry-log-open))
+
+ (if (eq? 'open *common:telemetry-log-state*)
+ (handle-exceptions
+ exn
+ (begin
+ (debug:print-info 0 *default-log-port* "common-telemetry-log comms failure ; disabled (no server?)")
+ ;;(define *common:telemetry-log-state* 'broken-or-no-server-preclose)
+ ;;(common:telemetry-log-close)
+ (define *common:telemetry-log-state* 'broken-or-no-server)
+ (set! *common:telemetry-log-socket* #f)
+ )
+ (if (and *common:telemetry-log-socket* event)
+ (let* ((user (or (get-environment-variable "USER") "unknown"))
+ (host (or (get-environment-variable "HOST") "unknown"))
+ (start (conc "[megatest "event"]"))
+ (toppath (or *toppath* "/dev/null"))
+ (payload-serialized
+ (base64:base64-encode
+ (z3:encode-buffer
+ (with-output-to-string (lambda () (pp payload))))))
+ (msg (conc user":"host":"start":"(current-process-id)":"(car (argv))":"
+ toppath":"payload-serialized)))
+ (udp-send *common:telemetry-log-socket* msg))))))
+
+(define (common:telemetry-log-close)
+ (when (or (member *common:telemetry-log-state* '(broken-or-no-server-preclose open)) *common:telemetry-log-socket*)
+ (handle-exceptions
+ exn
+ (begin
+ (define *common:telemetry-log-state* 'closed-fail)
+ (debug:print-info 0 *default-log-port* "common-telemetry-log closure failure")
+ )
+ (begin
+ (define *common:telemetry-log-state* 'closed)
+ (udp-close-socket *common:telemetry-log-socket*)
+ (set! *common:telemetry-log-socket* #f)))))
+
Index: configf.scm
==================================================================
--- configf.scm
+++ configf.scm
@@ -774,14 +774,14 @@
ht))
;; if
(define (configf:read-alist fname)
(handle-exceptions
- exn
- #f
- (configf:alist->config
- (with-input-from-file fname read))))
+ exn
+ #f
+ (configf:alist->config
+ (with-input-from-file fname read))))
(define (configf:write-alist cdat fname)
(if (not (common:faux-lock fname))
(debug:print 0 *default-log-port* "INFO: Could not get lock on " fname))
(let* ((dat (configf:config->alist cdat))
Index: db.scm
==================================================================
--- db.scm
+++ db.scm
@@ -406,10 +406,13 @@
;; Open the classic megatest.db file (defaults to open in toppath)
;;
;; NOTE: returns a dbdat not a dbstruct!
;;
+
+;;(define (db:reopen-megatest-db
+
(define (db:open-megatest-db #!key (path #f)(name #f))
(let* ((dbdir (or path *toppath*))
(dbpath (conc dbdir "/" (or name "megatest.db")))
(dbexists (common:file-exists? dbpath))
(db (db:lock-create-open dbpath
@@ -1642,39 +1645,61 @@
(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
(let* ((incompleted '())
(oldlaunched '())
(toplevels '())
- (deadtime-str (configf:lookup *configdat* "setup" "deadtime")) ;; FIXME suspect test run time & deadtime are not well matched; resulting in COMPLETED/DEAD status of an a-ok running test
- (deadtime (if (and deadtime-str
- (string->number deadtime-str))
- (string->number deadtime-str)
- 7200))) ;; two hours
+ (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime")))
+ (server-start-allowance 200)
+ (server-overloaded-budget 200)
+ (launch-monitor-off-time (or (configf:lookup-number *configdat* "setup" "test-stats-update-period") 30))
+ (launch-monitor-on-time-budget 30)
+ (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget))
+ (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30))
+ (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default))
+ (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))
+ (running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period)
+ )
(db:with-db
dbstruct #f #f
(lambda (db)
- (if (number? ovr-deadtime)(set! deadtime ovr-deadtime))
-
;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
;;
;; HOWEVER: this code in run:test seems to work fine
;; (> (- (current-seconds)(+ (db:test-get-event_time testdat)
;; (db:test-get-run_duration testdat)))
;; 600)
;; (db:delay-if-busy dbdat)
(sqlite3:for-each-row
- (lambda (test-id run-dir uname testname item-path)
+ (lambda (test-id run-dir uname testname item-path event-time run-duration)
+ (if (and (equal? uname "n/a")
+ (equal? item-path "")) ;; this is a toplevel test
+ ;; what to do with toplevel? call rollup?
+ (begin
+ (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
+ (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
+ (begin
+ (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
+ (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration))))
+ db
+ "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');"
+ run-id running-deadtime)
+
+
+ (sqlite3:for-each-row
+ (lambda (test-id run-dir uname testname item-path event-time run-duration)
(if (and (equal? uname "n/a")
(equal? item-path "")) ;; this is a toplevel test
;; what to do with toplevel? call rollup?
(begin
(set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
(debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
- (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))
+ (begin
+ (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)
+ (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
db
- "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING','REMOTEHOSTSTART');"
- run-id deadtime)
+ "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');"
+ run-id remotehoststart-deadtime)
;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
;;
;; (db:delay-if-busy dbdat)
(sqlite3:for-each-row
@@ -1681,11 +1706,13 @@
(lambda (test-id run-dir uname testname item-path)
(if (and (equal? uname "n/a")
(equal? item-path "")) ;; this is a toplevel test
;; what to do with toplevel? call rollup?
(set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
- (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))
+ (begin
+ (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id" 1 day since event_time marked")
+ (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))))
db
"SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');"
run-id)
(debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")
@@ -1702,15 +1729,15 @@
;; incompleted))
(min-incompleted-ids (map car incompleted)) ;; do 'em all
(all-ids (append min-incompleted-ids (map car oldlaunched))))
(if (> (length all-ids) 0)
(begin
- (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as INCOMPLETE")
+ (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD")
(for-each
(lambda (test-id)
- (db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test failed to complete")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS. ref ticket 220546828
-
+ (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead."))
+ ;;(db:test-set-state-status dbstruct run-id test-id "COMPLETED" "DEAD" "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")) ;; fix for one aspect of Randy's ticket 1405717332 ;; TODO - fix problem where test goes to COMPLETED/DEAD while in progress, only later to go to COMPLETED/PASS. ref ticket 220546828
all-ids))))))))
;; ALL REPLACED BY THE BLOCK ABOVE
;;
;; (sqlite3:execute
@@ -3680,11 +3707,11 @@
(item-path (db:test-get-item-path testdat))
(tl-testdat (db:get-test-info dbstruct run-id test-name ""))
(tl-test-id (if tl-testdat
(db:test-get-id tl-testdat)
#f)))
- (if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
+ (if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
(db:general-call dbstruct 'set-test-start-time (list test-id)))
(mutex-lock! *db-transaction-mutex*)
(db:with-db
dbstruct #f #f
(lambda (db)
Index: docs/manual/megatest_manual.html
==================================================================
--- docs/manual/megatest_manual.html
+++ docs/manual/megatest_manual.html
@@ -900,69 +900,10 @@
sqlite3 database. Megatest has been used with the Intel Netbatch and
lsf (also known as openlava) batch systems and it should be
straightforward to use it with other similar systems.
-
-
-
Overview
-
-
-
Stand-alone Megatest Area
-
A single, stand-alone, Megatest based testsuite or "area" is
-sufficient for most validation, automation and build problems.
-
-
-
-
-
-
Megatest is designed as a distributed or decoupled system. This means
-you can run the areas stand-alone with no additional
-infrastructure. I.e. there are no databases, web servers or other
-centralized resources needed. However as your needs grow you can
-integrate multiple areas into a bigger system.
-
-
Component Descriptions
-
--
-
-Multi-area dashboard and xterm. A gui (the dashboard) is usually the
- best option for controlling and launching runs but all operations
- can also be done from the commandline. Note: The not yet released
- multi-area dashboard replaces the old dashboard for browsing and
- controlling runs but for managing a single area the old dashboard
- works very well.
-
-
--
-
-Area/testsuite. This is your testsuite or automation definition and
- consists of the information in megatest.config, runconfigs.config
- and your testconfigs along with any custom scripting that can’t be
- done with the native Megatest features.
-
-
--
-
-If your testsuite or build automation is too large to run on a
- single instance you can distribute your jobs into a compute server
- pool. The only current requirements are password-less ssh access and
- a network filesystem.
-
-
-
-
-
-
-
Full System Architecture
-
-
-
-
-
-
-