;;======================================================================
;; Copyright 2006-2016, Matthew Welland.
;;
;; This file is part of Megatest.
;;
;; Megatest is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;;
;; Megatest is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with Megatest. If not, see <http://www.gnu.org/licenses/>.
;;
;;======================================================================
;;======================================================================
;; Database access
;;======================================================================
;; dbstruct vector containing all the relevant dbs like main.db, megatest.db, run.db etc
(use (srfi 18) extras tcp stack)
(use sqlite3 srfi-1 posix regex regex-case srfi-69 csv-xml s11n md5 message-digest
base64 format dot-locking z3 typed-records matchable)
(import (prefix sqlite3 sqlite3:))
(import (prefix base64 base64:))
(declare (unit db))
(declare (uses common))
(declare (uses keys))
;; (declare (uses ods))
(declare (uses client))
(declare (uses mt))
(declare (uses commonmod))
(import commonmod)
(declare (uses dbmod))
(import dbmod)
(include "common_records.scm")
(include "db_records.scm")
(include "key_records.scm")
(include "run_records.scm")
;; MUST RESOLVE mt:process-triggers before these can move to dbmod.
;; set tests with state currstate and status currstatus to newstate and newstatus
;; use currstate = #f and or currstatus = #f to apply to any state or status respectively
;; WARNING: SQL injection risk. NB// See new but not yet used "faster" version below
;;
;; AND NOT (item_path='' AND testname in (SELECT DISTINCT testname FROM tests WHERE testname=? AND item_path != ''));")))
;; (debug:print 0 *default-log-port* "QRY: " qry)
;; (db:delay-if-busy)
;;
;; NB// This call only operates on toplevel tests. Consider replacing it with more general call
;;
(define (db:set-tests-state-status dbstruct run-id testnames currstate currstatus newstate newstatus)
(let ((test-ids '()))
(for-each
(lambda (testname)
(let ((qry (conc "UPDATE tests SET state=?,status=? WHERE "
(if currstate (conc "state='" currstate "' AND ") "")
(if currstatus (conc "status='" currstatus "' AND ") "")
" run_id=? AND testname LIKE ?;"))
(test-id (db:get-test-id dbstruct run-id testname "")))
(db:with-db
dbstruct
run-id
#t
(lambda (db)
(sqlite3:execute db qry
(or newstate currstate "NOT_STARTED")
(or newstatus currstate "UNKNOWN")
run-id testname)))
(if test-id
(begin
(set! test-ids (cons test-id test-ids))
(mt:process-triggers dbstruct run-id test-id newstate newstatus)))))
testnames)
test-ids))
;; state is the priority rollup of all states
;; status is the priority rollup of all completed statesfu
;;
;; if test-name is an integer work off that instead of test-name test-path
;;
(define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment)
;; establish info on incoming test followed by info on top level test
;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met
(let* ((testdat (if (number? test-name)
(db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id
(db:get-test-info dbstruct run-id test-name item-path)))
(test-id (db:test-get-id testdat))
(test-name (if (number? test-name)
(db:test-get-testname testdat)
test-name))
(item-path (db:test-get-item-path testdat))
(tl-testdat (db:get-test-info dbstruct run-id test-name ""))
(tl-test-id (if tl-testdat
(db:test-get-id tl-testdat)
#f)))
(if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
(db:general-call dbstruct 'set-test-start-time (list test-id)))
(mutex-lock! *db-transaction-mutex*)
(db:with-db
dbstruct #f #f
(lambda (db)
(let ((tr-res
(sqlite3:with-transaction
db
(lambda ()
;; NB// Pass the db so it is part fo the transaction
(db:test-set-state-status db run-id test-id state status comment) ;; this call sets the item state/status
(if (not (equal? item-path "")) ;; only roll up IF incoming test is an item
(let* ((state-status-counts (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test
(state-stauses (db:roll-up-rules state-status-counts state status))
(newstate (car state-stauses))
(newstatus (cadr state-stauses)))
(debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts) " state-status-counts: "
(apply conc
(map (lambda (x)
(conc
(with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | "))
state-status-counts))); end debug:print
(if tl-test-id
(db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct
))))))
(mutex-unlock! *db-transaction-mutex*)
(if (and test-id state status (equal? status "AUTO"))
(db:test-data-rollup dbstruct run-id test-id status))
tr-res)))))
;; ;; speed up for common cases with a little logic
;; ;; NB// Ultimately this will be deprecated in deference to mt:test-set-state-status-by-id
;;
;; NOTE: run-id is not used
;; ;;
(define (db:test-set-state-status dbstruct run-id test-id newstate newstatus newcomment)
(db:with-db
dbstruct
;; run-id
#f
#t
(lambda (db)
(cond
((and newstate newstatus newcomment)
(sqlite3:execute db "UPDATE tests SET state=?,status=?,comment=? WHERE id=?;" newstate newstatus newcomment ;; (sdb:qry 'getid newcomment)
test-id))
((and newstate newstatus)
(sqlite3:execute db "UPDATE tests SET state=?,status=? WHERE id=?;" newstate newstatus test-id))
(else
(if newstate (sqlite3:execute db "UPDATE tests SET state=? WHERE id=?;" newstate test-id))
(if newstatus (sqlite3:execute db "UPDATE tests SET status=? WHERE id=?;" newstatus test-id))
(if newcomment (sqlite3:execute db "UPDATE tests SET comment=? WHERE id=?;" newcomment ;; (sdb:qry 'getid newcomment)
test-id))))))
(mt:process-triggers dbstruct run-id test-id newstate newstatus))
(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
(let* ((incompleted '())
(oldlaunched '())
(toplevels '())
;; The default running-deadtime is 720 seconds = 12 minutes.
;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30))
(deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime")))
(server-start-allowance 200)
(server-overloaded-budget 200)
(launch-monitor-off-time (or (configf:lookup-number *configdat* "setup" "test-stats-update-period") 30))
(launch-monitor-on-time-budget 30)
(launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget))
(remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30))
(remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default))
(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))
(running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period)
)
(debug:print-info 4 *default-log-port* "running-deadtime = " running-deadtime)
(debug:print-info 4 *default-log-port* "deadtime-trim = " deadtime-trim)
(db:with-db
dbstruct #f #f
(lambda (db)
(let* ((stmth1 (db:get-cache-stmth
dbstruct db
"SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests
WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
AND state IN ('RUNNING');"))
(stmth2 (db:get-cache-stmth
dbstruct db
"SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests
WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
AND state IN ('REMOTEHOSTSTART');"))
(stmth3 (db:get-cache-stmth
dbstruct db
"SELECT id,rundir,uname,testname,item_path FROM tests
WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400
AND state IN ('LAUNCHED');")))
;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
;;
;; HOWEVER: this code in run:test seems to work fine
;; (> (- (current-seconds)(+ (db:test-get-event_time testdat)
;; (db:test-get-run_duration testdat)))
;; 600)
;; (db:delay-if-busy dbdat)
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path event-time run-duration)
(if (and (equal? uname "n/a")
(equal? item-path "")) ;; this is a toplevel test
;; what to do with toplevel? call rollup?
(begin
(set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
(debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
(begin
(set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
(debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
" event-time="event-time" run-duration="run-duration))))
stmth1
run-id running-deadtime) ;; default time 720 seconds
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path event-time run-duration)
(if (and (equal? uname "n/a")
(equal? item-path "")) ;; this is a toplevel test
;; what to do with toplevel? call rollup?
(begin
(set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
(debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
(begin
(debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
" run-duration="run-duration)
(set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
stmth2
run-id remotehoststart-deadtime) ;; default time 230 seconds
;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
;;
;; (db:delay-if-busy dbdat)
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path)
(if (and (equal? uname "n/a")
(equal? item-path "")) ;; this is a toplevel test
;; what to do with toplevel? call rollup?
(set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
(begin
(debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id
" 1 day since event_time marked")
(set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))))
stmth3
run-id)
(debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, "
(length toplevels) " old LAUNCHED toplevel tests and "
(length incompleted) " tests marked RUNNING but apparently dead."))
;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
;;
;; (db:delay-if-busy dbdat)
(let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all
(all-ids (append min-incompleted-ids (map car oldlaunched))))
(if (> (length all-ids) 0)
(begin
;; (launch:is-test-alive "localhost" 435)
(debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ")
" as DEAD")
(for-each
(lambda (test-id)
(let* (;; (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id))
(tinfo (db:get-test-info-by-id dbstruct run-id test-id))
(run-dir (db:test-get-rundir tinfo))
(host (db:test-get-host tinfo))
(pid (db:test-get-process_id tinfo))
(result (db:get-status-from-final-status-file run-dir)))
(if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result)))
(begin
(debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD")
(db:set-state-status-and-roll-up-items
dbstruct run-id test-id 'foo "COMPLETED" "PASS"
"Test stopped responding but it has PASSED; marking it PASS in the DB."))
(let ((is-alive (and (not (eq? pid 0)) ;; 0 is default in re-used field "attemptnum" where pid stored.
(launch:is-test-alive host pid))))
(if is-alive
(debug:print 0 *default-log-port* "INFO: test " test-id " on host " host
" has a process on pid " pid ", NOT setting to DEAD.")
(begin
(debug:print 0 *default-log-port* "INFO: test " test-id
" final state/status is not COMPLETED/PASS. It is " result)
(db:set-state-status-and-roll-up-items
dbstruct run-id test-id 'foo "COMPLETED" "DEAD"
"Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
;; call end of eud of run detection for posthook - from merge, is it needed?
;; (launch:end-of-run-check run-id)
all-ids)
;; MOVE TO rmt:find-and-mark-incomplete - for now always call launch:end-of-run-check after
;; calling rmt:find-and-mark-incompletes
;;ALWAYS CALL after rmt:find-and-mark-incompletes
;; call end of eud of run detection for posthook
;; (launch:end-of-run-check run-id)
)))))))
(define (db:test-set-state-status-process-triggers dbstruct run-id test-id newstate newstatus newcomment)
(db:test-set-state-status dbstruct run-id test-id newstate newstatus newcomment)
(mt:process-triggers dbstruct run-id test-id newstate newstatus))
;; options:
;;
;; 'killservers - kills all servers
;; 'dejunk - removes junk records
;; 'adj-testids - move test-ids into correct ranges
;; 'old2new - sync megatest.db to /tmp/.../megatest.db and /tmp/.../megatest_ref.db
;; 'new2old - sync /tmp/.../megatest.db to megatest.db and /tmp/.../megatest_ref.db (and update data_synced)
;; 'closeall - close all opened dbs
;; 'schema - attempt to apply schema changes
;; run-ids: '(1 2 3 ...) or #f (for all)
;;
(define (db:multi-db-sync dbstruct . options)
;; (if (not (launch:setup))
;; (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.")
(let* ((mtdb (dbr:dbstruct-mtdb dbstruct))
(tmpdb (db:get-db dbstruct))
(refndb (dbr:dbstruct-refndb dbstruct))
(allow-cleanup #t) ;; (if run-ids #f #t))
(servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat)))
(data-synced 0)) ;; count of changed records (I hope)
(for-each
(lambda (option)
(case option
;; kill servers
((killservers)
(for-each
(lambda (server)
(handle-exceptions
exn
(begin
(debug:print-info 0 *default-log-port* "Unable to get host and/or port from " server ", exn=" exn)
#f)
(match-let (((mod-time host port start-time server-id pid) server))
(if (and host pid)
(tasks:kill-server host pid)))))
servers)
;; /tmp/bjbarcla/megatest_localdb/fullrun/.nfs.pdx.disks.icf_env_disk001.bjbarcla.gwa.issues.mtdev.matt-bisect.megatest.ext-tests.runs.all.v1.65.1.6524.dev.bb-24-justrollup-f8.rollup.fullrun/megatest.db.lock
(delete-file* (common:get-sync-lock-filepath))
)
;; clear out junk records
;;
((dejunk)
;; (db:delay-if-busy mtdb) ;; ok to delay on mtdb
(when (file-write-access? (db:dbdat-get-path mtdb)) (db:clean-up mtdb))
(db:clean-up tmpdb)
(db:clean-up refndb))
;; sync runs, test_meta etc.
;;
((old2new)
(set! data-synced
(+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb tmpdb refndb)
data-synced)))
;; now ensure all newdb data are synced to megatest.db
;; do not use the run-ids list passed in to the function
;;
((new2old)
(set! data-synced
(+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f tmpdb refndb mtdb)
data-synced)))
((adj-target)
(db:adj-target (db:dbdat-get-db mtdb))
(db:adj-target (db:dbdat-get-db tmpdb))
(db:adj-target (db:dbdat-get-db refndb)))
((schema)
(db:patch-schema-maindb (db:dbdat-get-db mtdb))
(db:patch-schema-maindb (db:dbdat-get-db tmpdb))
(db:patch-schema-maindb (db:dbdat-get-db refndb))
(db:patch-schema-rundb (db:dbdat-get-db mtdb))
(db:patch-schema-rundb (db:dbdat-get-db tmpdb))
(db:patch-schema-rundb (db:dbdat-get-db refndb))))
(stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb))
options)
data-synced))
(define (db:get-access-mode)
(if (args:get-arg "-use-db-cache") 'cached 'rmt))