Megatest

db.scm at [da6321c924]
Login

File db.scm artifact d3cf8cbbe4 part of check-in da6321c924


;;======================================================================
;; Copyright 2006-2016, Matthew Welland.
;; 
;; This file is part of Megatest.
;; 
;;     Megatest is free software: you can redistribute it and/or modify
;;     it under the terms of the GNU General Public License as published by
;;     the Free Software Foundation, either version 3 of the License, or
;;     (at your option) any later version.
;; 
;;     Megatest is distributed in the hope that it will be useful,
;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;     GNU General Public License for more details.
;; 
;;     You should have received a copy of the GNU General Public License
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
;;
;;======================================================================

;;======================================================================
;; Database access
;;======================================================================

;; dbstruct vector containing all the relevant dbs like main.db, megatest.db, run.db etc

(use (srfi 18) extras tcp stack)
(use sqlite3 srfi-1 posix regex regex-case srfi-69 csv-xml s11n md5 message-digest
     base64 format dot-locking z3 typed-records matchable)
(import (prefix sqlite3 sqlite3:))
(import (prefix base64 base64:))

(declare (unit db))
(declare (uses common))
(declare (uses keys))
;; (declare (uses ods))
(declare (uses client))
(declare (uses mt))

(declare (uses commonmod))
(import commonmod)

(declare (uses dbmod))
(import dbmod)

(include "common_records.scm")
(include "db_records.scm")
(include "key_records.scm")
(include "run_records.scm")

;; MUST RESOLVE mt:process-triggers before these can move to dbmod.

;; set tests with state currstate and status currstatus to newstate and newstatus
;; use currstate = #f and or currstatus = #f to apply to any state or status respectively
;; WARNING: SQL injection risk. NB// See new but not yet used "faster" version below
;;
;;  AND NOT (item_path='' AND testname in (SELECT DISTINCT testname FROM tests WHERE testname=? AND item_path != ''));")))
;;  (debug:print 0 *default-log-port* "QRY: " qry)
;;  (db:delay-if-busy)
;;
;; NB// This call only operates on toplevel tests. Consider replacing it with more general call
;;
(define (db:set-tests-state-status dbstruct run-id testnames currstate currstatus newstate newstatus)
  (let ((test-ids '()))
    (for-each
     (lambda (testname)
       (let ((qry (conc "UPDATE tests SET state=?,status=? WHERE "
			(if currstate  (conc "state='" currstate "' AND ") "")
			(if currstatus (conc "status='" currstatus "' AND ") "")
			" run_id=? AND testname LIKE ?;"))
	     (test-id (db:get-test-id dbstruct run-id testname "")))
	 (db:with-db
	  dbstruct
	  run-id
	  #t
	  (lambda (db)
	    (sqlite3:execute db qry
			     (or newstate  currstate "NOT_STARTED")
			     (or newstatus currstate "UNKNOWN")
			     run-id testname)))
	 (if test-id
	     (begin
	       (set! test-ids (cons test-id test-ids))
	       (mt:process-triggers dbstruct run-id test-id newstate newstatus)))))
     testnames)
    test-ids))

;; state is the priority rollup of all states
;; status is the priority rollup of all completed statesfu
;;
;; if test-name is an integer work off that instead of test-name test-path
;;
(define (db:set-state-status-and-roll-up-items dbstruct run-id test-name item-path state status comment)
  ;; establish info on incoming test followed by info on top level test
  ;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met
  (let* ((testdat      (if (number? test-name)
			   (db:get-test-info-by-id dbstruct run-id test-name) ;; test-name is actually a test-id
			   (db:get-test-info       dbstruct run-id test-name item-path)))
	 (test-id      (db:test-get-id testdat))
	 (test-name    (if (number? test-name)
			   (db:test-get-testname testdat)
			   test-name))
	 (item-path    (db:test-get-item-path testdat))
         (tl-testdat   (db:get-test-info dbstruct run-id test-name ""))
         (tl-test-id   (if tl-testdat
			   (db:test-get-id tl-testdat)
			   #f)))
    (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) 
	(db:general-call dbstruct 'set-test-start-time (list test-id)))
    (mutex-lock! *db-transaction-mutex*)
    (db:with-db
     dbstruct #f #f
     (lambda (db)
       (let ((tr-res
              (sqlite3:with-transaction
               db
               (lambda ()
                 ;; NB// Pass the db so it is part fo the transaction
                 (db:test-set-state-status db run-id test-id state status comment) ;; this call sets the item state/status
                 (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item
                     (let* ((state-status-counts  (db:get-all-state-status-counts-for-test dbstruct run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test
			      						  (state-stauses (db:roll-up-rules state-status-counts state status))
                          (newstate (car state-stauses))
                          (newstatus (cadr state-stauses)))
                       (debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path" newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts)  " state-status-counts: "
							(apply conc
                  (map (lambda (x)
                     (conc
                     		(with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | "))
                              state-status-counts))); end debug:print
   
                       (if tl-test-id
			   (db:test-set-state-status db run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct
		       ))))))
         (mutex-unlock! *db-transaction-mutex*)
         (if (and test-id state status (equal? status "AUTO")) 
             (db:test-data-rollup dbstruct run-id test-id status))
         tr-res)))))

;; ;; speed up for common cases with a little logic
;; ;; NB// Ultimately this will be deprecated in deference to mt:test-set-state-status-by-id
;;
;;      NOTE: run-id is not used
;; ;;
(define (db:test-set-state-status dbstruct run-id test-id newstate newstatus newcomment)
  (db:with-db
   dbstruct
   ;; run-id
   #f
   #t
   (lambda (db)
     (cond
      ((and newstate newstatus newcomment)
       (sqlite3:execute db "UPDATE tests SET state=?,status=?,comment=? WHERE id=?;" newstate newstatus newcomment ;; (sdb:qry 'getid newcomment)
			test-id))
      ((and newstate newstatus)
       (sqlite3:execute db "UPDATE tests SET state=?,status=? WHERE id=?;" newstate newstatus test-id))
      (else
       (if newstate   (sqlite3:execute db "UPDATE tests SET state=?   WHERE id=?;" newstate   test-id))
       (if newstatus  (sqlite3:execute db "UPDATE tests SET status=?  WHERE id=?;" newstatus  test-id))
       (if newcomment (sqlite3:execute db "UPDATE tests SET comment=? WHERE id=?;" newcomment ;; (sdb:qry 'getid newcomment)
				       test-id))))))
  (mt:process-triggers dbstruct run-id test-id newstate newstatus))

(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
  (let* ((incompleted '())
	 (oldlaunched '())
	 (toplevels   '())
          ;; The default running-deadtime is 720 seconds = 12 minutes.
          ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30))
         (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime")))
         (server-start-allowance 200)
         (server-overloaded-budget 200)
         (launch-monitor-off-time (or (configf:lookup-number *configdat* "setup" "test-stats-update-period") 30))
         (launch-monitor-on-time-budget 30)
         (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget))
         (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30))
         (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default))
         (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))
         (running-deadtime (or deadtime-trim running-deadtime-default)) ;; two minutes (30 seconds between updates, this leaves 3x grace period)
         )
    (debug:print-info 4  *default-log-port* "running-deadtime = " running-deadtime)
    (debug:print-info 4  *default-log-port* "deadtime-trim = " deadtime-trim)

    (db:with-db 
     dbstruct #f #f
     (lambda (db)
       (let* ((stmth1 (db:get-cache-stmth
		       dbstruct db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('RUNNING');"))
	      (stmth2 (db:get-cache-stmth
		       dbstruct db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('REMOTEHOSTSTART');"))
	      (stmth3 (db:get-cache-stmth
		       dbstruct db
		       "SELECT id,rundir,uname,testname,item_path FROM tests
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400
                                          AND state IN ('LAUNCHED');")))
	 ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
	 ;;
	 ;; HOWEVER: this code in run:test seems to work fine
	 ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
	 ;;                     (db:test-get-run_duration testdat)))
	 ;;                    600) 
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		(begin
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
		  (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
				    test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
				    " event-time="event-time" run-duration="run-duration))))
	  stmth1
	  run-id running-deadtime) ;; default time 720 seconds
       
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		(begin
		  (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
				    " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
				    " run-duration="run-duration)
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
	  stmth2
	  run-id remotehoststart-deadtime) ;; default time 230 seconds
	 
	 ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
	 ;;
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row
	  (lambda (test-id run-dir uname testname item-path)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		(begin
		  (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id
				    " 1 day since event_time marked")
                (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))))
	  stmth3
	  run-id)
       
	 (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, "
			   (length toplevels) " old LAUNCHED toplevel tests and "
			   (length incompleted) " tests marked RUNNING but apparently dead."))

	 ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
       ;;
	 ;; (db:delay-if-busy dbdat)
	 (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all
		(all-ids             (append min-incompleted-ids (map car oldlaunched))))
	   (if (> (length all-ids) 0)
	       (begin
		 ;; (launch:is-test-alive "localhost" 435)
		 (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ")
			      " as DEAD")
		 (for-each
                  (lambda (test-id)
                    (let* (;; (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id))
			   (tinfo   (db:get-test-info-by-id dbstruct run-id test-id))
			   (run-dir (db:test-get-rundir     tinfo))
			   (host    (db:test-get-host       tinfo))
			   (pid     (db:test-get-process_id tinfo))
			   (result (db:get-status-from-final-status-file run-dir)))
		      (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) 
			  (begin
			    (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD")
			    (db:set-state-status-and-roll-up-items
			     dbstruct run-id test-id 'foo "COMPLETED" "PASS"
			     "Test stopped responding but it has PASSED; marking it PASS in the DB."))
			  (let ((is-alive (and (not (eq? pid 0))  ;; 0 is default in re-used field "attemptnum" where pid stored.
					       (launch:is-test-alive host pid))))
			    (if is-alive
				(debug:print 0 *default-log-port* "INFO: test " test-id " on host " host
					     " has a process on pid " pid ", NOT setting to DEAD.")
				(begin
				  (debug:print 0 *default-log-port* "INFO: test " test-id
					       " final state/status is not COMPLETED/PASS. It is " result)
				  (db:set-state-status-and-roll-up-items
				   dbstruct run-id test-id 'foo "COMPLETED" "DEAD"
				   "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
		  ;; call end of eud of run detection for posthook - from merge, is it needed?
		  ;; (launch:end-of-run-check run-id)
		  all-ids)

		 ;; MOVE TO rmt:find-and-mark-incomplete - for now always call launch:end-of-run-check after
		 ;; calling rmt:find-and-mark-incompletes
		 
		 ;;ALWAYS CALL after rmt:find-and-mark-incompletes
		 ;; call end of eud of run detection for posthook
		 ;; (launch:end-of-run-check run-id)

		 )))))))

(define (db:test-set-state-status-process-triggers  dbstruct run-id test-id newstate newstatus newcomment)
  (db:test-set-state-status dbstruct run-id test-id newstate newstatus newcomment)
  (mt:process-triggers dbstruct run-id test-id newstate newstatus))

;; ; This is to be the big daddy call NOPE: Replaced by db:set-state-status-and-roll-up-items
;; ;
(define (db:test-set-state-status dbstruct run-id test-id state status msg)
  (let ((dbdat  (db:get-db dbstruct run-id)))
    (if (member state '("LAUNCHED" "REMOTEHOSTSTART"))
	(db:general-call dbdat 'set-test-start-time (list test-id)))
    ;; (if msg
    ;; 	(db:general-call dbdat 'state-status-msg (list state status msg test-id))
    ;; 	(db:general-call dbdat 'state-status     (list state status test-id)))
    (db:set-state-status-and-roll-up-items dbstruct run-id test-id #f state status msg)
    ;; process the test_data table
    (if (and test-id state status (equal? status "AUTO")) 
	(db:test-data-rollup dbstruct run-id test-id status))
    (mt:process-triggers dbstruct run-id test-id state status)))

;; options:
;;
;;  'killservers  - kills all servers
;;  'dejunk       - removes junk records
;;  'adj-testids  - move test-ids into correct ranges
;;  'old2new      - sync megatest.db to /tmp/.../megatest.db and /tmp/.../megatest_ref.db
;;  'new2old      - sync /tmp/.../megatest.db to megatest.db and /tmp/.../megatest_ref.db (and update data_synced)
;;  'closeall     - close all opened dbs
;;  'schema       - attempt to apply schema changes
;;  run-ids: '(1 2 3 ...) or #f (for all)
;;
(define (db:multi-db-sync dbstruct . options)
  ;; (if (not (launch:setup))
  ;;    (debug:print 0 *default-log-port* "ERROR: not able to setup up for megatest.")
  (let* ((mtdb     (dbr:dbstruct-mtdb dbstruct))
	 (tmpdb    (db:get-db dbstruct))
	 (refndb   (dbr:dbstruct-refndb dbstruct))
	 (allow-cleanup #t) ;; (if run-ids #f #t))
	 (servers (server:get-list *toppath*)) ;; (tasks:get-all-servers (db:delay-if-busy tdbdat)))
	 (data-synced 0)) ;; count of changed records (I hope)
    
    (for-each
     (lambda (option)
       
       (case option
	 ;; kill servers
	 ((killservers)
	  (for-each
	   (lambda (server)
             (handle-exceptions
             exn
             (begin 
               (debug:print-info 0 *default-log-port*  "Unable to get host and/or port from " server ", exn=" exn)     
               #f)
	     (match-let (((mod-time host port start-time server-id pid) server))
	       (if (and host pid)
		   (tasks:kill-server host pid)))))
	   servers)

          ;; /tmp/bjbarcla/megatest_localdb/fullrun/.nfs.pdx.disks.icf_env_disk001.bjbarcla.gwa.issues.mtdev.matt-bisect.megatest.ext-tests.runs.all.v1.65.1.6524.dev.bb-24-justrollup-f8.rollup.fullrun/megatest.db.lock
          (delete-file* (common:get-sync-lock-filepath))
          )
	 
	 ;; clear out junk records
	 ;;
	 ((dejunk)
	  ;; (db:delay-if-busy mtdb) ;; ok to delay on mtdb
	  (when (file-write-access? (db:dbdat-get-path mtdb)) (db:clean-up mtdb))
	  (db:clean-up tmpdb)
	  (db:clean-up refndb))

	 ;; sync runs, test_meta etc.
	 ;;
	 ((old2new)
	  (set! data-synced
	    (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f mtdb tmpdb refndb)
	       data-synced)))
	 
	 ;; now ensure all newdb data are synced to megatest.db
	 ;; do not use the run-ids list passed in to the function
	 ;;
	 ((new2old)
	  (set! data-synced
	    (+ (db:sync-tables (db:sync-all-tables-list dbstruct) #f tmpdb refndb mtdb)
	       data-synced)))

	 ((adj-target)
	  (db:adj-target (db:dbdat-get-db mtdb))
	  (db:adj-target (db:dbdat-get-db tmpdb))
	  (db:adj-target (db:dbdat-get-db refndb)))
	 
	 ((schema)
	  (db:patch-schema-maindb (db:dbdat-get-db mtdb))
	  (db:patch-schema-maindb (db:dbdat-get-db tmpdb))
	  (db:patch-schema-maindb (db:dbdat-get-db refndb))
	  (db:patch-schema-rundb  (db:dbdat-get-db mtdb))
	  (db:patch-schema-rundb  (db:dbdat-get-db tmpdb))
	  (db:patch-schema-rundb  (db:dbdat-get-db refndb))))
       
       (stack-push! (dbr:dbstruct-dbstack dbstruct) tmpdb))
     options)
    data-synced))

(define (db:get-access-mode)
  (if (args:get-arg "-use-db-cache") 'cached 'rmt))