Megatest: mt.scm at [907e020fcf]

File mt.scm artifact 321551147d part of check-in 907e020fcf

;; Copyright 2006-2013, Matthew Welland.
;; 
;; This file is part of Megatest.
;; 
;;     Megatest is free software: you can redistribute it and/or modify
;;     it under the terms of the GNU General Public License as published by
;;     the Free Software Foundation, either version 3 of the License, or
;;     (at your option) any later version.
;; 
;;     Megatest is distributed in the hope that it will be useful,
;;     but WITHOUT ANY WARRANTY; without even the implied warranty of
;;     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;     GNU General Public License for more details.
;; 
;;     You should have received a copy of the GNU General Public License
;;     along with Megatest.  If not, see <http://www.gnu.org/licenses/>.
;;


(use sqlite3 srfi-1 posix regex regex-case srfi-69 dot-locking (srfi 18) posix-extras directory-utils call-with-environment-variables)
(import (prefix sqlite3 sqlite3:))

(declare (unit mt))
(declare (uses debugprint))
(declare (uses db))
(declare (uses dbmod))
(declare (uses common))
(declare (uses items))
(declare (uses runconfig))
(declare (uses tests))
(declare (uses server))
(declare (uses runs))
(declare (uses rmt))
(declare (uses rmtmod))

(import debugprint
	rmtmod
	dbmod)

(include "common_records.scm")
(include "key_records.scm")
(include "db_records.scm")
(include "run_records.scm")
(include "test_records.scm")

;; This is the Megatest API. All generally "useful" routines will be wrapped or extended
;; here.

;;======================================================================
;;  R U N S
;;======================================================================

;; runs:get-runs-by-patt
;; get runs by list of criteria
;; register a test run with the db
;;
;; Use: (db-get-value-by-header (db:get-header runinfo)(db:get-rows runinfo))
;;  to extract info from the structure returned
;;
(define (mt:get-runs-by-patt keys runnamepatt targpatt)
  (let loop ((runsdat  (rmt:get-runs-by-patt keys runnamepatt targpatt 0 500 #f 0))
	     (res      '())
	     (offset   0)
	     (limit    500))
    ;; (print "runsdat: " runsdat)
    (let* ((header    (vector-ref runsdat 0))
	   (runslst   (vector-ref runsdat 1))
	   (full-list (append res runslst))
	   (have-more (eq? (length runslst) limit)))
      ;; (debug:print 0 *default-log-port* "header: " header " runslst: " runslst " have-more: " have-more)
      (if have-more 
	  (let ((new-offset (+ offset limit))
		(next-batch (rmt:get-runs-by-patt keys runnamepatt targpatt offset limit #f 0)))
	    (debug:print-info 4 *default-log-port* "More than " limit " runs, have " (length full-list) " runs so far.")
	    (debug:print-info 0 *default-log-port* "next-batch: " next-batch)
	    (loop next-batch
		  full-list
		  new-offset
		  limit))
	 (vector header full-list)))))

;;======================================================================
;;  T E S T S
;;======================================================================

(define (mt:get-tests-for-run run-id testpatt states status #!key (not-in #t) (sort-by 'event_time) (sort-order "ASC") (qryvals #f)(last-update #f))
  (let loop ((testsdat (rmt:get-tests-for-run run-id testpatt states status 0 500 not-in sort-by sort-order qryvals last-update 'normal))
	     (res      '())
	     (offset   0)
	     (limit    500))
    (let* ((full-list (append res testsdat))
	   (have-more (eq? (length testsdat) limit)))
      (if have-more 
	  (let ((new-offset (+ offset limit)))
	    (debug:print-info 4 *default-log-port* "More than " limit " tests, have " (length full-list) " tests so far.")
	    (loop (rmt:get-tests-for-run run-id testpatt states status new-offset limit not-in sort-by sort-order qryvals last-update 'normal)
		  full-list
		  new-offset
		  limit))
	  full-list))))

(define (mt:lazy-get-prereqs-not-met run-id waitons ref-item-path #!key (mode '(normal))(itemmaps #f) )
  (let* ((key    (list run-id waitons ref-item-path mode))
	 (res    (hash-table-ref/default *pre-reqs-met-cache* key #f))
	 (useres (let ((last-time (if (vector? res) (vector-ref res 0) #f)))
		   (if last-time
		       (< (current-seconds)(+ last-time 5))
		       #f))))
    (if useres
	(let ((result (vector-ref res 1)))
	  (debug:print 4 *default-log-port* "Using lazy value res: " result)
	  result)
	(let ((newres (rmt:get-prereqs-not-met run-id waitons ref-item-path mode: mode itemmaps: itemmaps)))
	  (hash-table-set! *pre-reqs-met-cache* key (vector (current-seconds) newres))
	  newres))))

(define (mt:get-run-stats dbstruct run-id)
;;  Get run stats from local access, move this ... but where?
  (db:get-run-stats dbstruct run-id))

(define (mt:discard-blocked-tests run-id failed-test tests test-records)
  (if (null? tests)
      tests
      (begin
	(debug:print-info 1 *default-log-port* "Discarding tests from " tests " that are waiting on " failed-test)
	(let loop ((testn (car tests))
		   (remt  (cdr tests))
		   (res   '()))
	  (let* ((test-dat (hash-table-ref/default test-records testn (vector #f #f '())))
		 (waitons  (vector-ref test-dat 2)))
	    ;; (print "mt:discard-blocked-tests run-id: " run-id " failed-test: " failed-test " testn: " testn " with waitons: " waitons)
	    (if (null? remt)
		(let ((new-res (reverse res)))
		  ;; (print "       new-res: " new-res)
		  new-res)
		(loop (car remt)
		      (cdr remt)
		      (if (member failed-test waitons)
			  (begin
			    (debug:print 0 *default-log-port* "Discarding test " testn "(" test-dat ") due to " failed-test)
			    res)
			  (cons testn res)))))))))

;;======================================================================
;;  T R I G G E R S
;;======================================================================

(define (mt:run-trigger cmd test-id test-rundir trigger logname test-name item-path event-time actual-state actual-status target runname)
  ;; Putting the commandline into ( )'s means no control over the shell. 
  ;; stdout and stderr will be caught in the NBFAKE or mt_launch.log files
  ;; or equivalent. No need to do this. Just run it?
  (let* ((new-trigger-format (configf:lookup *configdat* "setup" "new-trigger-format"))
         (fullcmd 
          (if (and new-trigger-format (string=? new-trigger-format "yes"))
            (conc "nbfake "
			cmd           " "
			test-id       " "
			test-rundir   " "
			trigger       " "
			actual-state  " "
			actual-status " "
			event-time    " "
                        target        " "
                        runname       " "
			test-name     " "
			item-path
			)
            (conc "nbfake "
			cmd           " "
			test-id       " "
			test-rundir   " "
			trigger       " "
			test-name     " "
			item-path     " " 
			actual-state  " "
			actual-status " "
			event-time
			)
          ))
	 (prev-nbfake-log (get-environment-variable "NBFAKE_LOG")))
    (setenv "NBFAKE_LOG" (conc (cond
				((and (directory-exists? test-rundir)
				      (file-write-access? test-rundir))
				 test-rundir)
				((and (directory-exists? *toppath*)
				      (file-write-access? *toppath*))
				 *toppath*)
				(else (conc "/tmp/" (current-user-name))))
			       "/" logname))
    (debug:print-info 0 *default-log-port* "TRIGGERED on " trigger ", running command " fullcmd " output at " (get-environment-variable "NBFAKE_LOG"))
    (process-run fullcmd)
    (if prev-nbfake-log
	(setenv "NBFAKE_LOG" prev-nbfake-log)
	(unsetenv "NBFAKE_LOG"))
    ))

(define (mt:process-triggers run-id test-id newstate newstatus)
  (if test-id 
      (let* ((test-dat      (rmt:get-test-info-by-id run-id test-id)))
	(if test-dat
	    (let* ((test-rundir   (db:test-get-rundir       test-dat)) ;; ) ;; )
		   (test-name     (db:test-get-testname     test-dat))
		   (item-path     (db:test-get-item-path    test-dat))
		   (duration      (db:test-get-run_duration test-dat))
		   (comment       (db:test-get-comment      test-dat))
		   (event-time    (db:test-get-event_time   test-dat))
		   (tconfig       #f)
		   (state         (if newstate  newstate  (db:test-get-state  test-dat)))
		   (status        (if newstatus newstatus (db:test-get-status test-dat)))
                   (target        (getenv "MT_TARGET"))
                   (runname       (getenv "MT_RUNNAME"))) 
	      ;; (mutex-lock! *triggers-mutex*)
              (handle-exceptions
               exn
               (begin
                 (debug:print-error 0 *default-log-port* " Exception in mt:process-triggers for run-id="run-id" test-id="test-id" newstate="newstate" newstatus="newstatus
                                    "\n   error: " ((condition-property-accessor 'exn 'message) exn) ", exn=" exn
                                    "\n   test-rundir="test-rundir
                                    "\n   test-name="test-name
                                    "\n   item-path="item-path
                                    "\n   state="state
                                    "\n   status="status
                                    "\n")
                 (print-call-chain (current-error-port))
                 #f)
               (if (and test-name
                        test-rundir)   ;; #f means no dir set yet
                   ;; (common:file-exists? test-rundir)
                   ;; (directory? test-rundir))
                   (call-with-environment-variables
                    (list (cons "MT_TEST_NAME"    (or test-name "no such test"))
                          (cons "MT_TEST_RUN_DIR" (or test-rundir "no test directory yet"))
                          (cons "MT_ITEMPATH"     (or item-path "")))
                    (lambda ()
                      (if (directory-exists? test-rundir)
                          (push-directory test-rundir)
                          (push-directory *toppath*))
                      (set! tconfig (mt:lazy-read-test-config test-name))
                      (for-each (lambda (trigger)
                                  (let* ((munged-trigger (string-translate trigger "/ " "--"))
					(logname        (conc "last-trigger-" munged-trigger ".log")))
                                    ;; first any triggers from the testconfig
                                    (let ((cmd  (configf:lookup tconfig "triggers" trigger)))
                                      (if cmd (mt:run-trigger cmd test-id test-rundir trigger (conc "tconfig-" logname) test-name item-path event-time state status target runname)))
                                    ;; next any triggers from megatest.config
                                    (let ((cmd  (configf:lookup *configdat* "triggers" trigger)))
                                      (if cmd (mt:run-trigger cmd test-id test-rundir trigger (conc "mtconfig-" logname) test-name item-path event-time state status target runname)))))
                                (list
                                 (conc state "/" status)
                                 (conc state "/")
                                 (conc "/" status)))
		     (pop-directory))
                    )))
	      ;; (mutex-unlock! *triggers-mutex*)
	      )))))

;;======================================================================
;;  S T A T E   A N D   S T A T U S   F O R   T E S T S 
;;======================================================================

;; speed up for common cases with a little logic
(define (mt:test-set-state-status-by-id run-id test-id newstate newstatus newcomment)
  (if (not (and run-id test-id))
      (begin
	(debug:print-error 0 *default-log-port* "bad data handed to mt:test-set-state-status-by-id, run-id=" run-id ", test-id=" test-id ", newstate=" newstate)
	(print-call-chain (current-error-port))
	#f)
      (begin
	(rmt:set-state-status-and-roll-up-items run-id test-id #f newstate newstatus newcomment)
	#t)))


(define (mt:test-set-state-status-by-id-unless-completed run-id test-id newstate newstatus newcomment)
  (let* ((test-vec   (rmt:get-testinfo-state-status run-id test-id))
         (state     (vector-ref test-vec 3)))
    (if (equal? state "COMPLETED")
        #t
        (rmt:set-state-status-and-roll-up-items run-id test-id #f newstate newstatus newcomment))))

  
(define (mt:test-set-state-status-by-testname run-id test-name item-path new-state new-status new-comment)
  ;(let ((test-id (rmt:get-test-id run-id test-name item-path)))
  (rmt:set-state-status-and-roll-up-items run-id test-name item-path new-state new-status new-comment)
  ;; (mt:process-triggers run-id test-id new-state new-status)
  #t);)
	;;(mt:test-set-state-status-by-id run-id test-id new-state new-status new-comment)))

(define (mt:test-set-state-status-by-testname-unless-completed run-id test-name item-path new-state new-status new-comment)
  (let ((test-id (rmt:get-test-id run-id test-name item-path)))
    (mt:test-set-state-status-by-id-unless-completed run-id test-id new-state new-status new-comment)))

;; state and status are extra hints not usually used in the calculation
;;
(define (rmt:set-state-status-and-roll-up-items run-id test-name item-path state status comment)
  (assert (number? run-id) "FATAL: Run id required.")
  (rmt:client-side-set-state-status-and-roll-up  run-id test-name item-path state status comment)
  ;; (rmtmod:send-receive 'set-state-status-and-roll-up-items run-id (list run-id test-name item-path state status comment))
  )

(define (rmt:client-side-set-state-status-and-roll-up  run-id test-name item-path state status comment)
  ;; establish info on incoming test followed by info on top level test
  ;; BBnote - for mode itemwait, linkage between upstream test & matching item status is propagated to run queue in db:prereqs-not-met
  (let* ((test-id      (if (number? test-name)
			   test-name
			   (db:keep-trying-until-true
			    rmt:get-test-id
			    (list run-id test-name item-path)
			    10)))
			   ;; (rmt:get-test-id run-id test-name item-path)))
	 (testdat      (rmt:get-test-info-by-id run-id test-id))
	 ;; (test-id      (db:test-get-id testdat))
	 (test-name    (if (number? test-name)
			   (db:test-get-testname testdat)
			   test-name))
	 (item-path    (db:test-get-item-path testdat))
	 (tl-test-id   (rmt:get-test-id run-id test-name ""))
         (tl-testdat   (rmt:get-test-info-by-id run-id test-id))
	 (new-state-eh #f)
	 (new-status-eh #f))
    (if (member state '("LAUNCHED" "REMOTEHOSTSTART")) 
	(rmt:general-call 'set-test-start-time run-id test-id))
    (let* ((res (begin
		  (rmt:test-set-state-status run-id test-id state status comment) ;; this call sets the item state/status
		  (if (not (equal? item-path "")) ;; only roll up IF incoming test is an item
		      (let* ((state-status-counts (rmt:get-all-state-status-counts-for-test run-id test-name item-path state status)) ;; item-path is used to exclude current state/status of THIS test
			     (state-statuses      (db:roll-up-rules state-status-counts state status))
			     (newstate            (car state-statuses))
			     (newstatus           (cadr state-statuses)))
			(set! new-state-eh newstate)
			(set! new-status-eh newstatus)
			(debug:print 4 *default-log-port* "BB> tl-test-id="tl-test-id" ; "test-name":"item-path
				     " newstate="newstate" newstatus="newstatus" len(sscs)="(length state-status-counts)  " state-status-counts: "
				     (apply conc
					    (map (lambda (x)
						   (conc
						    (with-output-to-string (lambda () (pp (dbr:counts->alist x)))) " | "))
						 state-status-counts))); end debug:print
			(if tl-test-id
			    (rmt:test-set-state-status run-id tl-test-id newstate newstatus #f)) ;; we are still in the transaction - must access the db and not the dbstruct
		       )))))
      (if (and test-id state status (equal? status "AUTO")) 
	  (rmt:test-data-rollup run-id test-id status))
      (if new-state-eh ;; moved from db:test-set-state-status
	  (mt:process-triggers run-id test-id new-state-eh new-status-eh))
      res)))

;;  select end_time-now from
;;      (select testname,item_path,event_time+run_duration as
;;                          end_time,strftime('%s','now') as now from tests where state in
;;      ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));
;;
;; NOT EASY TO MIGRATE TO db{file,mod}
;;
(define (rmt:find-and-mark-incomplete-engine run-id ovr-deadtime cfg-deadtime test-stats-update-period)
  (let* ((incompleted '())
	 (oldlaunched '())
	 (toplevels   '())
          ;; The default running-deadtime is 720 seconds = 12 minutes.
          ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30))
         (deadtime-trim (or ovr-deadtime cfg-deadtime))
         (server-start-allowance 200)
         (server-overloaded-budget 200)
         (launch-monitor-off-time (or test-stats-update-period 30))
         (launch-monitor-on-time-budget 30)
         (launch-monitor-period (+ launch-monitor-off-time launch-monitor-on-time-budget server-overloaded-budget))
         (remotehoststart-deadtime-default (+ server-start-allowance server-overloaded-budget 30))
         (remotehoststart-deadtime (or deadtime-trim remotehoststart-deadtime-default))
         (running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))
         (running-deadtime (or deadtime-trim running-deadtime-default))) ;; two minutes (30 seconds between updates, this leaves 3x grace period)

    (debug:print-info 4  *default-log-port* "running-deadtime = " running-deadtime)
    (debug:print-info 4  *default-log-port* "deadtime-trim = " deadtime-trim)

    (let* ((dat (rmt:get-toplevels-and-incompletes run-id running-deadtime remotehoststart-deadtime)))
      (set! oldlaunched (list-ref dat 1))
      (set! toplevels   (list-ref dat 2))
      (set! incompleted (list-ref dat 0)))

    (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, "
		      (length toplevels) " old LAUNCHED toplevel tests and "
		      (length incompleted) " tests marked RUNNING but apparently dead.")
  
    ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
    ;;
    ;; (db:delay-if-busy dbdat)
    (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all
	   (all-ids             (append min-incompleted-ids (map car oldlaunched))))
      (if (> (length all-ids) 0)
	  (begin
	    ;; (launch:is-test-alive "localhost" 435)
	    (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ")
			 " as DEAD")
	    (for-each
             (lambda (test-id)
               (let* ((tinfo   (rmt:get-test-info-by-id run-id test-id))
		      (run-dir (db:test-get-rundir     tinfo))
		      (host    (db:test-get-host       tinfo))
		      (pid     (db:test-get-process_id tinfo))
		      (result (rmt:get-status-from-final-status-file run-dir)))
		 (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) 
		     (begin
		       (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD")
		       (rmt:set-state-status-and-roll-up-items
			run-id test-id 'foo "COMPLETED" "PASS"
			"Test stopped responding but it has PASSED; marking it PASS in the DB."))
		     (let ((is-alive (and (not (eq? pid 0))  ;; 0 is default in re-used field "attemptnum" where pid stored.
					  (commonmod:is-test-alive host pid))))
		       (if is-alive
			   (debug:print 0 *default-log-port* "INFO: test " test-id " on host " host
					" has a process on pid " pid ", NOT setting to DEAD.")
			   (begin
			     (debug:print 0 *default-log-port* "INFO: test " test-id
					  " final state/status is not COMPLETED/PASS. It is " result)
			     (rmt:set-state-status-and-roll-up-items
			      run-id test-id 'foo "COMPLETED" "DEAD"
			      "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
	     ;; call end of eud of run detection for posthook - from merge, is it needed?
	     ;; (launch:end-of-run-check run-id)
	     all-ids)
	    )))))

(define (mt:lazy-read-test-config test-name)
  (let ((tconf (hash-table-ref/default *testconfigs* test-name #f)))
    (if tconf
	tconf
	(let ((test-dirs (tests:get-tests-search-path *configdat*)))
	  (let loop ((hed (car test-dirs))
		     (tal (cdr test-dirs)))
	    ;; Setting MT_LINKTREE here is almost certainly unnecessary. 
	    (let ((tconfig-file (conc hed "/" test-name "/testconfig")))
	      (if (and (common:file-exists? tconfig-file)
		       (file-read-access? tconfig-file))
		  (let ((link-tree-path (common:get-linktree)) ;; (configf:lookup *configdat* "setup" "linktree"))
			(old-link-tree  (get-environment-variable "MT_LINKTREE")))
		    (if link-tree-path (setenv "MT_LINKTREE" link-tree-path))
		    (let ((newtcfg (read-config tconfig-file #f #f))) ;; NOTE: Does NOT run [system ...]
		      (hash-table-set! *testconfigs* test-name newtcfg)
		      (if old-link-tree 
			  (setenv "MT_LINKTREE" old-link-tree)
			  (unsetenv "MT_LINKTREE"))
		      newtcfg))
		  (if (null? tal)
		      (begin
			(debug:print-error 0 *default-log-port* "No readable testconfig found for " test-name)
			#f)
		      (loop (car tal)(cdr tal))))))))))