Megatest

Check-in [426aad0a60]
Login
Overview
Comment:Meld'd more of testdat
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | v1.65
Files: files | file ages | folders
SHA1: 426aad0a6040f0246b842104ec9089555ef5a38c
User & Date: matt on 2020-10-24 23:23:46
Other Links: branch diff | manifest | tags
Context
2020-10-25
22:54
Removed gratuitious call to get-count-tests-running-for-run-id and some indenting fixes. check-in: 8085b19fd6 user: matt tags: v1.65
2020-10-24
23:23
Meld'd more of testdat check-in: 426aad0a60 user: matt tags: v1.65
2020-10-23
23:05
merge-sync'd with test-rundat2 in prep for merging rundat changes. check-in: e0ddc9d3c5 user: matt tags: v1.65
Changes

Modified db.scm from [d7c10c5a58] to [7fd742bffc].

1774
1775
1776
1777
1778
1779
1780























1781
1782
1783
1784
1785
1786
1787
	  (debug:print 0 *default-log-port* "ERROR: cannot read " infile)
          (debug:print 0 *default-log-port* "ERROR: run-dir is " run-dir)
          #f
          )
        (with-input-from-file infile read-lines)
	)))
























;;  select end_time-now from
;;      (select testname,item_path,event_time+run_duration as
;;                          end_time,strftime('%s','now') as now from tests where state in
;;      ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));

(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
  (let* ((incompleted '())







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
	  (debug:print 0 *default-log-port* "ERROR: cannot read " infile)
          (debug:print 0 *default-log-port* "ERROR: run-dir is " run-dir)
          #f
          )
        (with-input-from-file infile read-lines)
	)))

;; check duration against test-run.dat file if it exists and update the value in
;; the db if necessary
;;
(define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)
  (let* ((datf             (conc run-dir ".mt_data/test-run.dat"))
	 (modt             (if (and (file-exists? datf)
				    (file-read-access? datf))
			       (file-modification-time datf)
			       #f)) ;; (+ event-time run-duration))))
	 (alt-run-duration (if modt
			       (- modt event-time)
			       #f)))
    (if (and alt-run-duration
	     (> alt-run-duration run-duration))
	(begin
	  (debug:print 0 *default-log-port* "Test " test-id " run duration mismatch. Setting to " alt-run-duration)
	  (db:with-db
	   dbstruct #f #f
	   (lambda (db)
	     (sqlite3:execute db "UPDATE tests SET run_duration=? WHERE id=?;" alt-run-duration test-id)
	     #t)))
	#f))) ;; #f = we did NOT adjust the time
	      
;;  select end_time-now from
;;      (select testname,item_path,event_time+run_duration as
;;                          end_time,strftime('%s','now') as now from tests where state in
;;      ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));

(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
  (let* ((incompleted '())
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
		(begin
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
		  (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
				    test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
				    " event-time="event-time" run-duration="run-duration))))
	  stmth1
	  run-id running-deadtime) ;; default time 720 seconds
       
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))







|







1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
		(begin
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
		  (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
				    test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
				    " event-time="event-time" run-duration="run-duration))))
	  stmth1
	  run-id running-deadtime) ;; default time 720 seconds
	    
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))

Modified launch.scm from [fef9dd9e8e] to [2c7bba3a8e].

249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (launch:handle-zombie-tests run-id)
        (when do-sync
          ;;(with-output-to-file (conc (getenv "MT_TEST_RUN_DIR") "/last-loadinfo.log" #:append)
          ;;  (lambda () (pp (list (current-seconds) new-cpu-load new-disk-free (calc-minutes)))))
          (common:telemetry-log "zombie" (conc  "launch:monitor-job - dosync started at "(current-seconds)))
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (common:telemetry-log "zombie" (conc "launch:monitor-job - dosync finished at "(current-seconds))))
        
	(if kill-job? 
	    (begin
              (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
	      (mutex-lock! m)
	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
	      ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
	      ;;       between tries?







|
<
<
<
|
<
<







249
250
251
252
253
254
255
256



257


258
259
260
261
262
263
264
          (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)
          (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.")
          ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING
          (set! kill-job? #f)))

        (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync)
        (launch:handle-zombie-tests run-id)
        (if do-sync ;; save meta data about the running of this test



	    (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f))


	(if kill-job? 
	    (begin
              (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason)
	      (mutex-lock! m)
	      ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this
	      ;;       section and the runit section? Or add a loop that tries three times with a 1/4 second
	      ;;       between tries?

Modified tests.scm from [1377ed8d3a] to [cd66dd6c01].

1970
1971
1972
1973
1974
1975
1976

1977




















1978


1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991

1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
	 (lambda (count)
	   (set! res count))
	 tdb
	 "SELECT count(id) FROM test_rundat;")
	res))
  0)


(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)




















  (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))


  (if (and cpuload diskfree)
      (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
  (if minutes 
      (rmt:general-call 'update-run-duration run-id minutes test-id))
  (if (and uname hostname)
      (rmt:general-call 'update-uname-host run-id uname hostname test-id)))
  
;; This one is for running with no db access (i.e. via rmt: internally)
(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries)
;; (define (tests:set-full-meta-info test-id run-id minutes work-area)
;;  (let ((remtries 10))
  (let* ((cpuload  (get-cpu-load))
	 (diskfree (get-df (current-directory)))

	 (uname    (get-uname "-srvpio"))
	 (hostname (get-host-name)))
    (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname)))
    
;; (define (tests:set-partial-meta-info test-id run-id minutes work-area)
#;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries)
  (let* ((cpuload  (get-cpu-load))
	 (diskfree (get-df (current-directory)))
	 (remtries 10))
    (handle-exceptions







>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
|
|
|
|
|
|


|




>


|







1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
	 (lambda (count)
	   (set! res count))
	 tdb
	 "SELECT count(id) FROM test_rundat;")
	res))
  0)

;; 
(define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname #!key (update-db #f)(tmpfree #f))
  (if (get-environment-variable "MT_TEST_RUN_DIR")
      (let* ((dest-dir (conc (get-environment-variable "MT_TEST_RUN_DIR") "/.mt_data"))
	     (or-dash  (lambda (instr)
			 (cond
			  ((not instr) "") ;; #f -> blank, indicates value unchanged since last measurement taken
			  ((string? instr)(if (string-search " " instr) (conc "\"" instr "\"") instr))
			  (else instr))))
	     (file-new (not (directory-exists? dest-dir))))
	(if file-new (create-directory dest-dir #t))
	(let* ((outp (open-output-file (conc dest-dir "/test-run.dat") #:append)))
	  (with-output-to-port outp
	    (lambda ()
	      (if file-new
		  (print "epoch_time,run_id,test_id,cpuload,diskfree,tmpfree,run_minutes,hostname,uname"))
	      (print (current-seconds) "," (or-dash run-id)   "," (or-dash test-id)  ","
		     (or-dash cpuload) "," (or-dash diskfree) "," (or-dash tmpfree)  ","
		     (or-dash minutes) "," (or-dash hostname) ","
		     (or-dash uname)))) ;; put uname last as it has spaces in it
	  (close-output-port outp)))
      (begin
	(rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1))))
  (if update-db
      (begin
	(if (and cpuload diskfree)
	    (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id))
	(if minutes 
	    (rmt:general-call 'update-run-duration run-id minutes test-id))
	(if (and uname hostname)
	    (rmt:general-call 'update-uname-host run-id uname hostname test-id)))))
  
;; This one is for running with no db access (i.e. via rmt: internally)
(define (tests:set-full-meta-info db test-id run-id minutes work-area remtries #!key (update-db #f))
;; (define (tests:set-full-meta-info test-id run-id minutes work-area)
;;  (let ((remtries 10))
  (let* ((cpuload  (get-cpu-load))
	 (diskfree (get-df (current-directory)))
	 (tmpfree  (get-df "/tmp"))
	 (uname    (get-uname "-srvpio"))
	 (hostname (get-host-name)))
    (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname update-db: update-db tmpfree: tmpfree)))
    
;; (define (tests:set-partial-meta-info test-id run-id minutes work-area)
#;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries)
  (let* ((cpuload  (get-cpu-load))
	 (diskfree (get-df (current-directory)))
	 (remtries 10))
    (handle-exceptions