Megatest

Diff
Login

Differences From Artifact [011e6c6c39]:

To Artifact [b421549a4a]:


672
673
674
675
676
677
678
679
680
681


682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700

701
702






703
704
705
706
707
708
709

710
711
712





713
714
715
716
717
718
719










720
721
722
723
724
725
726
727









728
729
730
731
732
733
734
;;======================================================================

;;  select end_time-now from
;;      (select testname,item_path,event_time+run_duration as
;;                          end_time,strftime('%s','now') as now from tests where state in
;;      ('RUNNING','REMOTEHOSTSTART','LAUNCED'));


(define (db:find-and-mark-incomplete db #!key (ovr-deadtime #f))
  (let* ((incompleted '())


	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime"))
	 (deadtime     (if (and deadtime-str
				(string->number deadtime-str))
			   (string->number deadtime-str)
			   7200)) ;; two hours
	 (run-ids      (db:get-all-run-ids db))) ;; iterate over runs to divy up the calls
    (if (number? ovr-deadtime)(set! deadtime ovr-deadtime))
    (for-each
     (lambda (run-id)

       ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
       ;;
       ;; THIS CANNOT WORK. The run_duration is not updated in the central db due to performance concerns.
       ;;                   The testdat.db file must be consulted.
       ;;
       ;; HOWEVER: this code in run:test seems to work fine
       ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
       ;;                     (db:test-get-run_duration testdat)))
       ;;                    600) 

       (sqlite3:for-each-row 
	(lambda (test-id)






	  (set! incompleted (cons test-id incompleted)))
	db
	"SELECT id FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time - run_duration) > ? AND state IN ('RUNNING','REMOTEHOSTSTART');"
	run-id deadtime)

       ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
       ;;

       (sqlite3:for-each-row
	(lambda (test-id)
	  (set! incompleted (cons test-id incompleted)))





	db
	"SELECT id FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time - run_duration) > ? AND state IN ('LAUNCHED');"
	run-id (* 60 60 24)))
     run-ids)
       
    ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
    ;;










    (if (> (length incompleted) 0)
	(begin
	  (debug:print 0 "WARNING: Marking test(s); " (string-intersperse (map conc incompleted) ", ") " as INCOMPLETE")
	  (sqlite3:execute 
	   db
	   (conc "UPDATE tests SET state='INCOMPLETE' WHERE id IN (" 
		 (string-intersperse (map conc incompleted) ",")
		 ");"))))))









		     
;; Clean out old junk and vacuum the database
;;
;; Ultimately do something like this:
;;
;; 1. Look at test records either deleted or part of deleted run:
;;    a. If test dir exists, set the the test to state='UNKNOWN', Set the run to 'unknown'







<


>
>



















>

|
>
>
>
>
>
>
|

|
|



>

|
<
>
>
>
>
>

|
|

|


>
>
>
>
>
>
>
>
>
>
|
|
|
|
|
|
|
|
>
>
>
>
>
>
>
>
>







672
673
674
675
676
677
678

679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720

721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
;;======================================================================

;;  select end_time-now from
;;      (select testname,item_path,event_time+run_duration as
;;                          end_time,strftime('%s','now') as now from tests where state in
;;      ('RUNNING','REMOTEHOSTSTART','LAUNCED'));


(define (db:find-and-mark-incomplete db #!key (ovr-deadtime #f))
  (let* ((incompleted '())
	 (oldlaunched '())
	 (toplevels   '())
	 (deadtime-str (configf:lookup *configdat* "setup" "deadtime"))
	 (deadtime     (if (and deadtime-str
				(string->number deadtime-str))
			   (string->number deadtime-str)
			   7200)) ;; two hours
	 (run-ids      (db:get-all-run-ids db))) ;; iterate over runs to divy up the calls
    (if (number? ovr-deadtime)(set! deadtime ovr-deadtime))
    (for-each
     (lambda (run-id)

       ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
       ;;
       ;; THIS CANNOT WORK. The run_duration is not updated in the central db due to performance concerns.
       ;;                   The testdat.db file must be consulted.
       ;;
       ;; HOWEVER: this code in run:test seems to work fine
       ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
       ;;                     (db:test-get-run_duration testdat)))
       ;;                    600) 
       (db:delay-if-busy)
       (sqlite3:for-each-row 
	(lambda (test-id run-dir uname testname item-path)
	  (if (and (equal? uname "n/a")
		   (equal? item-path "")) ;; this is a toplevel test
	      ;; what to do with toplevel? call rollup?
	      (begin
		(set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		(debug:print-info 0 "Found old toplevel test in RUNNING state, test-id=" test-id))
	      (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))
	db
	"SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 600 AND state IN ('RUNNING','REMOTEHOSTSTART');"
	run-id)

       ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
       ;;
       (db:delay-if-busy)
       (sqlite3:for-each-row
	(lambda (test-id run-dir uname testname item-path)

	  (if (and (equal? uname "n/a")
		   (equal? item-path "")) ;; this is a toplevel test
	      ;; what to do with toplevel? call rollup?
	      (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
	      (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched))))
	db
	"SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');"
	run-id))
     run-ids)
    
    ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
    ;;
    (db:delay-if-busy)
    (let* ((min-incompleted (filter (lambda (x)
				     (let* ((testpath (cadr x))
					    (tdatpath (conc testpath "/testdat.db"))
					    (dbexists (file-exists? tdatpath)))
				       (or (not dbexists) ;; if no file then something wrong - mark as incomplete
					   (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim
				   incompleted))
	  (min-incompleted-ids (map car min-incompleted))
	  (all-ids             (append min-incompleted-ids (map car oldlaunched))))
      (if (> (length all-ids) 0)
	  (begin
	    (debug:print 0 "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as INCOMPLETE")
	    (sqlite3:execute 
	     db
	     (conc "UPDATE tests SET state='INCOMPLETE' WHERE id IN (" 
		   (string-intersperse (map conc all-ids) ",")
		   ");")))))

    ;; Now do rollups for the toplevel tests
    ;;
    (for-each
     (lambda (toptest)
       (let ((test-name (list-ref toptest 3))
	     (run-id    (list-ref toptest 5)))
	 (cdb:top-test-set-per-pf-counts *runremote* run-id test-name)))
     toplevels)))
		     
;; Clean out old junk and vacuum the database
;;
;; Ultimately do something like this:
;;
;; 1. Look at test records either deleted or part of deleted run:
;;    a. If test dir exists, set the the test to state='UNKNOWN', Set the run to 'unknown'
1474
1475
1476
1477
1478
1479
1480


1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
;;
(define (db:get-count-tests-running dbstruct run-id)
  (let ((res 0))
    (sqlite3:for-each-row
     (lambda (count)
       (set! res count))
     (db:get-db dbstruct run-id)


     "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;" 
     run-id) ;; NOT IN (SELECT id FROM runs WHERE state='deleted');")
    res))

;; NEW BEHAVIOR: Look only at single run with run-id
;; 
;; (define (db:get-running-stats dbstruct run-id)
(define (db:get-count-tests-running-for-run-id dbstruct run-id)
  (let ((res 0))
    (sqlite3:for-each-row
     (lambda (count)
       (set! res count))
     (db:get-db dbstruct run-id)
     "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;" run-id)
    res))

(define (db:get-count-tests-running-in-jobgroup dbstruct run-id jobgroup)
  (if (not jobgroup)
      0 ;; 
      (let ((res        0)
	    (testnames '()))







>
>











|

|







1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
;;
(define (db:get-count-tests-running dbstruct run-id)
  (let ((res 0))
    (sqlite3:for-each-row
     (lambda (count)
       (set! res count))
     (db:get-db dbstruct run-id)
     ;; WARNING BUG EDIT ME - merged from v1.55 - not sure what is right here ...
     ;; "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id NOT IN (SELECT id FROM runs WHERE state='deleted') AND NOT (uname = 'n/a' AND item_path = '');")
     "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=?;" 
     run-id) ;; NOT IN (SELECT id FROM runs WHERE state='deleted');")
    res))

;; NEW BEHAVIOR: Look only at single run with run-id
;; 
;; (define (db:get-running-stats dbstruct run-id)
(define (db:get-count-tests-running-for-run-id dbstruct run-id)
  (let ((res 0))
    (sqlite3:for-each-row
     (lambda (count)
       (set! res count))  ;; select * from tests where run_id=1 and uname = 'n/a' and item_path='';
     (db:get-db dbstruct run-id)
     "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? AND NOT (uname = 'n/a' AND item_path = '');" run-id)
    res))

(define (db:get-count-tests-running-in-jobgroup dbstruct run-id jobgroup)
  (if (not jobgroup)
      0 ;; 
      (let ((res        0)
	    (testnames '()))
1511
1512
1513
1514
1515
1516
1517



1518
1519
1520
1521
1522
1523
1524
	    (sqlite3:for-each-row
	     (lambda (count)
	       (set! res count))
	     (db:get-db dbstruct run-id)
	     (conc "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND testname in ('"
		   (string-intersperse testnames "','")
		   "');")))



	res)))

;; done with run when:
;;   0 tests in LAUNCHED, NOT_STARTED, REMOTEHOSTSTART, RUNNING
(define (db:estimated-tests-remaining dbstruct run-id)
  (let ((res 0))
    (sqlite3:for-each-row







>
>
>







1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
	    (sqlite3:for-each-row
	     (lambda (count)
	       (set! res count))
	     (db:get-db dbstruct run-id)
	     (conc "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND testname in ('"
		   (string-intersperse testnames "','")
		   "');")))
             ;; DEBUG FIXME - need to merge this v.155 query correctly   
             ;; AND testname in (SELECT testname FROM test_meta WHERE jobgroup=?)
             ;; AND NOT (uname = 'n/a' AND item_path = '');"
	res)))

;; done with run when:
;;   0 tests in LAUNCHED, NOT_STARTED, REMOTEHOSTSTART, RUNNING
(define (db:estimated-tests-remaining dbstruct run-id)
  (let ((res 0))
    (sqlite3:for-each-row
1777
1778
1779
1780
1781
1782
1783











1784
1785
1786
1787
1788
1789
1790
	 (tstsqry (conc "SELECT rundir FROM tests WHERE " testqry " AND state LIKE '" statepatt "' AND status LIKE '" statuspatt "' ORDER BY event_time ASC;")))
    (sqlite3:for-each-row 
     (lambda (p)
       (set! res (cons p res)))
     (db:get-db dbstruct run-id)
     tstsqry)
    res))












;;======================================================================
;; QUEUE UP META, TEST STATUS AND STEPS REMOTE ACCESS
;;======================================================================

;; NOTE: Can remove the regex and base64 encoding for zmq
(define (db:obj->string obj)







>
>
>
>
>
>
>
>
>
>
>







1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
	 (tstsqry (conc "SELECT rundir FROM tests WHERE " testqry " AND state LIKE '" statepatt "' AND status LIKE '" statuspatt "' ORDER BY event_time ASC;")))
    (sqlite3:for-each-row 
     (lambda (p)
       (set! res (cons p res)))
     (db:get-db dbstruct run-id)
     tstsqry)
    res))

(define (db:test-toplevel-num-items db run-id testname)
  (let ((res 0))
    (sqlite3:for-each-row
     (lambda (num-items)
       (set! res num-items))
     db
     "SELECT count(id) FROM tests WHERE run_id=? AND testname=? AND item_path != '' AND state NOT IN ('DELETED');"
     run-id
     testname)
    res))

;;======================================================================
;; QUEUE UP META, TEST STATUS AND STEPS REMOTE ACCESS
;;======================================================================

;; NOTE: Can remove the regex and base64 encoding for zmq
(define (db:obj->string obj)
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045



2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
;; 						    (thread-sleep! sleep-time)
;; 						    (debug:print-info 0 "trying db call one more time....this may never recover, if necessary kill process " (current-process-id) " on host " (get-host-name) " to clean up")
;; 						    (proc (- remtries 1)))))
;; 					       (apply sqlite3:execute db query params))
;; 					      (debug:print 0 "ERROR: too many attempts to access db were made and no sucess. query: "
;; 							   query ", params: " params))))
;; 			     (proc remtries))
(define (db:delay-if-busy #!key (count 5))
  (let ((dbfj (conc *toppath* "/megatest.db-journal")))
    (if (file-exists? dbfj)
	(case count



	  ((5)
	   (thread-sleep! 0.1)
	   (db:delay-if-busy count: 4))
	  ((4)
	   (thread-sleep! 0.4)
	   (db:delay-if-busy count: 3))
	  ((3)
	   (thread-sleep! 1.0)
	   (db:delay-if-busy count: 2))
	  ((2)
	   (thread-sleep! 2.0)
	   (db:delay-if-busy count: 1))
	  ((1)
	   (thread-sleep! 5.0)
	   (db:delay-if-busy count: 0))
	  (else
	   (debug:print-info 0 "delaying db access due to high database load.")
	   (thread-sleep! 10))))))
;; (db:delay-if-busy)
;; (apply sqlite3:execute db query params)))
;; (db:delay-if-busy)

(define (db:test-get-records-for-index-file dbstruct run-id test-name)
  (let ((res '()))
    (sqlite3:for-each-row 







|



>
>
>

|


|


|


|


|



|







2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
;; 						    (thread-sleep! sleep-time)
;; 						    (debug:print-info 0 "trying db call one more time....this may never recover, if necessary kill process " (current-process-id) " on host " (get-host-name) " to clean up")
;; 						    (proc (- remtries 1)))))
;; 					       (apply sqlite3:execute db query params))
;; 					      (debug:print 0 "ERROR: too many attempts to access db were made and no sucess. query: "
;; 							   query ", params: " params))))
;; 			     (proc remtries))
(define (db:delay-if-busy #!key (count 6))
  (let ((dbfj (conc *toppath* "/megatest.db-journal")))
    (if (file-exists? dbfj)
	(case count
	  ((6)
	   (thread-sleep! 0.2)
	   (db:delay-if-busy count: 5))
	  ((5)
	   (thread-sleep! 0.4)
	   (db:delay-if-busy count: 4))
	  ((4)
	   (thread-sleep! 0.8)
	   (db:delay-if-busy count: 3))
	  ((3)
	   (thread-sleep! 1.6)
	   (db:delay-if-busy count: 2))
	  ((2)
	   (thread-sleep! 3.2)
	   (db:delay-if-busy count: 1))
	  ((1)
	   (thread-sleep! 6.4)
	   (db:delay-if-busy count: 0))
	  (else
	   (debug:print-info 0 "delaying db access due to high database load.")
	   (thread-sleep! 12.8))))))
;; (db:delay-if-busy)
;; (apply sqlite3:execute db query params)))
;; (db:delay-if-busy)

(define (db:test-get-records-for-index-file dbstruct run-id test-name)
  (let ((res '()))
    (sqlite3:for-each-row