Megatest

Diff
Login

Differences From Artifact [f30ded65d6]:

To Artifact [b17ec00910]:


169
170
171
172
173
174
175










176
177
178
179
180
181
182
183
184
185
186
187








188
189
190
191
192


193
194
195
196
197
198
199
200
201
202
203
204
205
206

























207
208
209
210
211
212
213
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190







191
192
193
194
195
196
197
198
199
200
201
202
203
204
205














206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237







+
+
+
+
+
+
+
+
+
+





-
-
-
-
-
-
-
+
+
+
+
+
+
+
+





+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







;; 	(mutex-lock! *rundb-mutex*)
;; 	(if (eq? mod-read 'mod)
;; 	    (dbr:dbstruct-mtime-set! dbstruct (current-milliseconds))
;; 	    (dbr:dbstruct-rtime-set! dbstruct (current-milliseconds)))
;; 	(dbr:dbstruct-inuse-set! dbstruct #f)
;; 	(mutex-unlock! *rundb-mutex*))))

(define-inline (db:generic-error-printout exn . message)
  (print-call-chain (current-error-port))
  (apply debug:print-error 0 *default-log-port* message)
  (debug:print-error 0 *default-log-port* "   params: " params
		     ", error: "     ((condition-property-accessor 'exn 'message)   exn)
		     ", arguments: " ((condition-property-accessor 'exn 'arguments) exn)
		     ", location: "  ((condition-property-accessor 'exn 'location)  exn)
		     ))

(print-call-chain (current-error-port))
;; (db:with-db dbstruct run-id sqlite3:exec "select blah fgrom blaz;")
;; r/w is a flag to indicate if the db is modified by this query #t = yes, #f = no
;;
(define (db:with-db dbstruct run-id r/w proc . params)
  (let* ((have-struct (dbr:dbstruct? dbstruct))
         (dbdat (if have-struct 
                    (db:get-db dbstruct)
                    #f))
	 (db    (if have-struct
		    (db:dbdat-get-db dbdat)
		    dbstruct))
	 (use-mutex (> *api-process-request-count* 25)))
         (dbdat     (if have-struct 
			(db:get-db dbstruct)
			#f))
	 (db        (if have-struct
			(db:dbdat-get-db dbdat)
			dbstruct))
	 (fname     (db:dbdat-get-path dbdat))
	 (use-mutex (> *api-process-request-count* 25))) ;; was 25
    (if (and use-mutex
	     (common:low-noise-print 120 "over-50-parallel-api-requests"))
	(debug:print-info 0 *default-log-port* *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access"))
    (if (common:low-noise-print 600 (conc "parallel-api-requests" *max-api-process-requests*))
	(debug:print-info 2 *default-log-port* "Parallel api request count: " *api-process-request-count* " max parallel requests: " *max-api-process-requests*))
    (condition-case
     (begin
    (handle-exceptions
     exn
     (begin
       (print-call-chain (current-error-port))
       (debug:print-error 0 *default-log-port* "sqlite3 issue in db:with-db, dbstruct=" dbstruct ", run-id=" run-id ", proc=" proc ", params=" params " error: " ((condition-property-accessor 'exn 'message) exn))
       ;; there is no recovering at this time. exit
       (exit 50))
     (if use-mutex (mutex-lock! *db-with-db-mutex*))
     (let ((res (apply proc db params)))
       (if use-mutex (mutex-unlock! *db-with-db-mutex*))
       ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
       (if dbdat (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat))
       res))))

    ;;;;;;;;; (handle-exceptions
    ;;;;;;;;;  exn
    ;;;;;;;;;  (begin
    ;;;;;;;;;    (print-call-chain (current-error-port))
    ;;;;;;;;;    (debug:print-error 0 *default-log-port* "sqlite3 issue in db:with-db, dbstruct=" dbstruct ", run-id=" run-id ", proc=" proc ", params=" params " error: " ((condition-property-accessor 'exn 'message) exn))
    ;;;;;;;;;    ;; there is no recovering at this time. exit
    ;;;;;;;;;    (exit 50))
       (if use-mutex (mutex-lock! *db-with-db-mutex*))
       (let ((res (apply proc db params)))
	 (if use-mutex (mutex-unlock! *db-with-db-mutex*))
	 ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
	 (if dbdat (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat))
	 res))
     (exn (io-error)
	  (db:generic-error-printout exn "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again."))
     (exn (corrupt)
	  (db:generic-error-printout exn "ERROR: database " fname " is corrupt. Repair it to proceed."))
     (exn (busy)
	  (db:generic-error-printout exn "ERROR: database " fname
				     " is locked. Try copying to another location, remove original and copy back."))
     (exn (permission)(db:generic-error-printout exn "ERROR: database " fname " has some permissions problem."))
     (exn ()
	  (db:generic-error-printout exn "ERROR: Unknown error with database " fname " message: "
		       ((condition-property-accessor 'exn 'message) exn))))))
      
;;======================================================================
;; K E E P   F I L E D B   I N   dbstruct
;;======================================================================

;; (define (db:get-filedb dbstruct run-id)
;;   (let ((db (vector-ref dbstruct 2)))
;;     (if db
1797
1798
1799
1800
1801
1802
1803















1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826























1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838













1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853















1854
1855
1856
1857


1858
1859



1860
1861

1862
1863

1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877








1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890





1891
1892


1893
1894
1895
1896
1897
1898







1899
1900
1901
1902



1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842























1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865












1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878















1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894



1895
1896
1897

1898
1899
1900
1901

1902
1903

1904














1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922



1923
1924
1925
1926
1927
1928

1929
1930
1931





1932
1933
1934
1935
1936
1937
1938
1939



1940
1941
1942




















1943
1944
1945
1946
1947
1948
1949







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+

-
-
-
+
+

-
+
+
+

-
+

-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+










-
-
-
+
+
+
+
+

-
+
+

-
-
-
-
-
+
+
+
+
+
+
+

-
-
-
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-







         )
    (debug:print-info 4  *default-log-port* "running-deadtime = " running-deadtime)
    (debug:print-info 4  *default-log-port* "deadtime-trim = " deadtime-trim)

    (db:with-db 
     dbstruct #f #f
     (lambda (db)
       (let* ((stmth1 (db:get-cache-stmth
		       dbstruct db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('RUNNING');"))
	      (stmth2 (db:get-cache-stmth
		       dbstruct db
		       "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests 
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
                                          AND state IN ('REMOTEHOSTSTART');"))
	      (stmth3 (db:get-cache-stmth
		       dbstruct db
		       "SELECT id,rundir,uname,testname,item_path FROM tests
                           WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400
                                          AND state IN ('LAUNCHED');")))
       ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
       ;;
       ;; HOWEVER: this code in run:test seems to work fine
       ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
       ;;                     (db:test-get-run_duration testdat)))
       ;;                    600) 
       ;; (db:delay-if-busy dbdat)
       (sqlite3:for-each-row 
        (lambda (test-id run-dir uname testname item-path event-time run-duration)
          (if (and (equal? uname "n/a")
                   (equal? item-path "")) ;; this is a toplevel test
              ;; what to do with toplevel? call rollup?
              (begin
                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
              (begin
                (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
                (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration))))
        db
        
        "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');"
        run-id running-deadtime) ;; default time 720 seconds

	 ;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
	 ;;
	 ;; HOWEVER: this code in run:test seems to work fine
	 ;;              (> (- (current-seconds)(+ (db:test-get-event_time testdat)
	 ;;                     (db:test-get-run_duration testdat)))
	 ;;                    600) 
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		(begin
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
		  (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
				    test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
				    " event-time="event-time" run-duration="run-duration))))
	  stmth1
	  run-id running-deadtime) ;; default time 720 seconds
       
       
       (sqlite3:for-each-row 
        (lambda (test-id run-dir uname testname item-path event-time run-duration)
          (if (and (equal? uname "n/a")
                   (equal? item-path "")) ;; this is a toplevel test
              ;; what to do with toplevel? call rollup?
              (begin
                (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
                (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
              (begin
                (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)
                (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
	 (sqlite3:for-each-row 
	  (lambda (test-id run-dir uname testname item-path event-time run-duration)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(begin
		  (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		  (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
		(begin
		  (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
				    " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
				    " run-duration="run-duration)
		  (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
        db
        "SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');"
        run-id remotehoststart-deadtime) ;; default time 230 seconds

       ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
       ;;
       ;; (db:delay-if-busy dbdat)
       (sqlite3:for-each-row
        (lambda (test-id run-dir uname testname item-path)
          (if (and (equal? uname "n/a")
                   (equal? item-path "")) ;; this is a toplevel test
              ;; what to do with toplevel? call rollup?
              (set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
              (begin
                (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id" 1 day since event_time marked")
	  stmth2
	  run-id remotehoststart-deadtime) ;; default time 230 seconds
	 
	 ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
	 ;;
	 ;; (db:delay-if-busy dbdat)
	 (sqlite3:for-each-row
	  (lambda (test-id run-dir uname testname item-path)
	    (if (and (equal? uname "n/a")
		     (equal? item-path "")) ;; this is a toplevel test
		;; what to do with toplevel? call rollup?
		(set! toplevels   (cons (list test-id run-dir uname testname item-path run-id) toplevels))
		(begin
		  (debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id
				    " 1 day since event_time marked")
                (set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))))
        db
        "SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');"
        run-id)
	  stmth3
	  run-id)
       
       (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")
	 (debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, "
			   (length toplevels) " old LAUNCHED toplevel tests and "
			   (length incompleted) " tests marked RUNNING but apparently dead."))

       ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
	 ;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
       ;;
       ;; (db:delay-if-busy dbdat)
	 ;; (db:delay-if-busy dbdat)
       (let* (;; (min-incompleted (filter (lambda (x)
              ;;      		      (let* ((testpath (cadr x))
              ;;      			     (tdatpath (conc testpath "/testdat.db"))
              ;;      			     (dbexists (common:file-exists? tdatpath)))
              ;;      			(or (not dbexists) ;; if no file then something wrong - mark as incomplete
              ;;      			    (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim
              ;;      		    incompleted))
              (min-incompleted-ids (map car incompleted)) ;; do 'em all
              (all-ids             (append min-incompleted-ids (map car oldlaunched))))
         (if (> (length all-ids) 0)
             (begin
	       ;; (launch:is-test-alive "localhost" 435)
               (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD")
               (for-each
	 (let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all
		(all-ids             (append min-incompleted-ids (map car oldlaunched))))
	   (if (> (length all-ids) 0)
	       (begin
		 ;; (launch:is-test-alive "localhost" 435)
		 (debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ")
			      " as DEAD")
		 (for-each
                  (lambda (test-id)
                    (let* (;; (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id))
			   (tinfo   (db:get-test-info-by-id dbstruct run-id test-id))
			   (run-dir (db:test-get-rundir     tinfo))
			   (host    (db:test-get-host       tinfo))
			   (pid     (db:test-get-process_id tinfo))
			   (result (db:get-status-from-final-status-file run-dir)))
		      (if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result))) 
			  (begin
			    (debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD")
			    (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "PASS"
								   "Test stopped responding but it has PASSED; marking it PASS in the DB."))
			  (let ((is-alive (launch:is-test-alive host pid)))
			    (db:set-state-status-and-roll-up-items
			     dbstruct run-id test-id 'foo "COMPLETED" "PASS"
			     "Test stopped responding but it has PASSED; marking it PASS in the DB."))
			  (let ((is-alive (and (not (eq? pid 0))  ;; 0 is default in re-used field "attemptnum" where pid stored.
					       (launch:is-test-alive host pid))))
			    (if is-alive
				(debug:print 0 *default-log-port* "INFO: test " test-id " on host " host " has a process on pid " pid ", NOT setting to DEAD.")
				(debug:print 0 *default-log-port* "INFO: test " test-id " on host " host
					     " has a process on pid " pid ", NOT setting to DEAD.")
				(begin
				  (debug:print 0 *default-log-port* "INFO: test " test-id " final state/status is not COMPLETED/PASS. It is " result)
				  (db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD"
									 "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
                     ;; call end of eud of run detection for posthook - from merge, is it needed?
                     ;; (launch:end-of-run-check run-id)
				  (debug:print 0 *default-log-port* "INFO: test " test-id
					       " final state/status is not COMPLETED/PASS. It is " result)
				  (db:set-state-status-and-roll-up-items
				   dbstruct run-id test-id 'foo "COMPLETED" "DEAD"
				   "Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
		  ;; call end of eud of run detection for posthook - from merge, is it needed?
		  ;; (launch:end-of-run-check run-id)
		  all-ids)
	       ;;call end of eud of run detection for posthook
	       (launch:end-of-run-check run-id)
	       )))))))
		 ;;call end of eud of run detection for posthook
		 (launch:end-of-run-check run-id)
		 )))))))


;; ALL REPLACED BY THE BLOCK ABOVE
;;
;; 	    (sqlite3:execute 
;; 	     db
;; 	     (conc "UPDATE tests SET state='INCOMPLETE' WHERE run_id=? AND id IN (" 
;; 		   (string-intersperse (map conc all-ids) ",")
;; 		   ");")
;;              run-id))))
;; 
;;     ;; Now do rollups for the toplevel tests
;;     ;;
;;     ;; (db:delay-if-busy dbdat)
;;     (for-each
;;      (lambda (toptest)
;;        (let ((test-name (list-ref toptest 3)))
;; ;;	     (run-id    (list-ref toptest 5)))
;; 	 (db:top-test-set-per-pf-counts dbstruct run-id test-name)))
;;      toplevels)))

;; BUG: Probably broken - does not explicitly use run-id in the query
;;
(define (db:top-test-set-per-pf-counts dbstruct run-id test-name)
  (db:general-call dbstruct 'top-test-set-per-pf-counts (list test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name test-name)))

;; Clean out old junk and vacuum the database
3253
3254
3255
3256
3257
3258
3259
3260

3261
3262
3263
3264
3265
3266
3267
3273
3274
3275
3276
3277
3278
3279

3280
3281
3282
3283
3284
3285
3286
3287







-
+







     #f
     (lambda (db)
       (let* ((stmth (db:get-cache-stmth dbstruct db qry)))
	 (sqlite3:first-result stmth run-id))))))

;; For a given testname how many items are running? Used to determine
;; probability for regenerating html
;; 
;;
(define (db:get-count-tests-running-for-testname dbstruct run-id testname)
  (db:with-db
   dbstruct
   run-id
   #f
   (lambda (db)
     (let* ((stmt "SELECT count(id) FROM tests WHERE state in ('RUNNING','LAUNCHED','REMOTEHOSTSTART') AND run_id=? AND NOT (uname = 'n/a' AND item_path = '') AND testname=?;")