169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
|
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
|
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
|
;; (mutex-lock! *rundb-mutex*)
;; (if (eq? mod-read 'mod)
;; (dbr:dbstruct-mtime-set! dbstruct (current-milliseconds))
;; (dbr:dbstruct-rtime-set! dbstruct (current-milliseconds)))
;; (dbr:dbstruct-inuse-set! dbstruct #f)
;; (mutex-unlock! *rundb-mutex*))))
;; Log a database-related exception.
;;
;;   exn     - the condition object that was caught
;;   message - any number of values; they are handed as-is to
;;             debug:print-error (level 0, *default-log-port*)
;;
;; Output, in order:
;;   1. the call chain, written to (current-error-port)
;;   2. the caller-supplied message parts
;;   3. a detail line with `params` and the 'message, 'arguments and
;;      'location properties of the 'exn condition kind
;;
;; The value of the last debug:print-error call is returned.
;;
;; NOTE(review): `params` is neither a parameter nor a local of this
;; procedure, so it is looked up as a free variable. Confirm that a binding
;; named `params` is really visible wherever this is used; otherwise the
;; detail line will itself raise an unbound-variable error.
(define-inline (db:generic-error-printout exn . message)
  (let ((exn-field (lambda (key)
                     ((condition-property-accessor 'exn key) exn))))
    (print-call-chain (current-error-port))
    (apply debug:print-error 0 *default-log-port* message)
    (debug:print-error 0 *default-log-port* " params: " params
                       ", error: "     (exn-field 'message)
                       ", arguments: " (exn-field 'arguments)
                       ", location: "  (exn-field 'location))))
;; NOTE(review): this call sits at top level, outside every definition visible
;; here, so it would print a call chain to (current-error-port) at load time.
;; It looks like a leftover line rather than intended code - confirm.
(print-call-chain (current-error-port))
;; Example: (db:with-db dbstruct run-id #f sqlite3:exec "select blah from blaz;")
;; r/w is a flag to indicate if the db is modified by this query #t = yes, #f = no
;;
;; Call PROC on the database behind DBSTRUCT and return PROC's result.
;;
;;   dbstruct - either a dbr:dbstruct, in which case a handle is taken with
;;              db:get-db and pushed back onto the dbstruct's dbstack after a
;;              successful call, or any other value, which is passed to PROC
;;              unchanged as the db
;;   run-id   - accepted for interface compatibility; not referenced below
;;   r/w      - accepted for interface compatibility; not referenced below
;;   proc     - called as (apply proc db params)
;;   params   - extra arguments for PROC
;;
;; When more than 25 api requests are in flight (*api-process-request-count*)
;; the call to PROC is serialized through *db-with-db-mutex*.
;;
;; Errors raised by PROC are reported with db:generic-error-printout and the
;; value of that report call is returned; the process is no longer terminated
;; (earlier revisions did (exit 50) from a handle-exceptions wrapper).
;;
;; NOTE(review): confirm that the sqlite3 binding in use really signals
;; conditions of kind io-error / corrupt / busy / permission; if it does not,
;; every error ends up in the final catch-all clause.
(define (db:with-db dbstruct run-id r/w proc . params)
  (let* ((have-struct (dbr:dbstruct? dbstruct))
         (dbdat       (if have-struct
                          (db:get-db dbstruct)
                          #f))
         (db          (if have-struct
                          (db:dbdat-get-db dbdat)
                          dbstruct))
         ;; dbdat is #f when a bare db was passed in; do not ask it for a path
         (fname       (and dbdat (db:dbdat-get-path dbdat)))
         (use-mutex   (> *api-process-request-count* 25))) ;; was 25
    (if (and use-mutex
             (common:low-noise-print 120 "over-50-parallel-api-requests"))
        (debug:print-info 0 *default-log-port* *api-process-request-count* " parallel api requests being processed in process " (current-process-id) ", throttling access"))
    (if (common:low-noise-print 600 (conc "parallel-api-requests" *max-api-process-requests*))
        (debug:print-info 2 *default-log-port* "Parallel api request count: " *api-process-request-count* " max parallel requests: " *max-api-process-requests*))
    (condition-case
     (let ((res (dynamic-wind
                    (lambda ()
                      (if use-mutex (mutex-lock! *db-with-db-mutex*)))
                    (lambda ()
                      (apply proc db params))
                    ;; runs on normal return AND when an exception escapes, so
                    ;; a failing PROC can no longer leave the mutex locked
                    (lambda ()
                      (if use-mutex (mutex-unlock! *db-with-db-mutex*))))))
       ;; (if (vector? dbstruct)(db:done-with dbstruct run-id r/w))
       ;; the handle goes back on the stack only after a successful call,
       ;; exactly as before
       (if dbdat (stack-push! (dbr:dbstruct-dbstack dbstruct) dbdat))
       res)
     (exn (io-error)
          (db:generic-error-printout exn "ERROR: i/o error with " fname ". Check permissions, disk space etc. and try again."))
     (exn (corrupt)
          (db:generic-error-printout exn "ERROR: database " fname " is corrupt. Repair it to proceed."))
     (exn (busy)
          (db:generic-error-printout exn "ERROR: database " fname
                                     " is locked. Try copying to another location, remove original and copy back."))
     (exn (permission)
          (db:generic-error-printout exn "ERROR: database " fname " has some permissions problem."))
     (exn ()
          (db:generic-error-printout exn "ERROR: Unknown error with database " fname " message: "
                                     ((condition-property-accessor 'exn 'message) exn))))))
;;======================================================================
;; K E E P F I L E D B I N dbstruct
;;======================================================================
;; (define (db:get-filedb dbstruct run-id)
;; (let ((db (vector-ref dbstruct 2)))
;; (if db
|
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
|
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
|
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
-
+
+
+
-
+
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
-
-
-
+
+
+
+
+
-
+
+
-
-
-
-
-
+
+
+
+
+
+
+
-
-
-
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
|
)
(debug:print-info 4 *default-log-port* "running-deadtime = " running-deadtime)
(debug:print-info 4 *default-log-port* "deadtime-trim = " deadtime-trim)
(db:with-db
dbstruct #f #f
(lambda (db)
(let* ((stmth1 (db:get-cache-stmth
dbstruct db
"SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests
WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
AND state IN ('RUNNING');"))
(stmth2 (db:get-cache-stmth
dbstruct db
"SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests
WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?)
AND state IN ('REMOTEHOSTSTART');"))
(stmth3 (db:get-cache-stmth
dbstruct db
"SELECT id,rundir,uname,testname,item_path FROM tests
WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400
AND state IN ('LAUNCHED');")))
;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
;;
;; HOWEVER: this code in run:test seems to work fine
;; (> (- (current-seconds)(+ (db:test-get-event_time testdat)
;; (db:test-get-run_duration testdat)))
;; 600)
;; (db:delay-if-busy dbdat)
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path event-time run-duration)
(if (and (equal? uname "n/a")
(equal? item-path "")) ;; this is a toplevel test
;; what to do with toplevel? call rollup?
(begin
(set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
(debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
(begin
(set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
(debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration))))
db
"SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('RUNNING');"
run-id running-deadtime) ;; default time 720 seconds
;; in RUNNING or REMOTEHOSTSTART for more than 10 minutes
;;
;; HOWEVER: this code in run:test seems to work fine
;; (> (- (current-seconds)(+ (db:test-get-event_time testdat)
;; (db:test-get-run_duration testdat)))
;; 600)
;; (db:delay-if-busy dbdat)
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path event-time run-duration)
(if (and (equal? uname "n/a")
(equal? item-path "")) ;; this is a toplevel test
;; what to do with toplevel? call rollup?
(begin
(set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
(debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
(begin
(set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
(debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
" event-time="event-time" run-duration="run-duration))))
stmth1
run-id running-deadtime) ;; default time 720 seconds
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path event-time run-duration)
(if (and (equal? uname "n/a")
(equal? item-path "")) ;; this is a toplevel test
;; what to do with toplevel? call rollup?
(begin
(set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
(debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
(begin
(debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time" run-duration="run-duration)
(set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path event-time run-duration)
(if (and (equal? uname "n/a")
(equal? item-path "")) ;; this is a toplevel test
;; what to do with toplevel? call rollup?
(begin
(set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
(debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
(begin
(debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
" run-duration="run-duration)
(set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
db
"SELECT id,rundir,uname,testname,item_path,event_time,run_duration FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > (run_duration + ?) AND state IN ('REMOTEHOSTSTART');"
run-id remotehoststart-deadtime) ;; default time 230 seconds
;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
;;
;; (db:delay-if-busy dbdat)
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path)
(if (and (equal? uname "n/a")
(equal? item-path "")) ;; this is a toplevel test
;; what to do with toplevel? call rollup?
(set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
(begin
(debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id" 1 day since event_time marked")
stmth2
run-id remotehoststart-deadtime) ;; default time 230 seconds
;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
;;
;; (db:delay-if-busy dbdat)
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path)
(if (and (equal? uname "n/a")
(equal? item-path "")) ;; this is a toplevel test
;; what to do with toplevel? call rollup?
(set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
(begin
(debug:print-info 0 *default-log-port* "Found old test in LAUNCHED state, test-id=" test-id
" 1 day since event_time marked")
(set! oldlaunched (cons (list test-id run-dir uname testname item-path run-id) oldlaunched)))))
db
"SELECT id,rundir,uname,testname,item_path FROM tests WHERE run_id=? AND (strftime('%s','now') - event_time) > 86400 AND state IN ('LAUNCHED');"
run-id)
stmth3
run-id)
(debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, " (length toplevels) " old LAUNCHED toplevel tests and " (length incompleted) " tests marked RUNNING but apparently dead.")
(debug:print-info 18 *default-log-port* "Found " (length oldlaunched) " old LAUNCHED items, "
(length toplevels) " old LAUNCHED toplevel tests and "
(length incompleted) " tests marked RUNNING but apparently dead."))
;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
;; These are defunct tests, do not do all the overhead of set-state-status. Force them to INCOMPLETE.
;;
;; (db:delay-if-busy dbdat)
;; (db:delay-if-busy dbdat)
(let* (;; (min-incompleted (filter (lambda (x)
;; (let* ((testpath (cadr x))
;; (tdatpath (conc testpath "/testdat.db"))
;; (dbexists (common:file-exists? tdatpath)))
;; (or (not dbexists) ;; if no file then something wrong - mark as incomplete
;; (> (- (current-seconds)(file-modification-time tdatpath)) 600)))) ;; no change in 10 minutes to testdat.db - she's dead Jim
;; incompleted))
(min-incompleted-ids (map car incompleted)) ;; do 'em all
(all-ids (append min-incompleted-ids (map car oldlaunched))))
(if (> (length all-ids) 0)
(begin
;; (launch:is-test-alive "localhost" 435)
(debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ") " as DEAD")
(for-each
(let* ((min-incompleted-ids (map car incompleted)) ;; do 'em all
(all-ids (append min-incompleted-ids (map car oldlaunched))))
(if (> (length all-ids) 0)
(begin
;; (launch:is-test-alive "localhost" 435)
(debug:print 0 *default-log-port* "WARNING: Marking test(s); " (string-intersperse (map conc all-ids) ", ")
" as DEAD")
(for-each
(lambda (test-id)
(let* (;; (run-dir (db:test-get-rundir-from-test-id dbstruct run-id test-id))
(tinfo (db:get-test-info-by-id dbstruct run-id test-id))
(run-dir (db:test-get-rundir tinfo))
(host (db:test-get-host tinfo))
(pid (db:test-get-process_id tinfo))
(result (db:get-status-from-final-status-file run-dir)))
(if (and (list? result) (> (length result) 1) (equal? "PASS" (cadr result)) (equal? "COMPLETED" (car result)))
(begin
(debug:print 0 *default-log-port* "INFO: test " test-id " actually passed, so marking PASS not DEAD")
(db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "PASS"
"Test stopped responding but it has PASSED; marking it PASS in the DB."))
(let ((is-alive (launch:is-test-alive host pid)))
(db:set-state-status-and-roll-up-items
dbstruct run-id test-id 'foo "COMPLETED" "PASS"
"Test stopped responding but it has PASSED; marking it PASS in the DB."))
(let ((is-alive (and (not (eq? pid 0)) ;; 0 is default in re-used field "attemptnum" where pid stored.
(launch:is-test-alive host pid))))
(if is-alive
(debug:print 0 *default-log-port* "INFO: test " test-id " on host " host " has a process on pid " pid ", NOT setting to DEAD.")
(debug:print 0 *default-log-port* "INFO: test " test-id " on host " host
" has a process on pid " pid ", NOT setting to DEAD.")
(begin
(debug:print 0 *default-log-port* "INFO: test " test-id " final state/status is not COMPLETED/PASS. It is " result)
(db:set-state-status-and-roll-up-items dbstruct run-id test-id 'foo "COMPLETED" "DEAD"
"Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
;; call end-of-run detection for posthook - from merge, is it needed?
;; (launch:end-of-run-check run-id)
(debug:print 0 *default-log-port* "INFO: test " test-id
" final state/status is not COMPLETED/PASS. It is " result)
(db:set-state-status-and-roll-up-items
dbstruct run-id test-id 'foo "COMPLETED" "DEAD"
"Test stopped responding while in RUNNING or REMOTEHOSTSTART; presumed dead.")))))))
;; call end-of-run detection for posthook - from merge, is it needed?
;; (launch:end-of-run-check run-id)
all-ids)
;; call end-of-run detection for posthook
(launch:end-of-run-check run-id)
)))))))
;; call end-of-run detection for posthook
(launch:end-of-run-check run-id)
)))))))
;; ALL REPLACED BY THE BLOCK ABOVE
;;
;; (sqlite3:execute
;; db
;; (conc "UPDATE tests SET state='INCOMPLETE' WHERE run_id=? AND id IN ("
;; (string-intersperse (map conc all-ids) ",")
;; ");")
;; run-id))))
;;
;; ;; Now do rollups for the toplevel tests
;; ;;
;; ;; (db:delay-if-busy dbdat)
;; (for-each
;; (lambda (toptest)
;; (let ((test-name (list-ref toptest 3)))
;; ;; (run-id (list-ref toptest 5)))
;; (db:top-test-set-per-pf-counts dbstruct run-id test-name)))
;; toplevels)))
;; BUG: Probably broken - does not explicitly use run-id in the query
;;
;; Run the stored 'top-test-set-per-pf-counts query through db:general-call
;; for TEST-NAME. The argument list is the same test name repeated, one copy
;; per positional argument handed to the query.
;; run-id is accepted but not passed on to the query.
(define (db:top-test-set-per-pf-counts dbstruct run-id test-name)
  (let ((query-args
         (list test-name test-name test-name test-name test-name
               test-name test-name test-name test-name test-name
               test-name test-name test-name test-name test-name
               test-name test-name)))
    (db:general-call dbstruct 'top-test-set-per-pf-counts query-args)))
;; Clean out old junk and vacuum the database
|