︙ | | |
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
|
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
|
-
+
|
(if (and (> file-age (* (string->number (or (configf:lookup *configdat* "setup" "log-expire-days") "30")) 24 3600))
(file-exists? fullname)) ;; just in case it was gzipped - will get it next time
(handle-exceptions
exn
#f
(if (directory? fullname)
(begin
(debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
(debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
(inc-stat "directories"))
(begin
(delete-file* fullname)
(inc-stat "deleted")))
(hash-table-delete! all-files file)))))))
'()
"logs")
|
︙ | | |
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
|
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
|
-
+
|
(lambda (a b)
(< (hash-table-ref all-files a)(hash-table-ref all-files b))))
(- num-logs max-allowed))))
(for-each
(lambda (file)
(let* ((fullname (conc "logs/" file)))
(if (directory? fullname)
(debug:print-error 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
(debug:print-info 0 *default-log-port* fullname " in logs directory is a directory! Cannot rotate it, it is best to not put subdirectories in the logs dir.")
(handle-exceptions
exn
(debug:print-error 0 *default-log-port* "failed to remove " fullname ", exn=" exn)
(delete-file* fullname)))))
files)
(debug:print-info 0 *default-log-port* "Deleted " (length files) " files from logs, keeping " max-allowed " files."))))))
|
︙ | | |
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
|
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
|
-
+
+
+
+
-
-
-
+
+
+
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
+
+
+
+
+
-
+
+
+
+
+
+
+
+
+
+
+
-
+
-
-
-
-
+
+
+
-
+
+
|
;; calculate a delay number based on a droop curve
;; inputs are:
;; - load-in, load as from uptime, NOT normalized
;; - numcpus, number of cpus, ideally use the real cpus, not threads
;;
(define (common:get-delay load-in numcpus)
(let* ((ratio (/ load-in numcpus))
(new-option (configf:lookup *configdat* "load" "new-load-method")))
(new-option (configf:lookup *configdat* "load" "new-load-method"))
(paramstr (or (configf:lookup *configdat* "load" "exp-params")
"15 12 1281453987.9543 0.75")) ;; 5 4 10 1"))
(paramlst (map string->number (string-split paramstr))))
(if new-option
(begin
(cond ((and (>= ratio 0) (< ratio .5))
0
(begin
(cond ((and (>= ratio 0) (< ratio .5))
0)
)
((and (>= ratio 0.5) (<= ratio .9))
(* ratio (/ 5 .9)))
((and (> ratio .9) (<= ratio 1.1))
(+ 5 (* (- ratio .9) (/ 55 .2))))
((> ratio 1.1)
60)))
(max (/ (expt 5 (* 4 ratio)) 10) 0))))
((and (>= ratio 0.5) (<= ratio .9))
(* ratio (/ 5 .9)))
((and (> ratio .9) (<= ratio 1.1))
(+ 5 (* (- ratio .9) (/ 55 .2))))
((> ratio 1.1)
60)))
(match paramlst
((r1 r2 s1 s2)
(debug:print 3 *default-log-port* "Using params r1=" r1 " r2=" r2 " s1=" s1 " s2=" s2)
(min (max (/ (expt r1 (* r2 s2 ratio)) s1) 0) 30))
(else
(debug:print 0 *default-log-port* "BAD exp-params, should be \"r1 r2 s1 s2\" but got " paramstr)
30)))))
(define (common:print-delay-table)
(let loop ((x 0))
(print x "," (common:get-delay x 1))
(if (< x 2)
(loop (+ x 0.1)))))
(define (get-cpu-load #!key (remote-host #f))
(car (common:get-cpu-load remote-host)))
;; (let* ((load-res (process:cmd-run->list "uptime"))
;; (load-rx (regexp "load average:\\s+(\\d+)"))
;; (cpu-load #f))
;; (for-each (lambda (l)
;; (let ((match (string-search load-rx l)))
;; (if match
;; (let ((newval (string->number (cadr match))))
;; (if (number? newval)
;; (set! cpu-load newval))))))
;; (car load-res))
;; cpu-load))
;; get values from cached info from dropping file in logs dir
;; e.g. key is host and dtype is normalized-load
;;
(define (common:get-cached-info key dtype #!key (age 10))
(if *toppath*
(let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log")))
(let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log"))
(delfile (lambda ()
(debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn)
(delete-file* fullpath)
#f)))
(if (and (file-exists? fullpath)
(file-read-access? fullpath))
(handle-exceptions
exn
(begin
(debug:print 0 *default-log-port* "failed to get cached info from " fullpath ", exn=" exn)
#f)
(debug:print 2 *default-log-port* "reading file " fullpath)
(let ((real-age (- (current-seconds)(file-change-time fullpath))))
(let ((real-age (- (current-seconds)
(handle-exceptions
exn
(begin
(debug:print 1 *default-log-port* "Failed to read mod time on file "
fullpath ", using 0, exn=" exn)
0)
(file-change-time fullpath)))))
(if (< real-age age)
(handle-exceptions
exn
(delfile)
(let* ((res (with-input-from-file fullpath read)))
(if (eof-object? res)
(begin
(begin
(debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn)
(delete-file* fullpath)
#f)
(with-input-from-file fullpath read))
(delfile)
#f)
res)))
(begin
(debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it")
(debug:print-info 2 *default-log-port* "file " fullpath
" is too old (" real-age" seconds) to trust, skipping reading it")
#f))))
(begin
(debug:print 2 *default-log-port* "not reading file " fullpath)
#f)))
#f))
(define (common:write-cached-info key dtype dat)
|
︙ | | |
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
|
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
|
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
|
(define (common:wait-for-homehost-load maxnormload msg)
(let* ((hh-dat (if (common:on-homehost?) ;; if we are on the homehost then pass in #f so the calls are local.
#f
(common:get-homehost)))
(hh (if hh-dat (car hh-dat) #f)))
(common:wait-for-normalized-load maxnormload msg hh)))
(define *numcpus-cache* (make-hash-table))
(define (common:get-num-cpus remote-host)
(let* ((actual-host (or remote-host (get-host-name))))
;; hosts had better not be changing the number of cpus too often!
(or (hash-table-ref/default *numcpus-cache* actual-host #f)
(or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600))) ;; hosts had better not be changing the number of cpus too often!
(let* ((proc (lambda ()
(let loop ((numcpu 0)
(inl (read-line)))
(if (eof-object? inl)
(if (> numcpu 0)
numcpu
#f) ;; if zero return #f so caller knows that things are not working
(loop (if (string-match "^processor\\s+:\\s+\\d+$" inl)
(+ numcpu 1)
numcpu)
(read-line))))))
(result (if remote-host
(with-input-from-pipe
(conc "ssh " remote-host " cat /proc/cpuinfo")
proc)
(with-input-from-file "/proc/cpuinfo" proc))))
(if (and (number? result)
(> result 0))
(common:write-cached-info actual-host "num-cpus" result))
result))))
(let* ((numcpus (or (common:get-cached-info actual-host "num-cpus" age: (+ 2592000 (random 3600)))
(let* ((proc (lambda ()
(let loop ((numcpu 0)
(inl (read-line)))
(if (eof-object? inl)
(if (> numcpu 0)
numcpu
#f) ;; if zero return #f so caller knows that things are not working
(loop (if (string-match "^processor\\s+:\\s+\\d+$" inl)
(+ numcpu 1)
numcpu)
(read-line))))))
(result (if remote-host
(with-input-from-pipe
(conc "ssh " remote-host " cat /proc/cpuinfo")
proc)
(with-input-from-file "/proc/cpuinfo" proc))))
(if (and (number? result)
(> result 0))
(common:write-cached-info actual-host "num-cpus" result))
result))))
(hash-table-set! *numcpus-cache* actual-host numcpus)
numcpus))))
;; wait for normalized cpu load to drop below maxload
;;
(define (common:wait-for-normalized-load maxnormload msg remote-host #!optional (rem-tries 5))
(let ((num-cpus (common:get-num-cpus remote-host)))
(if num-cpus
(common:wait-for-cpuload maxnormload num-cpus 15 msg: msg remote-host: remote-host)
|
︙ | | |
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
|
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
|
+
-
+
+
+
|
(effective-load (common:get-intercept first next))
(recommended-delay (common:get-delay effective-load numcpus))
(effective-host (or remote-host "localhost"))
(normalized-effective-load (/ effective-load numcpus))
(will-wait (> normalized-effective-load maxnormload)))
(if (> recommended-delay 0)
(let* ((actual-delay (min recommended-delay 30)))
(if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load"))
(debug:print-info 0 *default-log-port* "Load control, delaying " actual-delay " seconds to maintain safe load.")
(debug:print-info 0 *default-log-port* "Load control, delaying "
actual-delay " seconds to maintain safe load. current normalized effective load is "
normalized-effective-load"."))
(thread-sleep! actual-delay)))
(cond
;; bad data, try again to get the data
((not will-wait)
(if (common:low-noise-print 30 (conc (round normalized-effective-load) "-load-acceptable-" effective-host))
(debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing.")))
|
︙ | | |
︙ | | |
210
211
212
213
214
215
216
217
218
219
220
221
222
223
|
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
|
+
+
|
;;
(define (dboard:common-set-tabdat! commondat tabnum tabdat)
(hash-table-set!
(dboard:commondat-tabdats commondat)
tabnum
tabdat))
(define *updater-running* #f) ;; move this into one of the stucts
;; gets and calls updater list based on curr-tab-num
;;
(define (dboard:common-run-curr-updaters commondat #!key (tab-num #f))
(if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat
(let* ((tnum (or tab-num (dboard:commondat-curr-tab-num commondat)))
(updaters (hash-table-ref/default (dboard:commondat-updaters commondat)
tnum
|
︙ | | |
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
|
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
|
-
+
+
+
|
(curr-updaters (hash-table-ref/default (dboard:commondat-updaters commondat) tnum '())))
(hash-table-set! (dboard:commondat-updaters commondat)
tnum
(cons updater curr-updaters))))
;; data for each specific tab goes here
;;
(defstruct dboard:tabdat
(defstruct dboard:tabdat
;; flags
((already-running #f) : boolean) ;; the updater is already running. skip
;; runs
((allruns '()) : list) ;; list of dboard:rundat records
((allruns-by-id (make-hash-table)) : hash-table) ;; hash of run-id -> dboard:rundat records
((done-runs '()) : list) ;; list of runs already drawn
((not-done-runs '()) : list) ;; list of runs not yet drawn
(header #f) ;; header for decoding the run records
(keys #f) ;; keys for this run (i.e. target components)
|
︙ | | |
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
|
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
|
-
+
|
;;
;; NOTE: Yes, this is used
;;
(define (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals)
(let* ((start-time (current-seconds))
(access-mode (dboard:tabdat-access-mode tabdat))
(num-to-get (string->number (or (configf:lookup *configdat* "setup" "num-tests-to-get")
"200")))
"50"))) ;; was 200, which is fine in a normal run area.
(states (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat)))
(statuses (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat)))
(do-not-use-db-file-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab
(do-not-use-query-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-query-timestamps")) ;; this no longer troubles runs-summary-tab
(sort-info (get-curr-sort))
(sort-by (vector-ref sort-info 1))
(sort-order (vector-ref sort-info 2))
|
︙ | | |
714
715
716
717
718
719
720
721
722
723
724
725
726
727
|
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
|
+
|
;;
;; (debug:print 0 *default-log-port* "got-all: " got-all " multi-get: " multi-get " num-to-get: " num-to-get " (length tmptests): " (length tmptests) " db-modified: " db-modified " db-mod-time: " db-mod-time " db-path: " db-path)
(if got-all
(begin
(dboard:rundat-last-update-set! run-dat (- start-time 2))
(dboard:rundat-run-data-offset-set! run-dat 0))
(begin
;;; (thread-sleep! 0.25) ;; give the rest of the gui some time to update. <-- this did NOT help
(dboard:rundat-run-data-offset-set! run-dat
(+ num-to-get (dboard:rundat-run-data-offset run-dat)))))
(for-each
(lambda (tdat)
(let ((test-id (db:test-get-id tdat))
(state (db:test-get-state tdat)))
|
︙ | | |
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
|
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
|
+
+
-
+
|
;; this calls dboard:get-tests-for-run-duplicate for each run
;;
;; create a virtual table of all the tests
;; keypatts: ( (KEY1 "abc%def")(KEY2 "%") )
;;
(define (dboard:update-rundat tabdat runnamepatt numruns testnamepatt keypatts)
(dboard:tabdat-already-running-set! tabdat #t)
(let* (;; (already-running (dboard:tabdat-already-running tabdat))
(let* ((access-mode (dboard:tabdat-access-mode tabdat))
(access-mode (dboard:tabdat-access-mode tabdat))
(keys (dboard:tabdat-keys tabdat)) ;; (db:dispatch-query access-mode rmt:get-keys db:get-keys)))
(last-runs-update (- (dboard:tabdat-last-runs-update tabdat) 2))
(allruns (rmt:get-runs runnamepatt numruns (dboard:tabdat-start-run-offset tabdat) keypatts))
;;(allruns-tree (rmt:get-runs-by-patt (dboard:tabdat-keys tabdat) "%" #f #f #f #f))
(allruns-tree (rmt:get-runs-by-patt keys "%" #f #f #f #f 0)) ;; last-runs-update));;'("id" "runname")
(header (db:get-header allruns))
(runs (db:get-rows allruns)) ;; RA => Filtered as per runpatt selected
|
︙ | | |
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
|
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
|
-
+
+
-
-
+
+
-
-
-
-
+
+
|
(elapsed-time (- (current-seconds) start-time)))
(if (null? all-test-ids)
(hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id)
(hash-table-set! (dboard:tabdat-allruns-by-id tabdat) run-id run-struct))
(if (or (null? tal)
(> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update
(begin
(when (> elapsed-time 2)
#;(when (> elapsed-time 2)
(debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.")
(let* ((old-val (iup:attribute *tim* "TIME"))
(new-val (number->string (inexact->exact (floor (* 2 (string->number old-val)))))))
(if (< (string->number new-val) 5000)
(begin
((debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
(iup:attribute-set! *tim* "TIME" new-val))))
(debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val)
(iup:attribute-set! *tim* "TIME" new-val)))))
)
(dboard:tabdat-allruns-set! tabdat new-res)
maxtests)
(if (> (dboard:rundat-run-data-offset run-struct) 0)
(loop run tal new-res newmaxtests) ;; not done getting data for this run
(loop (car tal)(cdr tal) new-res newmaxtests)))))))
(dboard:tabdat-filters-changed-set! tabdat #f)
(dboard:update-tree tabdat runs-hash header tb)))
(dboard:update-tree tabdat runs-hash header tb)
(dboard:tabdat-already-running-set! tabdat #f)))
(define *collapsed* (make-hash-table))
(define (toggle-hide lnum uidat) ; fulltestname)
(let* ((btn (vector-ref (dboard:uidat-get-lftcol uidat) lnum))
(fulltestname (iup:attribute btn "TITLE"))
(parts (string-split fulltestname "("))
|
︙ | | |
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
|
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
|
-
-
-
-
|
sort-lb)))
)
;; insert extra widget here
(if extra-widget
extra-widget
(iup:hbox)) ;; empty widget
)))
(let* ((status-toggles (map (lambda (status)
(iup:toggle (conc status)
#:fontsize 8 ;; btn-fontsz ;; "10"
;; #:expand "HORIZONTAL"
#:action (lambda (obj val)
|
︙ | | |
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
|
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
|
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
|
;; removing the tabdat-values proc
;;
;; (define (tabdat-values tabdat)
;; runs update-rundat using the various filters from the gui
;;
(define (dashboard:do-update-rundat tabdat)
;; this seems like a good place to check for already running and skip if so
;;
;; (set! *updater-running* #t)
;;(if (dboard:tabdat-already-running tabdat)
;; (begin
;; (debug:print-info 0 *default-log-port* "Dashboard overloaded - updates will be slow, skipping update.")
;; (dboard:tabdat-target tabdat))
(dboard:update-rundat
tabdat
(hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "runname" "%")
(dboard:tabdat-numruns tabdat)
(hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%")
;; generate key patterns from the target stored in tabdat
(let* ((dbkeys (dboard:tabdat-dbkeys tabdat)))
(let ((fres (if (dboard:tabdat-target tabdat)
(let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%"))))
(map (lambda (k v)(list k v)) dbkeys ptparts))
(let ((res '()))
(for-each (lambda (key)
(if (not (equal? key "runname"))
(let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f)))
(if val (set! res (cons (list key val) res))))))
dbkeys)
res))))
fres))))
(let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%"))))
(map (lambda (k v)(list k v)) dbkeys ptparts))
(let ((res '()))
(for-each (lambda (key)
(if (not (equal? key "runname"))
(let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f)))
(if val (set! res (cons (list key val) res))))))
dbkeys)
res))))
fres)))
#;(set! *updater-running* #f))
(define (dashboard:runs-tab-updater commondat tab-num)
(debug:catch-and-dump
(lambda ()
(let* ((tabdat (dboard:common-get-tabdat commondat tab-num: tab-num))
(dbkeys (dboard:tabdat-dbkeys tabdat)))
(dashboard:do-update-rundat tabdat)
|
︙ | | |
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
|
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
|
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
|
commondat
(lambda ()
(dashboard:runs-tab-updater commondat 1))
tab-num: 2)
(iup:callback-set! *tim*
"ACTION_CB"
(lambda (time-obj)
(let ((update-is-running #f))
(mutex-lock! (dboard:commondat-update-mutex commondat))
(set! update-is-running (dboard:commondat-updating commondat))
(if (not update-is-running)
(dboard:commondat-updating-set! commondat #t))
(mutex-unlock! (dboard:commondat-update-mutex commondat))
(if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
(begin
(dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)
(mutex-lock! (dboard:commondat-update-mutex commondat))
(dboard:commondat-updating-set! commondat #f)
(mutex-unlock! (dboard:commondat-update-mutex commondat)))
(if (not *updater-running*)
(begin
;; (mutex-lock! (dboard:commondat-update-mutex commondat))
;; (set! update-is-running (dboard:commondat-updating commondat))
;;(if (not update-is-running)
;; (dboard:commondat-updating-set! commondat #t))
;;(mutex-unlock! (dboard:commondat-update-mutex commondat))
;;(if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update
;; (begin
(set! *updater-running* #t)
(dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat)
(set! *updater-running* #f)
;; (mutex-lock! (dboard:commondat-update-mutex commondat))
;; (dboard:commondat-updating-set! commondat #f)
;; (mutex-unlock! (dboard:commondat-update-mutex commondat)))
))
1))))
(let ((th1 (make-thread (lambda ()
(thread-sleep! 1)
(dboard:common-run-curr-updaters commondat 0) ;; force update of summary tab
) "update buttons once"))
|
︙ | | |
︙ | | |
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
|
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
|
-
+
|
exn
(begin
(print "Attempt to safely close sqlite3 db failed. Trying again. exn=" exn)
(thread-sleep! 3)
(sqlite3:interrupt! db)
(db:safely-close-sqlite3-db db stmtcache try-num: (- try-num 1)))
(if (sqlite3:database? db)
(let* ((stmts (hash-table-ref/default stmt-cache db #f)))
(let* ((stmts (and stmt-cache (hash-table-ref/default stmt-cache db #f))))
(if stmts (map sqlite3:finalize! (hash-table-values stmts)))
(sqlite3:finalize! db)
#t)
#f))))
;; close all opened run-id dbs
(define (db:close-all dbstruct)
|
︙ | | |
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
|
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
|
-
-
-
-
-
-
+
+
+
+
+
+
+
|
(thread-sleep! delay-handicap)
(debug:print-info 0 *default-log-port* "synthetic sync delay of "delay-handicap" seconds completed")
)
;; first pass implementation, just insert all changed rows
(for-each
(lambda (targdb)
(let* ((db (db:dbdat-get-db targdb))
(drp-trigger (if (member "last_update" field-names)
(db:drop-trigger db tablename)
#f))
(is-trigger-dropped (if (member "last_update" field-names)
(db:is-trigger-dropped db tablename) #f))
(let* ((db (db:dbdat-get-db targdb))
(drp-trigger (if (member "last_update" field-names)
(db:drop-trigger db tablename)
#f))
(is-trigger-dropped (if (member "last_update" field-names)
(db:is-trigger-dropped db tablename)
#f))
(stmth (sqlite3:prepare db full-ins)))
;; (db:delay-if-busy targdb) ;; NO WAITING
(if (member "last_update" field-names)
(debug:print-info 3 *default-log-port* "is-trigger-dropped: " is-trigger-dropped))
(for-each
(lambda (fromdat-lst)
(sqlite3:with-transaction
|
︙ | | |
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
|
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
|
-
+
-
+
-
+
-
+
-
+
-
+
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
+
+
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
|
(define db:trigger-list
(list (list "update_runs_trigger" "CREATE TRIGGER IF NOT EXISTS update_runs_trigger AFTER UPDATE ON runs
FOR EACH ROW
BEGIN
UPDATE runs SET last_update=(strftime('%s','now'))
WHERE id=old.id;
END;" )
(list "update_run_stats_trigger" "CREATE TRIGGER IF NOT EXISTS update_run_stats_trigger AFTER UPDATE ON run_stats
(list "update_run_stats_trigger" "CREATE TRIGGER IF NOT EXISTS update_run_stats_trigger AFTER UPDATE ON run_stats
FOR EACH ROW
BEGIN
UPDATE run_stats SET last_update=(strftime('%s','now'))
WHERE id=old.id;
END;" )
(list "update_tests_trigger" "CREATE TRIGGER IF NOT EXISTS update_tests_trigger AFTER UPDATE ON tests
(list "update_tests_trigger" "CREATE TRIGGER IF NOT EXISTS update_tests_trigger AFTER UPDATE ON tests
FOR EACH ROW
BEGIN
UPDATE tests SET last_update=(strftime('%s','now'))
WHERE id=old.id;
END;" )
(list "update_teststeps_trigger" "CREATE TRIGGER IF NOT EXISTS update_teststeps_trigger AFTER UPDATE ON test_steps
(list "update_teststeps_trigger" "CREATE TRIGGER IF NOT EXISTS update_teststeps_trigger AFTER UPDATE ON test_steps
FOR EACH ROW
BEGIN
UPDATE test_steps SET last_update=(strftime('%s','now'))
WHERE id=old.id;
END;" )
(list "update_test_data_trigger" "CREATE TRIGGER IF NOT EXISTS update_test_data_trigger AFTER UPDATE ON test_data
(list "update_test_data_trigger" "CREATE TRIGGER IF NOT EXISTS update_test_data_trigger AFTER UPDATE ON test_data
FOR EACH ROW
BEGIN
UPDATE test_data SET last_update=(strftime('%s','now'))
WHERE id=old.id;
END;" )))
(define (db:create-all-triggers dbstruct)
(db:with-db
dbstruct #f #f
(lambda (db)
(db:create-triggers db))))
(define (db:create-triggers db)
(for-each (lambda (key)
(sqlite3:execute db (cadr key)))
db:trigger-list))
(define (db:drop-all-triggers dbstruct)
(db:with-db
(db:with-db
dbstruct #f #f
(lambda (db)
(db:drop-triggers db))))
(db:drop-triggers db))))
(define (db:is-trigger-dropped db tbl-name)
(let* ((trigger-name (if (equal? tbl-name "test_steps")
"update_teststeps_trigger"
(conc "update_" tbl-name "_trigger"))))
(sqlite3:for-each-row
(lambda (name)
;(print name)
(set! res (vector name)))
db
"select name from sqlite_master where type = 'trigger' ;"
)))
(let* ((trigger-name (if (equal? tbl-name "test_steps")
"update_teststeps_trigger"
(conc "update_" tbl-name "_trigger")))
(res #f))
(sqlite3:for-each-row
(lambda (name)
(if (equal? name trigger-name)
(set! res #t)))
db
"SELECT name FROM sqlite_master WHERE type = 'trigger' ;"
)))
(define (db:drop-triggers db)
(for-each (lambda (key)
(sqlite3:execute db (conc "drop trigger " (car key))))
db:trigger-list))
(for-each
(lambda (key)
(sqlite3:execute db (conc "drop trigger if exists " (car key))))
db:trigger-list))
(define (db:drop-trigger db tbl-name)
(let* ((trigger-name (if (equal? tbl-name "test_steps")
"update_teststeps_trigger"
(conc "update_" tbl-name "_trigger"))))
(for-each (lambda (key)
(if (equal? (car key) trigger-name)
(sqlite3:execute db (conc "drop trigger " trigger-name))))
db:trigger-list)))
(let* ((trigger-name (if (equal? tbl-name "test_steps")
"update_teststeps_trigger"
(conc "update_" tbl-name "_trigger"))))
(for-each
(lambda (key)
(if (equal? (car key) trigger-name)
(sqlite3:execute db (conc "drop trigger if exists " trigger-name))))
db:trigger-list)))
(define (db:create-trigger db tbl-name)
(let* ((trigger-name (if (equal? tbl-name "test_steps")
"update_teststeps_trigger"
(conc "update_" tbl-name "_trigger"))))
(for-each (lambda (key)
(if (equal? (car key) trigger-name)
|
︙ | | |
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
|
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
|
-
+
|
id INTEGER PRIMARY KEY,
test_id INTEGER,
state TEXT DEFAULT 'new',
status TEXT DEFAULT 'n/a',
archive_type TEXT DEFAULT 'bup',
du INTEGER,
archive_path TEXT);")))
(print "creating trigges from init")
;; (print "creating trigges from init")
(db:create-triggers db)
db)) ;; )
;;======================================================================
;; A R C H I V E S
;;======================================================================
|
︙ | | |
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
|
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
|
+
+
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
|
(debug:print 0 *default-log-port* "ERROR: cannot read " infile)
(debug:print 0 *default-log-port* "ERROR: run-dir is " run-dir)
#f
)
(with-input-from-file infile read-lines)
)))
;; check duration against test-run.dat file if it exists and update the value in
;; the db if necessary
;; select end_time-now from
;; (select testname,item_path,event_time+run_duration as
;; end_time,strftime('%s','now') as now from tests where state in
;; ('RUNNING','REMOTEHOSTSTART','LAUNCHED'));
;;
(define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)
(let* ((datf (conc run-dir ".mt_data/test-run.dat"))
(modt (if (and (file-exists? datf)
(file-read-access? datf))
(file-modification-time datf)
#f)) ;; (+ event-time run-duration))))
(alt-run-duration (if modt
(- modt event-time)
#f)))
(if (and alt-run-duration
(> alt-run-duration run-duration))
(begin
(debug:print 0 *default-log-port* "Test " test-id " run duration mismatch. Setting to " alt-run-duration)
(db:with-db
dbstruct #f #f
(lambda (db)
(sqlite3:execute db "UPDATE tests SET run_duration=? WHERE id=?;" alt-run-duration test-id)
#t)))
#f))) ;; #f = we did NOT adjust the time
(define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime)
(let* ((incompleted '())
(oldlaunched '())
(toplevels '())
;; The default running-deadtime is 720 seconds = 12 minutes.
;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30))
(deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime")))
|
︙ | | |
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
|
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
|
+
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
-
+
+
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
|
;; HOWEVER: this code in run:test seems to work fine
;; (> (- (current-seconds)(+ (db:test-get-event_time testdat)
;; (db:test-get-run_duration testdat)))
;; 600)
;; (db:delay-if-busy dbdat)
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path event-time run-duration)
(if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration))
(if (and (equal? uname "n/a")
(equal? item-path "")) ;; this is a toplevel test
;; what to do with toplevel? call rollup?
(begin
(set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
(debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
(begin
(set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
(debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
" event-time="event-time" run-duration="run-duration))))
(if (and (equal? uname "n/a")
(equal? item-path "")) ;; this is a toplevel test
;; what to do with toplevel? call rollup?
(begin
(set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
(debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
(begin
(set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))
(debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id="
test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds)
" event-time="event-time" run-duration="run-duration)))))
stmth1
run-id running-deadtime) ;; default time 720 seconds
(sqlite3:for-each-row
(lambda (test-id run-dir uname testname item-path event-time run-duration)
(if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration))
(if (and (equal? uname "n/a")
(equal? item-path "")) ;; this is a toplevel test
;; what to do with toplevel? call rollup?
(begin
(set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
(debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
(begin
(debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
" run-duration="run-duration)
(set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))
(if (and (equal? uname "n/a")
(equal? item-path "")) ;; this is a toplevel test
;; what to do with toplevel? call rollup?
(begin
(set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels))
(debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id))
(begin
(debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id
" exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time
" run-duration="run-duration)
(set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted))))))
stmth2
run-id remotehoststart-deadtime) ;; default time 230 seconds
;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config
;;
;; (db:delay-if-busy dbdat)
(sqlite3:for-each-row
|
︙ | | |
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
|
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
|
-
-
+
+
-
-
-
+
+
+
+
|
(string->number res)
#f)))
(if newres
newres
res))
res)))
(define (db:no-sync-close-db db stmtcache)
(db:safely-close-sqlite3-db db stmtcache))
(define (db:no-sync-close-db db stmt-cache)
(db:safely-close-sqlite3-db db stmt-cache))
;; transaction protected lock aquisition
;; either:
;; fails returns (#f . lock-creation-time)
;; succeeds (returns (#t . lock-creation-time)
;; use (db:no-sync-del! db keyname) to release the lock
;;
(define (db:no-sync-get-lock db-in keyname)
(let ((db (db:no-sync-db db-in)))
(sqlite3:with-transaction
db
(lambda ()
(handle-exceptions
exn
(let ((lock-time (current-seconds)))
(sqlite3:execute db "INSERT INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-time)
`(#t . ,lock-time))
(let ((lock-time (current-seconds)))
(debug:print-info 2 *default-log-port* "db:no-sync-get-lock keyname=" keyname ", lock-time=" lock-time ", exn=" exn)
(sqlite3:execute db "INSERT INTO no_sync_metadat (var,val) VALUES(?,?);" keyname lock-time)
`(#t . ,lock-time))
`(#f . ,(sqlite3:first-result db "SELECT val FROM no_sync_metadat WHERE var=?;" keyname)))))))
;; use a global for some primitive caching, it is just silly to
;; re-read the db over and over again for the keys since they never
;; change
|
︙ | | |
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
|
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
|
-
+
+
|
(let loop ((hed (car header))
(tal (cdr header))
(n 0))
(if (equal? hed field)
(handle-exceptions
exn
(begin
(debug:print 0 *default-log-port* "WARNING: attempt to read non-existant field, row=" row " header=" header " field=" field ", exn=" exn)
(debug:print 0 *default-log-port* "WARNING: attempt to read non-existant field, row="
row " header=" header " field=" field ", exn=" exn)
#f)
(vector-ref row n))
(if (null? tal) #f (loop (car tal)(cdr tal)(+ n 1)))))))
;; Accessors for the header/data structure
;; get rows and header from
(define (db:get-header vec)(vector-ref vec 0))
|
︙ | | |
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
|
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
|
-
+
|
(let* ((run-ids (db:get-all-run-ids mtdb)))
(for-each
(lambda (run-id)
(let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id)))
(db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs)))
run-ids)))
;; Get test data using test_id, run-id is not used
;; Get test data using test_id, run-id is not used - but it will be!
;;
(define (db:get-test-info-by-id dbstruct run-id test-id)
(db:with-db
dbstruct
#f ;; run-id
#f
(lambda (db)
|
︙ | | |
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
|
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
|
-
+
|
(if dbdat
(let* ((dbpath (db:dbdat-get-path dbdat))
(db (db:dbdat-get-db dbdat)) ;; we'll return this so (db:delay--if-busy can be called inline
(dbfj (conc dbpath "-journal")))
(if (handle-exceptions
exn
(begin
(debug:print-info 0 *default-log-port* "WARNING: failed to test for existance of " dbfj)
(debug:print-info 0 *default-log-port* "WARNING: failed to test for existance of " dbfj ", exn=" exn)
(thread-sleep! 1)
(db:delay-if-busy count (- count 1)))
(common:file-exists? dbfj))
(case count
((6)
(thread-sleep! 0.2)
(db:delay-if-busy count: 5))
|
︙ | | |
︙ | | |
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
-
+
+
|
;;
;; You should have received a copy of the GNU General Public License
;; along with Megatest. If not, see <http://www.gnu.org/licenses/>.
;; strftime('%m/%d/%Y %H:%M:%S','now','localtime')
(use (prefix sqlite3 sqlite3:) srfi-1 posix regex regex-case srfi-69 (srfi 18)
posix-extras directory-utils pathname-expand typed-records format sxml-serializer sxml-modifications)
posix-extras directory-utils pathname-expand typed-records format sxml-serializer
sxml-modifications matchable)
(declare (unit runs))
(declare (uses db))
(declare (uses common))
(declare (uses items))
(declare (uses runconfig))
(declare (uses tests))
|
︙ | | |
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
|
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
|
-
+
+
+
+
+
+
|
;;
(defstruct runs:dat
reglen regfull
runname max-concurrent-jobs run-id
test-patts required-tests test-registry
registry-mutex flags keyvals run-info all-tests-registry
can-run-more-tests
;; stores results from last runs:can-run-more-tests
(can-run-more-tests #f) ;; (list can-run-more-flag num-running num-running-in-jobgroup max-concurrent-jobs job-group-limit)
((can-run-more-tests-count 0) : fixnum)
(last-fuel-check 0) ;; time when we last checked fuel
(beginning-of-time (current-seconds))
(load-mgmt-function #f)
(wait-for-jobs-function #f)
(last-load-check-time 0)
(last-jobs-check-time 0)
)
(defstruct runs:testdat
hed tal reg reruns test-record
test-name item-path jobgroup
waitons testmode newtal itemmaps prereqs-not-met)
|
︙ | | |
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
|
-
+
|
((< age 10) #t)
(else #f))))
lock-files)))
(if fresh-locks
(begin
(if (runs:lownoise "runners-softlock-wait" 360)
(debug:print-info 0 *default-log-port* "Other runners in flight, giving up some time..."))
(thread-sleep! 10))
(thread-sleep! 2))
(begin
(if (runs:lownoise "runners-softlock-nowait" 360)
(debug:print-info 0 *default-log-port* "No runners in flight, updating softlock"))
(let* ((ouf (open-output-file my-lock-file)))
(with-output-to-port ouf
(lambda ()(print (current-seconds))))
(close-output-port ouf))))
|
︙ | | |
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
|
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
|
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
+
|
(define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)
;; Take advantage of a good place to exit if running the one-pass methodology
(if (and (> (runs:dat-can-run-more-tests-count runsdat) 20)
(args:get-arg "-one-pass"))
(exit 0))
(thread-sleep! (cond ;; BB: check with Matt. Should this sleep move
;; to cond clauses below where we determine we
;; have too many jobs running rather than each
;; time the and condition above is true (which
;; seems like always)?
((< (- (current-seconds)(runs:dat-beginning-of-time runsdat)) 30) ;; for the first 30 seconds do not throttle in any way
0)
((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time
(if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
(if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ..."))
10) ;; obviously haven't had any work to do for a while
(else 0)))
;; ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero?
;; (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)
;; )))
(let* ((num-running (rmt:get-count-tests-running run-id #f)) ;; fastmode=no
(num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup))
(job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup)))
(if (string? jobg-count)
(string->number jobg-count)
jobg-count))))
(if (> (+ num-running num-running-in-jobgroup) 0)
|
︙ | | |
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
|
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
|
-
+
|
(debug:print-info 0 *default-log-port* "filtering initial test list with tagexpr: " (args:get-arg "-tagexpr") " => " allowed-tests)
));; tests will be ANDed with this list
;; register this run in monitor.db
(rmt:tasks-add "run-tests" user target runname test-patts task-key) ;; params)
(rmt:tasks-set-state-given-param-key task-key "running")
(common:telemetry-log "run-tests"
#;(common:telemetry-log "run-tests"
payload:
`( (target . ,target)
(run-name . ,runname)
(test-patts . ,test-patts) ) )
;; Now generate all the tests lists
|
︙ | | |
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
|
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
|
-
+
-
+
+
+
+
+
|
((and (null? fails) ;; have not-started tests, but unable to run them. everything looks completed with no prospect of unsticking something that is stuck. we should mark hed as moribund and exit or continue if there are more tests to consider
(null? prereq-fails)
(null? non-completed))
(debug:print-info 4 *default-log-port* "cond branch - " "ei-4")
(if (runs:can-keep-running? hed 20)
(begin
(runs:inc-cant-run-tests hed)
(debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0)) ;;
(debug:print-info 0 *default-log-port* "no fails in prerequisites for " hed " but also none running, keeping " hed " for now. Try count: " (hash-table-ref/default *seen-cant-run-tests* hed 0) ", going to wait 60 sec.") ;;
;; getting here likely means the system is way overloaded, kill a full minute before continuing
(thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
;; (thread-sleep! 60) ;; TODO: gate by normalized server load > 1.0 (maxload config thing) CHECKTHIS!!!
;; No runsdat, can't do this yet
;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
;;
(thread-sleep! 5) ;; TODO: gate by normalized server load > 1.0 (maxload config thing)
;; num-retries code was here
;; we use this opportunity to move contents of reg to tal
(list (car newtal)(append (cdr newtal) reg) '() reruns)) ;; an issue with prereqs not yet met?
(begin
(debug:print-info 1 *default-log-port* "no fails in prerequisites for " hed " but nothing seen running in a while, dropping test " hed " from the run queue")
(let ((test-id (rmt:get-test-id run-id hed "")))
(if test-id (mt:test-set-state-status-by-id run-id test-id "NOT_STARTED" "TIMED_OUT" "Nothing seen running in a while.")))
|
︙ | | |
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
|
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
|
+
+
+
+
+
+
+
+
+
+
+
+
-
+
+
+
+
+
+
+
|
(if (vector? t)
(conc (db:test-get-state t) "/" (db:test-get-status t))
(conc " WARNING: t is not a vector=" t )))
prereqs-not-met)
", ") ") fails: " fails
"\nregistered? " (hash-table-ref/default test-registry (db:test-make-full-name test-name item-path) #f))
;; well, first lets see if cpu load throttling is enabled. If so wait around until the
;; average cpu load is under the threshold before continuing
;;
(if (runs:dat-load-mgmt-function runsdat)
((runs:dat-load-mgmt-function runsdat))
(runs:dat-load-mgmt-function-set!
runsdat
(lambda ()
;; jobtools maxload is useful for where the full Megatest run is done on one machine
(if (and (not (common:on-homehost?))
maxload) ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
(common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
(if maxhomehostload
(common:wait-for-homehost-load maxhomehostload
(conc "Waiting for homehost load to drop below normalized value of " maxhomehostload))))))
(if (and (not (null? prereqs-not-met))
(runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60))
(debug:print-info 2 *default-log-port* "waiting on tests; " (string-intersperse (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met) ", ")))
;; Don't know at this time if the test have been launched at some time in the past
;; i.e. is this a re-launch?
|
︙ | | |
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
|
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
|
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
+
+
+
-
+
-
+
-
-
-
|
(null? non-completed)
(not (member 'exclusive testmode)))))
;; (hash-table-delete! *max-tries-hash* (db:test-make-full-name test-name item-path))
;; we are going to reset all the counters for test retries by setting a new hash table
;; this means they will increment only when nothing can be run
(set! *max-tries-hash* (make-hash-table))
;; well, first lets see if cpu load throttling is enabled. If so wait around until the
;; average cpu load is under the threshold before continuing
;; jobtools maxload is useful for where the full Megatest run is done on one machine
(if maxload ;; only gate if maxload is specified, NOTE: maxload is normalized, i.e. load=1 means all cpus fully utilized
(common:wait-for-normalized-load maxload "Waiting for load to drop before starting more tests" #f))
;; jobtools maxhomehostload is intended to prevent overloading on the homehost which can cause database corruption issues
(if maxhomehostload
(common:wait-for-homehost-load maxhomehostload
(conc "Waiting for homehost load to drop below normalized value of " maxhomehostload)))
(run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry)
(run:test run-id run-info keyvals runname test-record flags #f test-registry all-tests-registry runsdat testdat)
(runs:incremental-print-results run-id)
(hash-table-set! test-registry (db:test-make-full-name test-name item-path) 'running)
(runs:shrink-can-run-more-tests-count runsdat) ;; DELAY TWEAKER (still needed?)
;; (thread-sleep! *global-delta*)
(if (or (not (null? tal))(not (null? reg)))
(runs:loop-values tal reg reglen regfull reruns)
(runs:loop-values tal reg reglen regfull reruns) ;; hed should be dropped at this time
#f))
;; must be we have unmet prerequisites
;;
(else
(debug:print 4 *default-log-port* "FAILS: " fails)
;; If one or more of the prereqs-not-met are FAIL then we can issue
;; a message and drop hed from the items to be processed.
;; (runs:mixed-list-testname-and-testrec->list-of-strings prereqs-not-met)
(if (and (not (null? prereqs-not-met))
(runs:lownoise (conc "waiting on tests " prereqs-not-met hed) 60))
(debug:print-info 1 *default-log-port* "waiting on tests; " (string-intersperse
(runs:mixed-list-testname-and-testrec->list-of-strings
prereqs-not-met) ", ")))
(if (or (null? fails)
(member 'toplevel testmode))
(begin
;; couldn't run, take a breather
(if (runs:lownoise "Waiting for more work to do..." 60)
(debug:print-info 0 *default-log-port* "Waiting for more work to do..."))
;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
(thread-sleep! 1)
(thread-sleep! 5)
(list (car newtal)(cdr newtal) reg reruns))
;; the waiton is FAIL so no point in trying to run hed ever again
(begin
(let ((my-test-id (rmt:get-test-id run-id test-name item-path)))
(mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2"))
(mt:test-set-state-status-by-id-unless-completed run-id my-test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites2"))
(if (or (not (null? reg))(not (null? tal)))
(if (vector? hed)
(begin
(debug:print 1 *default-log-port* "WARNING: Dropping test " test-name "/" item-path
" from the launch list as it has prerequistes that are FAIL")
(let ((test-id (rmt:get-test-id run-id hed "")))
(if test-id (mt:test-set-state-status-by-id-unless-completed run-id test-id "COMPLETED" "PREQ_FAIL" "Failed to run due to failed prerequisites")))
|
︙ | | |
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
|
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
|
-
-
-
-
-
-
-
-
-
|
testmode: testmode
newtal: newtal
itemmaps: itemmaps
;; prereqs-not-met: prereqs-not-met
)))
(runs:dat-regfull-set! runsdat regfull)
;; -- removed BB 17ww28 - no longer needed.
;; every 15 minutes verify the server is there for this run
;; (if (and (common:low-noise-print 240 "try start server" run-id)
;; (not (or (and *runremote*
;; (remote-server-url *runremote*)
;; (server:ping (remote-server-url *runremote*)))
;; (server:check-if-running *toppath*))))
;; (server:kind-run *toppath*))
(if (> num-running 0)
(set! last-time-some-running (current-seconds)))
(if (> (current-seconds)(+ last-time-some-running (or (configf:lookup *configdat* "setup" "give-up-waiting") 36000)))
(hash-table-set! *max-tries-hash* tfullname (+ (hash-table-ref/default *max-tries-hash* tfullname 0) 1)))
;; (debug:print 0 *default-log-port* "max-tries-hash: " (hash-table->alist *max-tries-hash*))
|
︙ | | |
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
|
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
|
-
+
|
"\n num-retries: " num-retries
"\n reruns: " reruns
"\n regfull: " regfull
"\n reglen: " reglen
"\n length reg: " (length reg)
)
(runs:parallel-runners-mgmt runsdat)
;; (runs:parallel-runners-mgmt runsdat)
;; check for hed in waitons => this would be circular, remove it and issue an
;; error
(if (member test-name waitons)
(begin
(debug:print-error 0 *default-log-port* "test " test-name " has listed itself as a waiton, please correct this!")
(set! waiton (filter (lambda (x)(not (equal? x hed))) waitons))))
|
︙ | | |
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
|
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
|
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
+
+
+
|
;;
((not items)
(debug:print-info 4 *default-log-port* "cond branch - " "rtq-2")
(debug:print-info 4 *default-log-port* "OUTER COND: (not items)")
(if (and (not (tests:match test-patts (tests:testqueue-get-testname test-record) item-path required: required-tests))
(not (null? tal)))
(loop (car tal)(cdr tal) reg reruns))
(runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
;;(runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
;; gonna try a strategy change here.
;;
;; check if can run more tests. if yes, continue, if no, rest until can run more
;; look at the test jobgroup and tot jobs running
;;
;; NOTE: This does NOT actually gate here, only captures the proc to be called later
;;
(if (not (runs:dat-wait-for-jobs-function runsdat))
(runs:dat-wait-for-jobs-function-set!
runsdat
(lambda (testdat-in)
(let* ((jobgroup (runs:testdat-jobgroup testdat-in))
(can-run-more-tests (runs:dat-can-run-more-tests runsdat))
(last-jobs-check-time (runs:dat-last-jobs-check-time runsdat))
(should-check-jobs (match can-run-more-tests
((can-run-more-flag num-running nr-in-jobgroup max-concurrent-jobs . params)
(if (< (- max-concurrent-jobs num-running) 25)
(begin
(debug:print-info 0 *default-log-port*
"less than 20 jobs headroom, ("max-concurrent-jobs
"-"num-running")>20. Forcing prelaunch check.")
#t)
#f))
(else #f)))) ;; no record yet
;; This would be a good place to block on homehost load
(if should-check-jobs
(let loop-can-run-more
((res (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))
(remtries 1440)) ;; we can wait for up to two hours for jobs to get done
(match res
((run-more num-running . rem)
(if (or run-more
(< remtries 1))
(begin
(if (runs:lownoise "num-running" 30)
(debug:print-info 0 *default-log-port* "Have "num-running" tests of max " max-concurrent-jobs))
(runs:dat-can-run-more-tests-set! runsdat res)) ;; capture the result and then drop through
(begin
(if (runs:lownoise "num-running" 10)
(debug:print-info 0 *default-log-port* "Can't run more tests, have "num-running" tests of "
max-concurrent-jobs " allowed."))
(thread-sleep! 5) ;; if we've hit max concurrent jobs take a breather, nb// make this configurable
;; wait for load here
(if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
(loop-can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)
(- remtries 1)))))))
)))))
;; I'm not clear on why prereqs are gathered here TODO: verfiy this is needed
(runs:testdat-prereqs-not-met-set! testdat (rmt:get-prereqs-not-met run-id waitons hed item-path mode: testmode itemmaps: itemmaps))
(runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)) ;; look at the test jobgroup and tot jobs running
(let ((loop-list (runs:process-expanded-tests runsdat testdat)))
;; I'm not clear on why we'd capture running job counts here TODO: verify this is needed
(runs:dat-can-run-more-tests-set! runsdat (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))
(let ((loop-list (runs:process-expanded-tests runsdat testdat))) ;; in process-expanded-tests ultimately run:test -> launch-test -> test actually running
(if loop-list (apply loop loop-list))))
;; items processed into a list but not came in as a list been processed
;;
((and (list? items) ;; thus we know our items are already calculated
(not itemdat)) ;; and not yet expanded into the list of things to be done
(debug:print-info 4 *default-log-port* "cond branch - " "rtq-3")
|
︙ | | |
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
|
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
|
-
+
-
-
+
+
+
+
+
-
+
-
-
|
(loop (car tal)(cdr tal) reg reruns)))
;; if items is a proc then need to run items:get-items-from-config, get the list and loop
;; - but only do that if resources exist to kick off the job
;; EXPAND ITEMS
((or (procedure? items)(eq? items 'have-procedure))
(debug:print-info 4 *default-log-port* "cond branch - " "rtq-4")
(let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)))
(let ((can-run-more (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs)))
(if (and (list? can-run-more)
(car can-run-more))
(let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) ;; itemized test expanded here
(car can-run-more)) ;; itemized test expanded here
(let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup
max-concurrent-jobs run-id waitons item-path
testmode test-record can-run-more items runname
tconfig reglen test-registry test-records itemmaps)))
(if loop-list
(apply loop loop-list)
(debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)
(debug:print-info 4 *default-log-port* " -- Can't expand hed="hed)))
)
)
;; if can't run more just loop with next possible test
(loop (car newtal)(cdr newtal) reg reruns))))
;; this case should not happen, added to help catch any bugs
((and (list? items) itemdat)
(debug:print-info 4 *default-log-port* "cond branch - " "rtq-5")
(debug:print-error 0 *default-log-port* "Should not have a list of items in a test and the itemspath set - please report this")
|
︙ | | |
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
|
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
|
+
+
+
-
+
|
(debug:print-info 4 *default-log-port* "Exiting loop with...\n hed=" hed "\n tal=" tal "\n reruns=" reruns))
))) ;; end loop on sorted test names
;; this is the point where everything is launched and now you can mark the run in metadata table as all launched
(rmt:set-var (conc "lunch-complete-" run-id) "yes")
;; now *if* -run-wait we wait for all tests to be done
;; Now wait for any RUNNING tests to complete (if in run-wait mode)
;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
(thread-sleep! 10) ;; I think there is a race condition here. Let states/statuses settle
(let wait-loop ((num-running (rmt:get-count-tests-running-for-run-id run-id #t)) ;; fastmode=yes
(prev-num-running 0))
;; (debug:print-info 13 *default-log-port* "num-running=" num-running ", prev-num-running=" prev-num-running)
(if (and (or (args:get-arg "-run-wait")
(equal? (configf:lookup *configdat* "setup" "run-wait") "yes"))
(> num-running 0))
(begin
;; Here we mark any old defunct tests as incomplete. Do this every fifteen minutes
;; (debug:print 0 *default-log-port* "Got here eh! num-running=" num-running " (> num-running 0) " (> num-running 0))
(if (> (current-seconds)(+ last-time-incomplete 900))
(let ((actual-num-running (rmt:get-count-tests-running-for-run-id run-id #f))) ;; fastmode=no
(debug:print-info 0 *default-log-port* "Marking stuck tests as INCOMPLETE while waiting for run " run-id
". Running as pid " (current-process-id) " on " (get-host-name))
(set! last-time-incomplete (current-seconds)) ;; FIXME, this might be causing slow down - use of set!
(rmt:find-and-mark-incomplete run-id #f)
(debug:print-info 0 *default-log-port* "run-wait specified, waiting on " actual-num-running
" tests in RUNNING, REMOTEHOSTSTART or LAUNCHED state at "
(time->string (seconds->local-time (current-seconds))))))
;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
(thread-sleep! 5)
(thread-sleep! 5) ;; (if (>= num-running max-concurrent-jobs) 5 1))
(wait-loop (rmt:get-count-tests-running-for-run-id run-id #t) ;; fastmode=yes
num-running))))
;; LET* ((test-record
;; we get here on "drop through". All done!
;; this is moved to runs:run-testes since this function is getting called twice to ensure everthing is completed.
;; (debug:print-info 0 *default-log-port* "Calling Post Hook")
;; (runs:run-post-hook run-id)
|
︙ | | |
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
|
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
|
-
+
|
(if (not (vector? t))
(conc t)
(conc (db:test-get-testname t) ":" (db:test-get-state t) "/" (db:test-get-status t))))
lst))
;; parent-test is there as a placeholder for when parent-tests can be run as a setup step
;;
(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry)
(define (run:test run-id run-info keyvals runname test-record flags parent-test test-registry all-tests-registry runsdat testdat-rec)
;; All these vars might be referenced by the testconfig file reader
;;
;; NEED to reprocess testconfig here, ensuring that item variables are available.
;; This is for Tal's issue with item-specific env vars not being set for use in skip.
;; HSD https://hsdes.intel.com/appstore/icf/index.html#/article?articleId=1408763273
;;
(let* ((test-name (tests:testqueue-get-testname test-record))
|
︙ | | |
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
|
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
|
-
-
+
+
+
|
(debug:print 2 *default-log-port* "WARN: Test not pre-created? test-name=" test-name ", item-path=" item-path ", run-id=" run-id)
(rmt:register-test run-id test-name item-path)
(set! test-id (rmt:get-test-id run-id test-name item-path))))
(debug:print-info 4 *default-log-port* "test-id=" test-id ", run-id=" run-id ", test-name=" test-name ", item-path=\"" item-path "\"")
(set! testdat (rmt:get-test-info-by-id run-id test-id))
(if (not testdat)
(begin
(debug:print-info 0 *default-log-port* "WARNING: server is overloaded, trying again in one second")
(thread-sleep! 1)
(debug:print-info 0 *default-log-port* "WARNING: server is overloaded, trying again in two seconds")
;; (if (runs:dat-load-mgmt-function runsdat)((runs:dat-load-mgmt-function runsdat)))
(thread-sleep! 2)
(loop)))))
(if (not testdat) ;; should NOT happen
(debug:print-error 0 *default-log-port* "failed to get test record for test-id " test-id))
(set! test-id (db:test-get-id testdat))
(if (common:file-exists? test-path)
(change-directory test-path)
(begin
|
︙ | | |
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
|
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
|
+
+
+
+
+
-
-
-
-
-
+
+
+
+
+
+
+
+
|
(if skip-test
(begin
(mt:test-set-state-status-by-id run-id test-id "COMPLETED" "SKIP" skip-test)
(debug:print-info 1 *default-log-port* "SKIPPING Test " full-test-name " due to " skip-test))
;;
;; Here the test is handed off to launch.scm for launch-test to complete the launch process
;;
(begin
;; wait for less than max jobs here
(if (runs:dat-wait-for-jobs-function runsdat)
((runs:dat-wait-for-jobs-function runsdat) testdat-rec))
(if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags))
(begin
(print "ERROR: Failed to launch the test. Exiting as soon as possible")
(set! *globalexitstatus* 1) ;;
(process-signal (current-process-id) signal/kill))))))))
(if (not (launch-test test-id run-id run-info keyvals runname test-conf test-name test-path itemdat flags))
(begin
(print "ERROR: Failed to launch the test. Exiting as soon as possible")
(set! *globalexitstatus* 1) ;;
(process-signal (current-process-id) signal/kill))
)
;; wait again here?
))))))
((KILLED)
(debug:print 1 *default-log-port* "NOTE: " full-test-name " is already running or was explictly killed, use -force to launch it.")
(hash-table-set! test-registry (db:test-make-full-name test-name test-path) 'DONOTRUN)) ;; KILLED))
((LAUNCHED REMOTEHOSTSTART RUNNING)
(debug:print 2 *default-log-port* "NOTE: " test-name " is already running"))
;; (if (> (- (current-seconds)(+ (db:test-get-event_time testdat)
;; (db:test-get-run_duration testdat)))
|
︙ | | |
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
|
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
|
-
+
|
(debug:print-info 2 *default-log-port* "new state " new-state ", new status " new-status )
(mt:test-set-state-status-by-id run-id test-id new-state new-status #f))
(if (not (null? tal))
(loop (car tal)(cdr tal))))
((run-wait)
;; BB TODO - manage has-subrun case
(debug:print-info 2 *default-log-port* "still waiting, " (length tests) " tests still running")
(thread-sleep! 10)
(thread-sleep! 5)
(let ((new-tests (proc-get-tests run-id)))
(if (null? new-tests)
(debug:print-info 1 *default-log-port* "Run completed according to zero tests matching provided criteria.")
(loop (car new-tests)(cdr new-tests)))))
((archive)
;; BB TODO - manage has-subrun case
(if (and run-dir (not toplevel-with-children))
|
︙ | | |
︙ | | |
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
|
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
|
-
-
-
+
+
+
-
-
-
-
-
+
+
+
+
+
-
-
-
-
-
-
+
+
+
+
+
+
|
;; otherwise attempt to create the logs dir and then
;; continue
(if (if (directory-exists? (conc areapath "/logs"))
'()
(if (file-write-access? areapath)
(begin
(condition-case
(create-directory (conc areapath "/logs") #t)
(exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
(exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list.")))
(create-directory (conc areapath "/logs") #t)
(exn (i/o file)(debug:print 0 *default-log-port* "ERROR: Cannot create directory at " (conc areapath "/logs")))
(exn ()(debug:print 0 *default-log-port* "ERROR: Unknown error attemtping to get server list. exn=" exn)))
(directory-exists? (conc areapath "/logs")))
'()))
(let* ((server-logs (glob (conc areapath "/logs/server-*.log")))
(num-serv-logs (length server-logs)))
(if (null? server-logs)
'()
(let loop ((hed (car server-logs))
(tal (cdr server-logs))
(res '()))
(let* ((mod-time (handle-exceptions
exn
(begin
(print "failed to get modification time on " hed ", exn=" exn)
(current-seconds)) ;; 0
(file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
exn
(begin
(print "failed to get modification time on " hed ", exn=" exn)
(current-seconds)) ;; 0
(file-modification-time hed))) ;; default to *very* old so log gets ignored if deleted
(down-time (- (current-seconds) mod-time))
(serv-dat (if (or (< num-serv-logs 10)
(< down-time 900)) ;; day-seconds))
(server:logf-get-start-info hed)
'())) ;; don't waste time processing server files not touched in the 15 minutes if there are more than ten servers to look at
(serv-rec (cons mod-time serv-dat))
(fmatch (string-match fname-rx hed))
(pid (if fmatch (string->number (list-ref fmatch 2)) #f))
(new-res (if (null? serv-dat)
res
(cons (append serv-rec (list pid)) res))))
(if (null? tal)
(if (and limit
(> (length new-res) limit))
new-res ;; (take new-res limit) <= need intelligent sorting before this will work
new-res)
(loop (car tal)(cdr tal) new-res)))))))))
(if (null? tal)
(if (and limit
(> (length new-res) limit))
new-res ;; (take new-res limit) <= need intelligent sorting before this will work
new-res)
(loop (car tal)(cdr tal) new-res)))))))))
(define (server:get-num-alive srvlst)
(let ((num-alive 0))
(for-each
(lambda (server)
(match-let (((mod-time host port start-time pid)
server))
|
︙ | | |
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
|
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
|
+
-
-
-
-
+
+
+
+
|
(> (length rec) 2))
(let ((start-time (list-ref rec 3))
(mod-time (list-ref rec 0)))
;; (print "start-time: " start-time " mod-time: " mod-time)
(and start-time mod-time
(> (- now start-time) 0) ;; been running at least 0 seconds
(< (- now mod-time) 16) ;; still alive - file touched in last 16 seconds
(or (not (configf:lookup *configdat* "server" "runtime")) ;; skip if not set
(< (- now start-time)
(+ (- (string->number (or (configf:lookup *configdat* "server" "runtime") "3600"))
180)
(random 360))) ;; under one hour running time +/- 180
(< (- now start-time)
(+ (- (string->number (configf:lookup *configdat* "server" "runtime"))
180)
(random 360)))) ;; under one hour running time +/- 180
))
#f))
srvlst)
(lambda (a b)
(< (list-ref a 3)
(list-ref b 3))))))
(if (> (length slst) nums)
|
︙ | | |
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
|
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
|
-
+
+
+
+
-
-
-
-
-
+
+
+
+
+
+
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
-
-
-
+
+
|
(let ((sig (server:mk-signature)))
(set! *my-client-signature* sig)
*my-client-signature*)))
;; wait for server=start-last to be three seconds old
;;
(define (server:wait-for-server-start-last-flag areapath)
(let* ((start-flag (conc areapath "/logs/server-start-last")))
(let* ((start-flag (conc areapath "/logs/server-start-last"))
;;; THIS INTERACTS WITH [server] timeout. Suggest using 0.1 or above for timeout (6 seconds)
(reftime (configf:lookup-number *configdat* "server" "idletime" default: 4))
(server-key (conc (get-host-name) "-" (current-process-id))))
(if (file-exists? start-flag)
(let* ((fmodtime (file-modification-time start-flag))
(reftime (+ 3 (random 5)))
(delta (- (current-seconds) fmodtime)))
(if (> delta reftime) ;; good enough
(begin
(debug:print-info 0 *default-log-port* "Ready to start server, last start: "
(delta (- (current-seconds) fmodtime))
(all-go (> delta reftime)))
(if (handle-exceptions
exn
(begin
(debug:print-info 0 *default-log-port* "Something not right with soft server locking: exn=" exn)
fmodtime ", delta: " delta ", reftime: " reftime)
(system (conc "touch " start-flag))) ;; lazy but safe
#t)
(and all-go
(begin
(with-output-to-file start-flag
(lambda ()
(print server-key)))
(thread-sleep! 0.25)
(let ((res (with-input-from-file start-flag
(lambda ()
(read-line)))))
(equal? server-key res)))))
#t ;; (system (conc "touch " start-flag)) ;; lazy but safe
(begin
(debug:print-info 0 *default-log-port* "Gating server start, last start: "
fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go)
(thread-sleep! 5)
(server:wait-for-server-start-last-flag areapath))))
(thread-sleep! reftime)
(server:wait-for-server-start-last-flag areapath)))))))
(system (conc "touch " start-flag)))))
;; kind start up of servers, wait 40 seconds before allowing another server for a given
;; run-id to be launched
;;
(define (server:kind-run areapath)
;; look for $MT_RUN_AREA_HOME/logs/server-start-last
;; and wait for it to be at least 3 seconds old
(server:wait-for-server-start-last-flag areapath)
(if (not (server:check-if-running areapath)) ;; why try if there is already a server running?
(let* ((last-run-dat (hash-table-ref/default *server-kind-run* areapath '(0 0))) ;; callnum, whenrun
(call-num (car last-run-dat))
(when-run (cadr last-run-dat))
(run-delay (+ (case call-num
((0) 0)
((1) 20)
((2) 300)
(else 600))
(random 5))) ;; add a small random number just in case a lot of jobs hit the work hosts simultaneously
(lock-file (conc areapath "/logs/server-start.lock")))
(if (> (- (current-seconds) when-run) run-delay)
(begin
(let* ((start-flag (conc areapath "/logs/server-start-last")))
(common:simple-file-lock-and-wait lock-file expire-time: 15)
(system (conc "touch " start-flag)) ;; lazy but safe
(server:run areapath)
(thread-sleep! 2) ;; don't release the lock for at least a few seconds
(common:simple-file-release-lock lock-file)))
(hash-table-set! *server-kind-run* areapath (list (+ call-num 1)(current-seconds))))))
;; this one seems to be the general entry point
;;
|
︙ | | |
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
|
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
|
+
-
-
+
+
+
+
+
+
+
+
+
-
+
|
(debug:print 0 *default-log-port* "INFO: syncer thread NOT sleeping ; maybe time-to-exit...")))
(if (not (configf:lookup *configdat* "server" "disable-db-snapshot"))
(common:snapshot-file mtdbfile subdir: ".db-snapshot"))
(delete-file* staging-file)
(let* ((start-time (current-milliseconds))
(res (system sync-cmd))
(dbbackupfile (conc mtdbfile ".backup"))
(res2
(cond
((eq? 0 res)
(delete-file* (conc mtdbfile ".backup"))
((eq? 0 res )
(handle-exceptions
exn
(begin
(debug:print-info 0 *default-log-port* "Problem making backup db file, exn=" exn)
#t)
(if (file-exists? dbbackupfile)
(delete-file* dbbackupfile)
)
(if (eq? 0 (file-size sync-log))
(delete-file sync-log))
(system (conc "/bin/mv " staging-file " " mtdbfile))
)
(set! last-sync-seconds (/ (- (current-milliseconds) start-time) 1000))
(set! off-time (calculate-off-time
last-sync-seconds
(cond
((and (number? duty-cycle) (> duty-cycle 0) (< duty-cycle 1))
duty-cycle)
(else
|
︙ | | |