︙ | | | ︙ | |
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
|
((dboard) (conc updir progname))
((mtest) (conc updir progname))
((dashboard) progname)
(else exe)))))
'("../../" "../")))))
(if (null? res)
(begin
(debug:print 0 *current-log-port* "Failed to find this executable! Using what can be found on the path")
progname)
(car res))))
(define *common:logpro-exit-code->status-sym-alist*
'( ( 0 . pass )
( 1 . fail )
( 2 . warn )
|
|
|
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
|
((dboard) (conc updir progname))
((mtest) (conc updir progname))
((dashboard) progname)
(else exe)))))
'("../../" "../")))))
(if (null? res)
(begin
(debug:print 0 *default-log-port* "Failed to find this executable! Using what can be found on the path")
progname)
(car res))))
(define *common:logpro-exit-code->status-sym-alist*
'( ( 0 . pass )
( 1 . fail )
( 2 . warn )
|
︙ | | | ︙ | |
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
|
(common:version-signature))))
(define (common:api-changed?)
(not (equal? (substring (->string megatest-version) 0 4)
(substring (conc (common:get-last-run-version)) 0 4))))
(define (common:get-sync-lock-filepath)
(let* ((tmp-area (common:get-db-tmp-area))
(lockfile (conc tmp-area "/megatest.db.sync-lock")))
lockfile))
;; Move me elsewhere ...
;; RADT => Why do we meed the version check here, this is called only if version misma
;;
(define (common:cleanup-db dbstruct #!key (full #f))
(apply db:multi-db-sync
dbstruct
'schema
|
<
<
<
<
<
|
384
385
386
387
388
389
390
391
392
393
394
395
396
397
|
(common:version-signature))))
(define (common:api-changed?)
(not (equal? (substring (->string megatest-version) 0 4)
(substring (conc (common:get-last-run-version)) 0 4))))
;; Move me elsewhere ...
;; RADT => Why do we meed the version check here, this is called only if version misma
;;
(define (common:cleanup-db dbstruct #!key (full #f))
(apply db:multi-db-sync
dbstruct
'schema
|
︙ | | | ︙ | |
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
|
(define (common:get-testsuite-name)
(or (configf:lookup *configdat* "setup" "area-name") ;; megatest is a flexible tool, testsuite is too limiting a description.
(configf:lookup *configdat* "setup" "testsuite" )
(getenv "MT_TESTSUITE_NAME")
(pathname-file (or (if (string? *toppath* )
(pathname-file *toppath*)
#f)
(common:get-topath #f)))
"please-set-setup-area-name")) ;; (pathname-file (current-directory)))))
;; safe getting of toppath
(define (common:get-toppath areapath)
(or *toppath*
(if areapath
(begin
|
|
|
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
|
(define (common:get-testsuite-name)
(or (configf:lookup *configdat* "setup" "area-name") ;; megatest is a flexible tool, testsuite is too limiting a description.
(configf:lookup *configdat* "setup" "testsuite" )
(getenv "MT_TESTSUITE_NAME")
(pathname-file (or (if (string? *toppath* )
(pathname-file *toppath*)
#f)
(common:get-toppath #f)))
"please-set-setup-area-name")) ;; (pathname-file (current-directory)))))
;; safe getting of toppath
(define (common:get-toppath areapath)
(or *toppath*
(if areapath
(begin
|
︙ | | | ︙ | |
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
|
(args:get-arg "-server")))
;; (let ((ohh (common:on-homehost?))
;; (srv (args:get-arg "-server")))
;; (and ohh srv)))
;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv)
(define *wdnum* 0)
(define *wdnum*mutex (make-mutex))
(define (common:human-time)
(time->string (seconds->local-time (current-seconds)) "%Y-%m-%d %H:%M:%S"))
;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp
;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server)
;;
(define (common:readonly-watchdog dbstruct)
(thread-sleep! 0.05) ;; delay for startup
(debug:print-info 13 *default-log-port* "common:readonly-watchdog entered.")
|
<
<
<
<
<
<
<
|
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
|
(args:get-arg "-server")))
;; (let ((ohh (common:on-homehost?))
;; (srv (args:get-arg "-server")))
;; (and ohh srv)))
;; (debug:print-info 0 *default-log-port* "common:run-sync? ohh=" ohh ", srv=" srv)
(define (common:human-time)
(time->string (seconds->local-time (current-seconds)) "%Y-%m-%d %H:%M:%S"))
;; currently the primary job of the watchdog is to run the sync back to megatest.db from the db in /tmp
;; if we are on the homehost and we are a server (by definition we are on the homehost if we are a server)
;;
(define (common:readonly-watchdog dbstruct)
(thread-sleep! 0.05) ;; delay for startup
(debug:print-info 13 *default-log-port* "common:readonly-watchdog entered.")
|
︙ | | | ︙ | |
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
|
(handle-exceptions exn
(begin
(if message
(debug:print-info 0 *default-log-port* message))
#f) (thunk) ))
(define (common:file-exists? path-string #!key (silent #f))
;; this avoids stack dumps in the case where
;;;; TODO: catch permission denied exceptions and emit appropriate warnings, eg: system error while trying to access file: "/nfs/pdx/disks/icf_env_disk001/bjbarcla/gwa/issues/mtdev/randy-slow/reproduce/q...
(common:false-on-exception (lambda () (file-exists? path-string))
message: (if (not silent)
(conc "Unable to access path: " path-string)
#f)
))
(define (common:directory-exists? path-string)
;;;; TODO: catch permission denied exceptions and emit appropriate warnings, eg: system error while trying to access file: "/nfs/pdx/disks/icf_env_disk001/bjbarcla/gwa/issues/mtdev/randy-slow/reproduce/q...
(common:false-on-exception (lambda () (directory-exists? path-string))
message: (conc "Unable to access path: " path-string)
))
;; does the directory exist and do we have write access?
;;
;; returns the directory or #f
|
<
<
|
|
|
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
|
(handle-exceptions exn
(begin
(if message
(debug:print-info 0 *default-log-port* message))
#f) (thunk) ))
(define (common:file-exists? path-string #!key (silent #f))
;;;; TODO: catch permission denied exceptions and emit appropriate warnings. Might be able to get rid of with chicken 5?
(common:false-on-exception (lambda () (file-exists? path-string))
message: (if (not silent)
(conc "Unable to access path: " path-string)
#f)
))
(define (common:directory-exists? path-string)
;;;; TODO: catch permission denied exceptions and emit appropriate warnings
(common:false-on-exception (lambda () (directory-exists? path-string))
message: (conc "Unable to access path: " path-string)
))
;; does the directory exist and do we have write access?
;;
;; returns the directory or #f
|
︙ | | | ︙ | |
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
|
;; get values from cached info from dropping file in logs dir
;; e.g. key is host and dtype is normalized-load
;;
(define (common:get-cached-info key dtype #!key (age 10))
(if *toppath*
(let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log"))
(delfile (lambda ()
(debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn)
(delete-file* fullpath)
#f)))
(if (and (file-exists? fullpath)
(file-read-access? fullpath))
(handle-exceptions
exn
|
|
|
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
|
;; get values from cached info from dropping file in logs dir
;; e.g. key is host and dtype is normalized-load
;;
(define (common:get-cached-info key dtype #!key (age 10))
(if *toppath*
(let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log"))
(delfile (lambda (exn)
(debug:print-info 1 *default-log-port* " removing bad file " fullpath ", exn=" exn)
(delete-file* fullpath)
#f)))
(if (and (file-exists? fullpath)
(file-read-access? fullpath))
(handle-exceptions
exn
|
︙ | | | ︙ | |
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
|
(debug:print 1 *default-log-port* "Failed to read mod time on file "
fullpath ", using 0, exn=" exn)
0)
(file-change-time fullpath)))))
(if (< real-age age)
(handle-exceptions
exn
(delfile)
(let* ((res (with-input-from-file fullpath read)))
(if (eof-object? res)
(begin
(delfile)
#f)
res)))
(begin
(debug:print-info 2 *default-log-port* "file " fullpath
" is too old (" real-age" seconds) to trust, skipping reading it")
#f))))
(begin
|
|
|
|
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
|
(debug:print 1 *default-log-port* "Failed to read mod time on file "
fullpath ", using 0, exn=" exn)
0)
(file-change-time fullpath)))))
(if (< real-age age)
(handle-exceptions
exn
(delfile exn)
(let* ((res (with-input-from-file fullpath read)))
(if (eof-object? res)
(begin
(delfile "n/a")
#f)
res)))
(begin
(debug:print-info 2 *default-log-port* "file " fullpath
" is too old (" real-age" seconds) to trust, skipping reading it")
#f))))
(begin
|
︙ | | | ︙ | |
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
|
;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit
;; etc.
(effective-load (common:get-intercept first next))
(recommended-delay (common:get-delay effective-load numcpus))
(effective-host (or remote-host "localhost"))
(normalized-effective-load (/ effective-load numcpus))
(will-wait (> normalized-effective-load maxnormload)))
(if (> recommended-delay 0)
(let* ((actual-delay (min recommended-delay 30)))
(if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load"))
(debug:print-info 0 *default-log-port* "Load control, delaying "
actual-delay " seconds to maintain safe load. current normalized effective load is "
normalized-effective-load"."))
(thread-sleep! actual-delay)))
(cond
;; bad data, try again to get the data
((not will-wait)
(if (common:low-noise-print 30 (conc (round normalized-effective-load) "-load-acceptable-" effective-host))
(debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing.")))
((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
(> num-tries 0))
(debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load "
first ", we'll sleep 10s and try " num-tries " more times.")
(thread-sleep! 10)
(common:wait-for-cpuload maxnormload numcpus-in
count: count remote-host: remote-host num-tries: (- num-tries 1)))
;; need to wait for load to drop
((and will-wait ;; (> first adjmaxload)
(> count 0))
(debug:print-info 0 *default-log-port*
"Delaying 15" ;; adjwait
" seconds due to normalized effective load " normalized-effective-load ;; first
" exceeding max of " adjmaxload
|
|
|
|
|
>
>
|
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
|
;; effective load accounts for load jumps, this should elminate all the first-next-avg, adjwait, load-jump-limit
;; etc.
(effective-load (common:get-intercept first next))
(recommended-delay (common:get-delay effective-load numcpus))
(effective-host (or remote-host "localhost"))
(normalized-effective-load (/ effective-load numcpus))
(will-wait (> normalized-effective-load maxnormload)))
(if (> recommended-delay 1)
(let* ((actual-delay (min recommended-delay 30)))
(if (common:low-noise-print 30 (conc (round actual-delay) "-safe-load"))
(debug:print-info 0 *default-log-port* "Load control, delaying "
actual-delay " seconds to maintain safe load. current normalized effective load is "
normalized-effective-load". maxnormload = " maxnormload " numcpus = " numcpus " loadavg = " loadavg " effective-load = " effective-load))
(thread-sleep! actual-delay)))
(cond
;; bad data, try again to get the data
((not will-wait)
(if (common:low-noise-print 3600 (conc (round normalized-effective-load) "-load-acceptable-" effective-host))
(debug:print 0 *default-log-port* "Effective load on " effective-host " is acceptable at " effective-load " continuing.")))
((and (< first 0) ;; this indicates the loadavg data is bad - machine may not be reachable
(> num-tries 0))
(debug:print 0 *default-log-port* "WARNING: received bad data from get-cpu-load "
first ", we'll sleep 10s and try " num-tries " more times.")
(thread-sleep! 10)
(common:wait-for-cpuload maxnormload numcpus-in
count: count remote-host: remote-host num-tries: (- num-tries 1)))
;; need to wait for load to drop
((and will-wait ;; (> first adjmaxload)
(> count 0))
(debug:print-info 0 *default-log-port*
"Delaying 15" ;; adjwait
" seconds due to normalized effective load " normalized-effective-load ;; first
" exceeding max of " adjmaxload
|
︙ | | | ︙ | |