Overview
Comment: | Cherrypicked inter-test-delay and cached load to 10s |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | v1.65-broken |
Files: | files | file ages | folders |
SHA1: |
aa29985039a593ff4c56ddc64870c8e2 |
User & Date: | mrwellan on 2020-05-01 20:30:37 |
Other Links: | branch diff | manifest | tags |
Context
2020-05-02
| ||
22:09 | Work on Ubuntu check-in: 268055792a user: matt tags: v1.66-ubuntu | |
2020-05-01
| ||
22:43 | Merged fixes to makefile check-in: 0a7ddadc4d user: jmoon18 tags: v1.65-broken | |
20:30 | Cherrypicked inter-test-delay and cached load to 10s check-in: aa29985039 user: mrwellan tags: v1.65-broken | |
2020-04-30
| ||
16:22 | rewrite absurdly long log file names if over 250 chars long check-in: 0a9e690b28 user: mrwellan tags: v1.65-broken | |
Changes
Modified common.scm from [d0760d6348] to [7eb16cae49].
︙ | ︙ | |||
484 485 486 487 488 489 490 | (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age (if (not (directory-exists? "logs"))(create-directory "logs")) (directory-fold (lambda (file rem) (handle-exceptions exn (begin | | | | > | 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 | (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "300")))) ;; name -> age (if (not (directory-exists? "logs"))(create-directory "logs")) (directory-fold (lambda (file rem) (handle-exceptions exn (begin (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore.") (debug:print 2 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) ;; (print-call-chain (current-error-port)) ;; ) (let* ((fullname (conc "logs/" file)) (mod-time (file-modification-time fullname)) (file-age (- (current-seconds) mod-time))) (hash-table-set! all-files file mod-time) (if (or (and (string-match "^.*.log" file) (> (file-size fullname) 200000)) (and (string-match "^server-.*.log" file) |
︙ | ︙ | |||
1697 1698 1699 1700 1701 1702 1703 | ;; (set! cpu-load newval)))))) ;; (car load-res)) ;; cpu-load)) ;; get values from cached info from dropping file in logs dir ;; e.g. key is host and dtype is normalized-load ;; | | | | 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 | ;; (set! cpu-load newval)))))) ;; (car load-res)) ;; cpu-load)) ;; get values from cached info from dropping file in logs dir ;; e.g. key is host and dtype is normalized-load ;; (define (common:get-cached-info key dtype #!key (age 10)) (if *toppath* (let* ((fullpath (conc *toppath* "/.sysdata/" key "-" dtype ".log"))) (if (and (file-exists? fullpath) (file-read-access? fullpath)) (handle-exceptions exn #f (debug:print 2 *default-log-port* "reading file " fullpath) (let ((real-age (- (current-seconds)(file-change-time fullpath)))) (if (< real-age age) (with-input-from-file fullpath read) (begin (debug:print-info 2 *default-log-port* "file " fullpath " is too old (" real-age" seconds) to trust, skipping reading it") #f)))) (begin (debug:print 2 *default-log-port* "not reading file " fullpath) #f))) #f)) (define (common:write-cached-info key dtype dat) |
︙ | ︙ | |||
1984 1985 1986 1987 1988 1989 1990 | numcpu) (read-line)))))) (result (if remote-host (with-input-from-pipe (conc "ssh " remote-host " cat /proc/cpuinfo") proc) (with-input-from-file "/proc/cpuinfo" proc)))) | > > | | 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 | numcpu) (read-line)))))) (result (if remote-host (with-input-from-pipe (conc "ssh " remote-host " cat /proc/cpuinfo") proc) (with-input-from-file "/proc/cpuinfo" proc)))) (if (and (number? result) (> result 0)) (common:write-cached-info actual-host "num-cpus" result)) result)))) ;; wait for normalized cpu load to drop below maxload ;; (define (common:wait-for-normalized-load maxload msg remote-host #!optional (rem-tries 5)) (let ((num-cpus (common:get-num-cpus remote-host))) (if num-cpus |
︙ | ︙ | |||
2014 2015 2016 2017 2018 2019 2020 | maxload-in (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? (first (car loadavg)) (next (cadr loadavg)) (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 (loadjmp (- first next)) (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously | > > | | | 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 | maxload-in (max maxload-in 0.5))) ;; so maxload must be greater than 0.5 for now BUG - FIXME? (first (car loadavg)) (next (cadr loadavg)) (adjload (* maxload (max 1 numcpus))) ;; possible bug where numcpus (or could be maxload) is zero, crude fallback is to at least use 1 (loadjmp (- first next)) (adjwait (min (+ 300 (random 10)) (abs (* (+ (random 10)(/ (- 1000 count) 10) waitdelay) (- first adjload) )) )));; add some randomness to the time to break any alignment where netbatch dumps many jobs to machines simultaneously ;; let's let the user know once in a long while that load checking is happening but not constantly report it (if (> (random 100) 75) ;; about 25% of the time (debug:print-info 1 *default-log-port* "Checking cpuload on " (or remote-host "localhost") ", maxload: " maxload ", load: " first ", adjload: " adjload ", loadjmp: " loadjmp)) (cond ((and (> first adjload) (> count 0)) (debug:print-info 0 *default-log-port* "server start delayed " adjwait " seconds due to load " first " exceeding max of " adjload " on server " (or remote-host (get-host-name)) " (normalized load-limit: " maxload ") " (if msg msg "")) (thread-sleep! adjwait) (common:wait-for-cpuload maxload numcpus waitdelay count: (- count 1) msg: msg remote-host: remote-host)) ((and (> loadjmp numcpus) |
︙ | ︙ |
Modified megatest.scm from [ee60e5eddb] to [4b27fb4644].
︙ | ︙ | |||
238 239 240 241 242 243 244 | -refdb2dat refdb : convert refdb to sexp or to format specified by s-dumpmode formats: perl, ruby, sqlite3, csv (for csv the -o param will substitute %s for the sheet name in generating multiple sheets) -o : output file for refdb2dat (defaults to stdout) -archive cmd : archive runs specified by selectors to one of disks specified in the [archive-disks] section. | | > > | 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 | -refdb2dat refdb : convert refdb to sexp or to format specified by s-dumpmode formats: perl, ruby, sqlite3, csv (for csv the -o param will substitute %s for the sheet name in generating multiple sheets) -o : output file for refdb2dat (defaults to stdout) -archive cmd : archive runs specified by selectors to one of disks specified in the [archive-disks] section. cmd: keep-html, restore, save, save-remove, get (use -dest to set destination), -include and -exclude to include or exclude files) -generate-html : create a simple html dashboard for browsing your runs -generate-html-structure : create a top level html veiw to list targets/runs and a Run view within each run directory. -list-run-time : list time requered to complete runs. It supports following switches -run-patt <patt> -target-patt <patt> -dumpmode <csv,json,plain-text> -list-test-time : list time requered to complete each test in a run. It following following arguments -runname <patt> -target <patt> -dumpmode <csv,json,plain-text> -syscheck : do some very basic checks; write access and space in tmp, home, runs, links and |
︙ | ︙ |
Modified runs.scm from [c265f57285] to [04a7c3fd06].
︙ | ︙ | |||
235 236 237 238 239 240 241 | (define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) | | > > > > | < | | > > | 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 | (define (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs) ;; Take advantage of a good place to exit if running the one-pass methodology (if (and (> (runs:dat-can-run-more-tests-count runsdat) 20) (args:get-arg "-one-pass")) (exit 0)) (thread-sleep! (cond ;; BB: check with Matt. Should this sleep move ;; to cond clauses below where we determine we ;; have too many jobs running rather than each ;; time the and condition above is true (which ;; seems like always)? ((> (runs:dat-can-run-more-tests-count runsdat) 20) ;; original intent was - save cycles, wait a long time (if (runs:lownoise "waiting on tasks" 60)(debug:print-info 2 *default-log-port* "waiting for tasks to complete, sleeping briefly ...")) 10) ;; obviously haven't had any work to do for a while (else ;; if have a number for inter-test-delay, use it, else don't delay much, maybe even zero? (configf:lookup-number *configdat* "setup" "inter-test-delay" default: 0.01)))) (let* ((num-running (rmt:get-count-tests-running run-id)) (num-running-in-jobgroup (rmt:get-count-tests-running-in-jobgroup run-id jobgroup)) (job-group-limit (let ((jobg-count (configf:lookup *configdat* "jobgroups" jobgroup))) (if (string? jobg-count) (string->number jobg-count) jobg-count)))) |
︙ | ︙ |