Overview
Comment: | Cross-pollinated v1.65 run-testdat work. This likely still does not pass obfusc-skel. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | v1.65-sidework |
Files: | files | file ages | folders |
SHA1: |
b17e77a3ea5b2ad32ae3d904f2b11bfc |
User & Date: | matt on 2020-10-05 23:07:30 |
Other Links: | branch diff | manifest | tags |
Context
2020-10-05
| ||
23:07 | Cross-pollinated v1.65 run-testdat work. This likely still does not pass obfusc-skel. Closed-Leaf check-in: b17e77a3ea user: matt tags: v1.65-sidework | |
22:49 | Do not exit on failure to create directory - race conditons on NFS cause false fail scenarios - just keep going and cross your fingers... (cherrypicked from v1.6572) check-in: 05b253a452 user: matt tags: v1.65-sidework | |
Changes
Modified TODO from [da5eae4898] to [dcd0f52bc7].
︙ | ︙ | |||
14 15 16 17 18 19 20 21 22 23 24 25 26 27 | # # You should have received a copy of the GNU General Public License # along with Megatest. If not, see <http://www.gnu.org/licenses/>. TODO ==== WW15 . fill newview matrix with data, filter pipeline gui elements . improve [script], especially indent handling WW16 . split db into megatest.db (runs etc.) db/<something>.db . release basic newview implementation | > > > > > > > | 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | # # You should have received a copy of the GNU General Public License # along with Megatest. If not, see <http://www.gnu.org/licenses/>. TODO ==== WW38 . Add test_rundat to no-sync ==> correction, put in <testdir>/.meta/test-run.dat . Add STATE/STATUS transitions to .meta/test-run.dat or similar . Swizzle update-test-rundat to operate on no-sync . Swizzle update-run-duration, -uname-host and cpuload-diskfree to no-sync . On state/status change update tests table with duration WW15 . fill newview matrix with data, filter pipeline gui elements . improve [script], especially indent handling WW16 . split db into megatest.db (runs etc.) db/<something>.db . release basic newview implementation |
︙ | ︙ |
Modified common.scm from [33c7316880] to [a37bf38bb2].
︙ | ︙ | |||
499 500 501 502 503 504 505 | ;; logs directory you wish to log-rotate. ;; (define (common:rotate-logs) (let* ((all-files (make-hash-table)) (stats (make-hash-table)) (inc-stat (lambda (key) (hash-table-set! stats key (+ (hash-table-ref/default stats key 0) 1)))) | | | 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 | ;; logs directory you wish to log-rotate. ;; (define (common:rotate-logs) (let* ((all-files (make-hash-table)) (stats (make-hash-table)) (inc-stat (lambda (key) (hash-table-set! stats key (+ (hash-table-ref/default stats key 0) 1)))) (max-allowed (string->number (or (configf:lookup *configdat* "setup" "max-logfiles") "600")))) ;; name -> age (if (not (directory-exists? "logs"))(create-directory "logs")) (directory-fold (lambda (file rem) (handle-exceptions exn (begin (debug:print-info 2 *default-log-port* "unable to rotate log " file ", probably handled by another process, this is safe to ignore. exn=" exn) |
︙ | ︙ |
Modified dashboard.scm from [935bf4d2df] to [d4733659e1].
︙ | ︙ | |||
210 211 212 213 214 215 216 217 218 219 220 221 222 223 | ;; (define (dboard:common-set-tabdat! commondat tabnum tabdat) (hash-table-set! (dboard:commondat-tabdats commondat) tabnum tabdat)) ;; gets and calls updater list based on curr-tab-num ;; (define (dboard:common-run-curr-updaters commondat #!key (tab-num #f)) (if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat (let* ((tnum (or tab-num (dboard:commondat-curr-tab-num commondat))) (updaters (hash-table-ref/default (dboard:commondat-updaters commondat) tnum | > > | 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 | ;; (define (dboard:common-set-tabdat! commondat tabnum tabdat) (hash-table-set! (dboard:commondat-tabdats commondat) tabnum tabdat)) (define *updater-running* #f) ;; move this into one of the stucts ;; gets and calls updater list based on curr-tab-num ;; (define (dboard:common-run-curr-updaters commondat #!key (tab-num #f)) (if (dboard:common-get-tabdat commondat tab-num: tab-num) ;; only update if there is a tabdat (let* ((tnum (or tab-num (dboard:commondat-curr-tab-num commondat))) (updaters (hash-table-ref/default (dboard:commondat-updaters commondat) tnum |
︙ | ︙ | |||
238 239 240 241 242 243 244 | (curr-updaters (hash-table-ref/default (dboard:commondat-updaters commondat) tnum '()))) (hash-table-set! (dboard:commondat-updaters commondat) tnum (cons updater curr-updaters)))) ;; data for each specific tab goes here ;; | | > > | 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 | (curr-updaters (hash-table-ref/default (dboard:commondat-updaters commondat) tnum '()))) (hash-table-set! (dboard:commondat-updaters commondat) tnum (cons updater curr-updaters)))) ;; data for each specific tab goes here ;; (defstruct dboard:tabdat ;; flags ((already-running #f) : boolean) ;; the updater is already running. skip ;; runs ((allruns '()) : list) ;; list of dboard:rundat records ((allruns-by-id (make-hash-table)) : hash-table) ;; hash of run-id -> dboard:rundat records ((done-runs '()) : list) ;; list of runs already drawn ((not-done-runs '()) : list) ;; list of runs not yet drawn (header #f) ;; header for decoding the run records (keys #f) ;; keys for this run (i.e. target components) |
︙ | ︙ | |||
643 644 645 646 647 648 649 | ;; ;; NOTE: Yes, this is used ;; (define (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals) (let* ((start-time (current-seconds)) (access-mode (dboard:tabdat-access-mode tabdat)) (num-to-get (string->number (or (configf:lookup *configdat* "setup" "num-tests-to-get") | | | 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 | ;; ;; NOTE: Yes, this is used ;; (define (dboard:get-tests-for-run-duplicate tabdat run-id run testnamepatt key-vals) (let* ((start-time (current-seconds)) (access-mode (dboard:tabdat-access-mode tabdat)) (num-to-get (string->number (or (configf:lookup *configdat* "setup" "num-tests-to-get") "50"))) ;; was 200, which is fine in a normal run area. (states (hash-table-keys (dboard:tabdat-state-ignore-hash tabdat))) (statuses (hash-table-keys (dboard:tabdat-status-ignore-hash tabdat))) (do-not-use-db-file-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-db-file-timestamps")) ;; this still hosts runs-summary-tab (do-not-use-query-timestamps #t) ;; (configf:lookup *configdat* "setup" "do-not-use-query-timestamps")) ;; this no longer troubles runs-summary-tab (sort-info (get-curr-sort)) (sort-by (vector-ref sort-info 1)) (sort-order (vector-ref sort-info 2)) |
︙ | ︙ | |||
899 900 901 902 903 904 905 | (elapsed-time (- (current-seconds) start-time))) (if (null? all-test-ids) (hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id) (hash-table-set! (dboard:tabdat-allruns-by-id tabdat) run-id run-struct)) (if (or (null? tal) (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update (begin | | > | | < < < | > | 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 | (elapsed-time (- (current-seconds) start-time))) (if (null? all-test-ids) (hash-table-delete! (dboard:tabdat-allruns-by-id tabdat) run-id) (hash-table-set! (dboard:tabdat-allruns-by-id tabdat) run-id run-struct)) (if (or (null? tal) (> elapsed-time 2)) ;; stop loading data after 5 seconds, on the next call more data *should* be loaded since get-tests-for-run uses last update (begin #;(when (> elapsed-time 2) (debug:print 0 *default-log-port* "NOTE: updates are taking a long time, " elapsed-time "s elapsed.") (let* ((old-val (iup:attribute *tim* "TIME")) (new-val (number->string (inexact->exact (floor (* 2 (string->number old-val))))))) (if (< (string->number new-val) 5000) (begin (debug:print 0 *default-log-port* "NOTE: increasing poll interval from "old-val" to "new-val) (iup:attribute-set! *tim* "TIME" new-val))))) (dboard:tabdat-allruns-set! tabdat new-res) maxtests) (if (> (dboard:rundat-run-data-offset run-struct) 0) (loop run tal new-res newmaxtests) ;; not done getting data for this run (loop (car tal)(cdr tal) new-res newmaxtests))))))) (dboard:tabdat-filters-changed-set! tabdat #f) (dboard:update-tree tabdat runs-hash header tb) (dboard:tabdat-already-running-set! tabdat #f))) (define *collapsed* (make-hash-table)) (define (toggle-hide lnum uidat) ; fulltestname) (let* ((btn (vector-ref (dboard:uidat-get-lftcol uidat) lnum)) (fulltestname (iup:attribute btn "TITLE")) (parts (string-split fulltestname "(")) |
︙ | ︙ | |||
2503 2504 2505 2506 2507 2508 2509 | sort-lb))) ) ;; insert extra widget here (if extra-widget extra-widget (iup:hbox)) ;; empty widget | < < < < | 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 | sort-lb))) ) ;; insert extra widget here (if extra-widget extra-widget (iup:hbox)) ;; empty widget ))) (let* ((status-toggles (map (lambda (status) (iup:toggle (conc status) #:fontsize 8 ;; btn-fontsz ;; "10" ;; #:expand "HORIZONTAL" #:action (lambda (obj val) |
︙ | ︙ | |||
3731 3732 3733 3734 3735 3736 3737 | tabdat (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "runname" "%") (dboard:tabdat-numruns tabdat) (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%") ;; generate key patterns from the target stored in tabdat (let* ((dbkeys (dboard:tabdat-dbkeys tabdat))) (let ((fres (if (dboard:tabdat-target tabdat) | | | | | | | | | | | > | 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 | tabdat (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "runname" "%") (dboard:tabdat-numruns tabdat) (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) "test-name" "%/%") ;; generate key patterns from the target stored in tabdat (let* ((dbkeys (dboard:tabdat-dbkeys tabdat))) (let ((fres (if (dboard:tabdat-target tabdat) (let ((ptparts (append (dboard:tabdat-target tabdat)(make-list (length dbkeys) "%")))) (map (lambda (k v)(list k v)) dbkeys ptparts)) (let ((res '())) (for-each (lambda (key) (if (not (equal? key "runname")) (let ((val (hash-table-ref/default (dboard:tabdat-searchpatts tabdat) key #f))) (if val (set! res (cons (list key val) res)))))) dbkeys) res)))) fres))) #;(set! *updater-running* #f)) (define (dashboard:runs-tab-updater commondat tab-num) (debug:catch-and-dump (lambda () (let* ((tabdat (dboard:common-get-tabdat commondat tab-num: tab-num)) (dbkeys (dboard:tabdat-dbkeys tabdat))) (dashboard:do-update-rundat tabdat) |
︙ | ︙ | |||
3799 3800 3801 3802 3803 3804 3805 | commondat (lambda () (dashboard:runs-tab-updater commondat 1)) tab-num: 2) (iup:callback-set! *tim* "ACTION_CB" (lambda (time-obj) | | > | | | | | | | > | > | | | | 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 | commondat (lambda () (dashboard:runs-tab-updater commondat 1)) tab-num: 2) (iup:callback-set! *tim* "ACTION_CB" (lambda (time-obj) (if (not *updater-running*) (begin ;; (mutex-lock! (dboard:commondat-update-mutex commondat)) ;; (set! update-is-running (dboard:commondat-updating commondat)) ;;(if (not update-is-running) ;; (dboard:commondat-updating-set! commondat #t)) ;;(mutex-unlock! (dboard:commondat-update-mutex commondat)) ;;(if (not update-is-running) ;; we know that the update was not running and we now have a lock on doing an update ;; (begin (set! *updater-running* #t) (dboard:common-run-curr-updaters commondat) ;; (dashboard:run-update commondat) (set! *updater-running* #f) ;; (mutex-lock! (dboard:commondat-update-mutex commondat)) ;; (dboard:commondat-updating-set! commondat #f) ;; (mutex-unlock! (dboard:commondat-update-mutex commondat))) )) 1)))) (let ((th1 (make-thread (lambda () (thread-sleep! 1) (dboard:common-run-curr-updaters commondat 0) ;; force update of summary tab ) "update buttons once")) |
︙ | ︙ |
Modified db.scm from [fb3a18f52f] to [2c7b396933].
︙ | ︙ | |||
1774 1775 1776 1777 1778 1779 1780 | (debug:print 0 *default-log-port* "ERROR: cannot read " infile) (debug:print 0 *default-log-port* "ERROR: run-dir is " run-dir) #f ) (with-input-from-file infile read-lines) ))) | > > | | | | > > > > > > > > > > > > > > > > | | 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 | (debug:print 0 *default-log-port* "ERROR: cannot read " infile) (debug:print 0 *default-log-port* "ERROR: run-dir is " run-dir) #f ) (with-input-from-file infile read-lines) ))) ;; check duration against test-run.dat file if it exists and update the value in ;; the db if necessary ;; (define (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration) (let* ((datf (conc run-dir ".mt_data/test-run.dat")) (modt (if (and (file-exists? datf) (file-read-access? datf)) (file-modification-time datf) #f)) ;; (+ event-time run-duration)))) (alt-run-duration (if modt (- modt event-time) #f))) (if (and alt-run-duration (> alt-run-duration run-duration)) (begin (debug:print 0 *default-log-port* "Test " test-id " run duration mismatch. Setting to " alt-run-duration) (db:with-db dbstruct #f #f (lambda (db) (sqlite3:execute db "UPDATE tests SET run_duration=? WHERE id=?;" alt-run-duration test-id) #t))) #f))) ;; #f = we did NOT adjust the time (define (db:find-and-mark-incomplete dbstruct run-id ovr-deadtime) (let* ((incompleted '()) (oldlaunched '()) (toplevels '()) ;; The default running-deadtime is 720 seconds = 12 minutes. ;; "(running-deadtime-default (+ server-start-allowance (* 2 launch-monitor-period)))" = 200 + (2 * (200 + 30 + 30)) (deadtime-trim (or ovr-deadtime (configf:lookup-number *configdat* "setup" "deadtime"))) |
︙ | ︙ | |||
1826 1827 1828 1829 1830 1831 1832 | ;; HOWEVER: this code in run:test seems to work fine ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) ;; (db:test-get-run_duration testdat))) ;; 600) ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) | > | | | | | | | | | | | | > | | | | | | | | | | | | 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 | ;; HOWEVER: this code in run:test seems to work fine ;; (> (- (current-seconds)(+ (db:test-get-event_time testdat) ;; (db:test-get-run_duration testdat))) ;; 600) ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)) (if (and (equal? uname "n/a") (equal? item-path "")) ;; this is a toplevel test ;; what to do with toplevel? call rollup? (begin (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) (begin (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)) (debug:print-info 0 *default-log-port* "Found old test in RUNNING state, test-id=" test-id" exceeded running-deadtime "running-deadtime" now="(current-seconds) " event-time="event-time" run-duration="run-duration))))) stmth1 run-id running-deadtime) ;; default time 720 seconds (sqlite3:for-each-row (lambda (test-id run-dir uname testname item-path event-time run-duration) (if (not (db:adjust-run-duration dbstruct test-id run-dir event-time run-duration)) (if (and (equal? uname "n/a") (equal? item-path "")) ;; this is a toplevel test ;; what to do with toplevel? call rollup? (begin (set! toplevels (cons (list test-id run-dir uname testname item-path run-id) toplevels)) (debug:print-info 0 *default-log-port* "Found old toplevel test in RUNNING state, test-id=" test-id)) (begin (debug:print-info 0 *default-log-port* "Found old test in REMOTEHOSTSTART state, test-id=" test-id " exceeded running-deadtime "running-deadtime" now="(current-seconds)" event-time="event-time " run-duration="run-duration) (set! incompleted (cons (list test-id run-dir uname testname item-path run-id) incompleted)))))) stmth2 run-id remotehoststart-deadtime) ;; default time 230 seconds ;; in LAUNCHED for more than one day. Could be long due to job queues TODO/BUG: Need override for this in config ;; ;; (db:delay-if-busy dbdat) (sqlite3:for-each-row |
︙ | ︙ | |||
3465 3466 3467 3468 3469 3470 3471 | (let* ((run-ids (db:get-all-run-ids mtdb))) (for-each (lambda (run-id) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))) (db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs))) run-ids))) | | | 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 | (let* ((run-ids (db:get-all-run-ids mtdb))) (for-each (lambda (run-id) (let ((testrecs (db:get-all-tests-info-by-run-id mtdb run-id))) (db:prep-megatest.db-adj-test-ids (db:dbdat-get-db mtdb) run-id testrecs))) run-ids))) ;; Get test data using test_id, run-id is not used - but it will be! ;; (define (db:get-test-info-by-id dbstruct run-id test-id) (db:with-db dbstruct #f ;; run-id #f (lambda (db) |
︙ | ︙ |
Modified launch.scm from [6b92672067] to [3e2d417312].
︙ | ︙ | |||
203 204 205 206 207 208 209 | (round (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) | | | > > | 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 | (round (- (current-seconds) start-seconds))))) (kill-tries 0)) ;; (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area) ;; (tests:set-full-meta-info test-id run-id (calc-minutes) work-area) (tests:set-full-meta-info #f test-id run-id (calc-minutes) work-area 10 update-db: #t) (let loop ((minutes (calc-minutes)) (cpu-load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (disk-free (get-df (current-directory))) (last-sync (current-seconds))) ;; (common:telemetry-log "zombie" (conc "launch:monitor-job - ;; top of loop encountered at "(current-seconds)" with ;; last-sync="last-sync)) (let* ((over-time (> (current-seconds) (+ last-sync update-period))) (new-cpu-load (let* ((load (alist-ref 'adj-core-load (common:get-normalized-cpu-load #f))) (delta (abs (- load cpu-load)))) (if (> delta 0.1) ;; don't bother updating with small changes load #f))) (new-disk-free (let* ((df (if over-time ;; only get df every 30 seconds |
︙ | ︙ | |||
231 232 233 234 235 236 237 | (do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) | | | | < < < | < < | 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 | (do-sync (or new-cpu-load new-disk-free over-time)) (test-info (rmt:get-test-info-by-id run-id test-id)) (state (db:test-get-state test-info)) (status (db:test-get-status test-info)) (kill-reason "no kill reason specified") (kill-job? #f)) #;(common:telemetry-log "zombie" (conc "launch:monitor-job - decision time encountered at "(current-seconds)" with last-sync="last-sync" do-sync="do-sync" over-time="over-time" update-period="update-period)) (cond ((test-get-kill-request run-id test-id) (set! kill-reason "KILLING TEST since received kill request (KILLREQ)") (set! kill-job? #t)) ((and runtlim (> (- (current-seconds) start-seconds) runtlim)) (set! kill-reason (conc "KILLING TEST DUE TO TIME LIMIT EXCEEDED! Runtime=" (- (current-seconds) start-seconds) " seconds, limit=" runtlim)) (set! kill-job? #t)) ((equal? status "DEAD") (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f update-db: #t) (rmt:set-state-status-and-roll-up-items run-id test-id 'foo "RUNNING" "n/a" "was marked dead; really still running.") ;;(set! kill-reason "KILLING TEST because it was marked as DEAD by launch:handle-zombie-tests (might indicate really overloaded server or else overzealous setup.deadtime)") ;; MARK RUNNING (set! kill-job? #f))) (debug:print 4 *default-log-port* "cpu: " new-cpu-load " disk: " new-disk-free " last-sync: " last-sync " do-sync: " do-sync) (launch:handle-zombie-tests run-id) (if do-sync ;; save meta data about the running of this test (tests:update-central-meta-info run-id test-id new-cpu-load new-disk-free (calc-minutes) #f #f)) (if kill-job? (begin (debug:print-info 0 *default-log-port* "proceeding to kill test: "kill-reason) (mutex-lock! m) ;; NOTE: The pid can change as different steps are run. Do we need handshaking between this ;; section and the runit section? Or add a loop that tries three times with a 1/4 second ;; between tries? |
︙ | ︙ | |||
310 311 312 313 314 315 316 | (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) | | | 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 | (begin (thread-sleep! 3) ;; (+ 3 (random 6))) ;; add some jitter to the call home time to spread out the db accesses (if (hash-table-ref/default misc-flags 'keep-going #f) ;; keep originals for cpu-load and disk-free unless they change more than the allowed delta (loop (calc-minutes) (or new-cpu-load cpu-load) (or new-disk-free disk-free) (if do-sync (current-seconds) last-sync))))))) (tests:update-central-meta-info run-id test-id (get-cpu-load) (get-df (current-directory))(calc-minutes) #f #f update-db: #t))) ;; NOTE: Checking twice for keep-going is intentional (define (launch:execute encoded-cmd) (let* ((cmdinfo (common:read-encoded-string encoded-cmd)) (tconfigreg #f)) (setenv "MT_CMDINFO" encoded-cmd) ;;(bb-check-path msg: "launch:execute incoming") |
︙ | ︙ | |||
463 464 465 466 467 468 469 470 471 472 473 474 475 476 | (db:test-get-host test-info) (begin (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.") (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) ;; prime it for running | > > > | 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 | (db:test-get-host test-info) (begin (debug:print 0 *default-log-port* "ERROR: failed to find a record for test-id " test-id ", exiting.") (exit)))) (test-pid (db:test-get-process_id test-info))) (cond ;; -mrw- I'm removing KILLREQ from this list so that a test in KILLREQ state is treated as a "do not run" flag. ;; ((or (member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun ;; (and (equal? (db:test-get-state test-info) "COMPLETED") ;; completed/abort => rerun if asked ;; (member (db:test-get-status test-info) '("ABORT")))) ((member (db:test-get-state test-info) '("INCOMPLETE" "KILLED" "UNKNOWN" "STUCK")) ;; prior run of this test didn't complete, go ahead and try to rerun (debug:print 0 *default-log-port* "INFO: test is INCOMPLETE or KILLED, treat this execute call as a rerun request") ;; (tests:test-force-state-status! run-id test-id "REMOTEHOSTSTART" "n/a") (rmt:general-call 'set-test-start-time #f test-id) (rmt:test-set-state-status run-id test-id "REMOTEHOSTSTART" "n/a" #f) ) ;; prime it for running |
︙ | ︙ | |||
769 770 771 772 773 774 775 | (item-path (vector-ref running-test 11))) (debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed") (if (not (null? tal)) (loop (car tal) (cdr tal))))))))))) (define (launch:is-test-alive host pid) (if (and host pid (not (equal? host "n/a"))) | > > | | 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 | (item-path (vector-ref running-test 11))) (debug:print 0 *default-log-port* "test " test-name "/" item-path " not completed") (if (not (null? tal)) (loop (car tal) (cdr tal))))))))))) (define (launch:is-test-alive host pid) (if (and host pid (not (equal? host "n/a"))) (let* ((is-local (equal? host (get-host-name))) (ssh-cmd (if is-local " " (conc "ssh " host " "))) (cmd (conc ssh-cmd "pstree -A " pid)) (output (with-input-from-pipe cmd read-lines))) (debug:print 2 *default-log-port* "Running " cmd " received " output) (if (eq? (length output) 0) #f #t)) #t)) |
︙ | ︙ |
Modified rmt.scm from [39d97c528a] to [29d7593e43].
︙ | ︙ | |||
20 21 22 23 24 25 26 27 28 29 30 31 32 33 | (use format typed-records) ;; RADT => purpose of json format?? (declare (unit rmt)) (declare (uses api)) (declare (uses http-transport)) (include "common_records.scm") ;; (declare (uses rmtmod)) ;; (import rmtmod) ;; ;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!! ;; | > > | 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | (use format typed-records) ;; RADT => purpose of json format?? (declare (unit rmt)) (declare (uses api)) (declare (uses http-transport)) (include "common_records.scm") (include "db_records.scm") ;; (declare (uses rmtmod)) ;; (import rmtmod) ;; ;; THESE ARE ALL CALLED ON THE CLIENT SIDE!!! ;; |
︙ | ︙ | |||
523 524 525 526 527 528 529 | ;; Just some syntatic sugar (define (rmt:register-test run-id test-name item-path) (rmt:general-call 'register-test run-id run-id test-name item-path)) (define (rmt:get-test-id run-id testname item-path) (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) | | | > > > > > > > > > | 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 | ;; Just some syntatic sugar (define (rmt:register-test run-id test-name item-path) (rmt:general-call 'register-test run-id run-id test-name item-path)) (define (rmt:get-test-id run-id testname item-path) (rmt:send-receive 'get-test-id run-id (list run-id testname item-path))) ;; run-id is NOT used - but it will be! ;; (define (rmt:get-test-info-by-id run-id test-id) (if (number? test-id) (let* ((testdat (rmt:send-receive 'get-test-info-by-id run-id (list run-id test-id))) (trundatf (conc (db:test-get-rundir testdat) "/.mt_data/test-run.dat"))) ;; now we can update a couple fields from the filesystem (if (and (db:test-get-rundir testdat) (file-exists? trundatf)) (let* ((duration (db:test-get-run_duration testdat)) (event-time (db:test-get-event_time testdat)) (last-touch (file-modification-time trundatf))) (db:test-set-run_duration! testdat (max duration (- last-touch event-time))))) testdat) (begin (debug:print 0 *default-log-port* "WARNING: Bad data handed to rmt:get-test-info-by-id run-id=" run-id ", test-id=" test-id) (print-call-chain (current-error-port)) #f))) (define (rmt:test-get-rundir-from-test-id run-id test-id) (rmt:send-receive 'test-get-rundir-from-test-id run-id (list run-id test-id))) |
︙ | ︙ |
Modified runs.scm from [030b929939] to [497a5137ad].
︙ | ︙ | |||
1787 1788 1789 1790 1791 1792 1793 | (debug:print-info 4 *default-log-port* "cond branch - " "rtq-4") (let ((can-run-more #f)) ;; (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))) (if (not can-run-more) #;(and (list? can-run-more) (car can-run-more)) (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) ;; itemized test expanded here (if loop-list (apply loop loop-list) | | < < | 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 | (debug:print-info 4 *default-log-port* "cond branch - " "rtq-4") (let ((can-run-more #f)) ;; (runs:can-run-more-tests runsdat run-id jobgroup max-concurrent-jobs))) (if (not can-run-more) #;(and (list? can-run-more) (car can-run-more)) (let ((loop-list (runs:expand-items hed tal reg reruns regfull newtal jobgroup max-concurrent-jobs run-id waitons item-path testmode test-record can-run-more items runname tconfig reglen test-registry test-records itemmaps))) ;; itemized test expanded here (if loop-list (apply loop loop-list) (debug:print-info 4 *default-log-port* " -- Can't expand hed="hed))) ;; if can't run more just loop with next possible test (loop (car newtal)(cdr newtal) reg reruns)))) ;; this case should not happen, added to help catch any bugs ((and (list? items) itemdat) (debug:print-info 4 *default-log-port* "cond branch - " "rtq-5") (debug:print-error 0 *default-log-port* "Should not have a list of items in a test and the itemspath set - please report this") |
︙ | ︙ |
Modified server.scm from [7b2af2dc7e] to [e17aad21de].
︙ | ︙ | |||
344 345 346 347 348 349 350 351 352 353 354 355 356 357 | (equal? server-key res)))) #t ;; (system (conc "touch " start-flag)) ;; lazy but safe (begin (debug:print-info 0 *default-log-port* "Gating server start, last start: " fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go) (thread-sleep! reftime) (server:wait-for-server-start-last-flag areapath))))))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least 3 seconds old | > > > > > > > > > > > > > > > > > > | 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 | (equal? server-key res)))) #t ;; (system (conc "touch " start-flag)) ;; lazy but safe (begin (debug:print-info 0 *default-log-port* "Gating server start, last start: " fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go) (thread-sleep! reftime) (server:wait-for-server-start-last-flag areapath))))))) ;; wait for server=start-last to be three seconds old - this might be what was causing the obfusc-skel fails ;; #;(define (server:wait-for-server-start-last-flag areapath) (let* ((start-flag (conc areapath "/logs/server-start-last"))) (if (file-exists? start-flag) (let* ((fmodtime (file-modification-time start-flag)) (reftime (+ 2 (random 3))) (delta (- (current-seconds) fmodtime)) (all-go (> delta reftime))) (if all-go #t ;; (system (conc "touch " start-flag)) ;; lazy but safe (begin (debug:print-info 0 *default-log-port* "Gating server start, last start: " fmodtime ", delta: " delta ", reftime: " reftime ", all-go=" all-go) (thread-sleep! 5) (server:wait-for-server-start-last-flag areapath)))) #;(system (conc "touch " start-flag))))) ;; kind start up of servers, wait 40 seconds before allowing another server for a given ;; run-id to be launched ;; (define (server:kind-run areapath) ;; look for $MT_RUN_AREA_HOME/logs/server-start-last ;; and wait for it to be at least 3 seconds old |
︙ | ︙ |
Modified tests.scm from [0094b671e6] to [9198f1fbaa].
︙ | ︙ | |||
1942 1943 1944 1945 1946 1947 1948 | (lambda (count) (set! res count)) tdb "SELECT count(id) FROM test_rundat;") res)) 0) | > | > > > > > > > > > > > > > > > > > > > > | > > | | | | | | | > | | 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 | (lambda (count) (set! res count)) tdb "SELECT count(id) FROM test_rundat;") res)) 0) ;; (define (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname #!key (update-db #f)(tmpfree #f)) (if (get-environment-variable "MT_TEST_RUN_DIR") (let* ((dest-dir (conc (get-environment-variable "MT_TEST_RUN_DIR") "/.mt_data")) (or-dash (lambda (instr) (cond ((not instr) "") ;; #f -> blank, indicates value unchanged since last measurement taken ((string? instr)(if (string-search " " instr) (conc "\"" instr "\"") instr)) (else instr)))) (file-new (not (directory-exists? dest-dir)))) (if file-new (create-directory dest-dir #t)) (let* ((outp (open-output-file (conc dest-dir "/test-run.dat") #:append))) (with-output-to-port outp (lambda () (if file-new (print "epoch_time,run_id,test_id,cpuload,diskfree,tmpfree,run_minutes,hostname,uname")) (print (current-seconds) "," (or-dash run-id) "," (or-dash test-id) "," (or-dash cpuload) "," (or-dash diskfree) "," (or-dash tmpfree) "," (or-dash minutes) "," (or-dash hostname) "," (or-dash uname)))) ;; put uname last as it has spaces in it (close-output-port outp))) (begin (rmt:general-call 'update-test-rundat run-id test-id (current-seconds) (or cpuload -1)(or diskfree -1) -1 (or minutes -1)))) (if update-db (begin (if (and cpuload diskfree) (rmt:general-call 'update-cpuload-diskfree run-id cpuload diskfree test-id)) (if minutes (rmt:general-call 'update-run-duration run-id minutes test-id)) (if (and uname hostname) (rmt:general-call 'update-uname-host run-id uname hostname test-id))))) ;; This one is for running with no db access (i.e. via rmt: internally) (define (tests:set-full-meta-info db test-id run-id minutes work-area remtries #!key (update-db #f)) ;; (define (tests:set-full-meta-info test-id run-id minutes work-area) ;; (let ((remtries 10)) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) (tmpfree (get-df "/tmp")) (uname (get-uname "-srvpio")) (hostname (get-host-name))) (tests:update-central-meta-info run-id test-id cpuload diskfree minutes uname hostname update-db: update-db tmpfree: tmpfree))) ;; (define (tests:set-partial-meta-info test-id run-id minutes work-area) #;(define (tests:set-partial-meta-info test-id run-id minutes work-area remtries) (let* ((cpuload (get-cpu-load)) (diskfree (get-df (current-directory))) (remtries 10)) (handle-exceptions |
︙ | ︙ | |||
1982 1983 1984 1985 1986 1987 1988 | (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) (debug:print-error 0 *default-log-port* "tried for over a minute to update meta info and failed. Giving up") (debug:print 0 *default-log-port* "EXCEPTION: database probably overloaded or unreadable.") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 5 *default-log-port* "exn=" (condition->list exn)) (debug:print 0 *default-log-port* " status: " ((condition-property-accessor 'sqlite3 'status) exn)) (print-call-chain (current-error-port)))) | | < | 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 | (let ((err-status ((condition-property-accessor 'sqlite3 'status #f) exn))) (debug:print-error 0 *default-log-port* "tried for over a minute to update meta info and failed. Giving up") (debug:print 0 *default-log-port* "EXCEPTION: database probably overloaded or unreadable.") (debug:print 0 *default-log-port* " message: " ((condition-property-accessor 'exn 'message) exn)) (debug:print 5 *default-log-port* "exn=" (condition->list exn)) (debug:print 0 *default-log-port* " status: " ((condition-property-accessor 'sqlite3 'status) exn)) (print-call-chain (current-error-port)))) (tests:update-testdat-meta-info db test-id work-area cpuload diskfree minutes)))) ;;====================================================================== ;; A R C H I V I N G ;;====================================================================== (define (test:archive db test-id) #f) (define (test:archive-tests db keynames target) #f) |